More WAL remote_write tweaks. (#5300)
* Consistently pre-lookup the metrics for a given queue in queue manager. * Don't open the WAL (for writing) in the remote_write code. * Add some more logging. Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
		
							parent
							
								
									1684dc750a
								
							
						
					
					
						commit
						2fa93595d6
					
				|  | @ -159,17 +159,12 @@ type StorageClient interface { | |||
| // used by WAL Watcher.
 | ||||
| type QueueManager struct { | ||||
| 	logger         log.Logger | ||||
| 
 | ||||
| 	flushDeadline  time.Duration | ||||
| 	cfg            config.QueueConfig | ||||
| 	externalLabels model.LabelSet | ||||
| 	relabelConfigs []*pkgrelabel.Config | ||||
| 	client         StorageClient | ||||
| 	queueName                  string | ||||
| 	watcher        *WALWatcher | ||||
| 	highestSentTimestampMetric *maxGauge | ||||
| 	pendingSamplesMetric       prometheus.Gauge | ||||
| 	enqueueRetriesMetric       prometheus.Counter | ||||
| 
 | ||||
| 	seriesMtx            sync.Mutex | ||||
| 	seriesLabels         map[uint64][]prompb.Label | ||||
|  | @ -184,6 +179,16 @@ type QueueManager struct { | |||
| 
 | ||||
| 	samplesIn, samplesDropped, samplesOut, samplesOutDuration *ewmaRate | ||||
| 	integralAccumulator                                       float64 | ||||
| 
 | ||||
| 	highestSentTimestampMetric *maxGauge | ||||
| 	pendingSamplesMetric       prometheus.Gauge | ||||
| 	enqueueRetriesMetric       prometheus.Counter | ||||
| 	droppedSamplesTotal        prometheus.Counter | ||||
| 	numShardsMetric            prometheus.Gauge | ||||
| 	failedSamplesTotal         prometheus.Counter | ||||
| 	sentBatchDuration          prometheus.Observer | ||||
| 	succeededSamplesTotal      prometheus.Counter | ||||
| 	retriedSamplesTotal        prometheus.Counter | ||||
| } | ||||
| 
 | ||||
| // NewQueueManager builds a new QueueManager.
 | ||||
|  | @ -191,14 +196,16 @@ func NewQueueManager(logger log.Logger, walDir string, samplesIn *ewmaRate, cfg | |||
| 	if logger == nil { | ||||
| 		logger = log.NewNopLogger() | ||||
| 	} | ||||
| 
 | ||||
| 	name := client.Name() | ||||
| 	logger = log.With(logger, "queue", name) | ||||
| 	t := &QueueManager{ | ||||
| 		logger:         log.With(logger, "queue", client.Name()), | ||||
| 		logger:         logger, | ||||
| 		flushDeadline:  flushDeadline, | ||||
| 		cfg:            cfg, | ||||
| 		externalLabels: externalLabels, | ||||
| 		relabelConfigs: relabelConfigs, | ||||
| 		client:         client, | ||||
| 		queueName:      client.Name(), | ||||
| 
 | ||||
| 		seriesLabels:         make(map[uint64][]prompb.Label), | ||||
| 		seriesSegmentIndexes: make(map[uint64]int), | ||||
|  | @ -212,26 +219,25 @@ func NewQueueManager(logger log.Logger, walDir string, samplesIn *ewmaRate, cfg | |||
| 		samplesDropped:     newEWMARate(ewmaWeight, shardUpdateDuration), | ||||
| 		samplesOut:         newEWMARate(ewmaWeight, shardUpdateDuration), | ||||
| 		samplesOutDuration: newEWMARate(ewmaWeight, shardUpdateDuration), | ||||
| 
 | ||||
| 		highestSentTimestampMetric: &maxGauge{ | ||||
| 			Gauge: queueHighestSentTimestamp.WithLabelValues(name), | ||||
| 		}, | ||||
| 		pendingSamplesMetric:  queuePendingSamples.WithLabelValues(name), | ||||
| 		enqueueRetriesMetric:  enqueueRetriesTotal.WithLabelValues(name), | ||||
| 		droppedSamplesTotal:   droppedSamplesTotal.WithLabelValues(name), | ||||
| 		numShardsMetric:       numShards.WithLabelValues(name), | ||||
| 		failedSamplesTotal:    failedSamplesTotal.WithLabelValues(name), | ||||
| 		sentBatchDuration:     sentBatchDuration.WithLabelValues(name), | ||||
| 		succeededSamplesTotal: succeededSamplesTotal.WithLabelValues(name), | ||||
| 		retriedSamplesTotal:   retriedSamplesTotal.WithLabelValues(name), | ||||
| 	} | ||||
| 
 | ||||
| 	t.highestSentTimestampMetric = &maxGauge{ | ||||
| 		Gauge: queueHighestSentTimestamp.WithLabelValues(t.queueName), | ||||
| 	} | ||||
| 	t.pendingSamplesMetric = queuePendingSamples.WithLabelValues(t.queueName) | ||||
| 	t.enqueueRetriesMetric = enqueueRetriesTotal.WithLabelValues(t.queueName) | ||||
| 	t.watcher = NewWALWatcher(logger, client.Name(), t, walDir) | ||||
| 	t.watcher = NewWALWatcher(logger, name, t, walDir) | ||||
| 	t.shards = t.newShards() | ||||
| 
 | ||||
| 	numShards.WithLabelValues(t.queueName).Set(float64(t.numShards)) | ||||
| 	shardCapacity.WithLabelValues(t.queueName).Set(float64(t.cfg.Capacity)) | ||||
| 
 | ||||
| 	// Initialize counter labels to zero.
 | ||||
| 	sentBatchDuration.WithLabelValues(t.queueName) | ||||
| 	succeededSamplesTotal.WithLabelValues(t.queueName) | ||||
| 	failedSamplesTotal.WithLabelValues(t.queueName) | ||||
| 	droppedSamplesTotal.WithLabelValues(t.queueName) | ||||
| 	retriedSamplesTotal.WithLabelValues(t.queueName) | ||||
| 	// Reset pending samples metric to 0.
 | ||||
| 	// Initialise some metrics.
 | ||||
| 	shardCapacity.WithLabelValues(name).Set(float64(t.cfg.Capacity)) | ||||
| 	t.pendingSamplesMetric.Set(0) | ||||
| 
 | ||||
| 	return t | ||||
|  | @ -250,7 +256,7 @@ func (t *QueueManager) Append(s []tsdb.RefSample) bool { | |||
| 	for _, sample := range s { | ||||
| 		// If we have no labels for the series, due to relabelling or otherwise, don't send the sample.
 | ||||
| 		if _, ok := t.seriesLabels[sample.Ref]; !ok { | ||||
| 			droppedSamplesTotal.WithLabelValues(t.queueName).Inc() | ||||
| 			t.droppedSamplesTotal.Inc() | ||||
| 			t.samplesDropped.incr(1) | ||||
| 			if _, ok := t.droppedSeries[sample.Ref]; !ok { | ||||
| 				level.Info(t.logger).Log("msg", "dropped sample for series that was not explicitly dropped via relabelling", "ref", sample.Ref) | ||||
|  | @ -523,7 +529,7 @@ func (s *shards) start(n int) { | |||
| 	for i := 0; i < n; i++ { | ||||
| 		go s.runShard(hardShutdownCtx, i, newQueues[i]) | ||||
| 	} | ||||
| 	numShards.WithLabelValues(s.qm.queueName).Set(float64(n)) | ||||
| 	s.qm.numShardsMetric.Set(float64(n)) | ||||
| } | ||||
| 
 | ||||
| // stop the shards; subsequent call to enqueue will return false.
 | ||||
|  | @ -652,7 +658,7 @@ func (s *shards) sendSamples(ctx context.Context, samples []prompb.TimeSeries) { | |||
| 	err := s.sendSamplesWithBackoff(ctx, samples) | ||||
| 	if err != nil { | ||||
| 		level.Error(s.qm.logger).Log("msg", "non-recoverable error", "count", len(samples), "err", err) | ||||
| 		failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples))) | ||||
| 		s.qm.failedSamplesTotal.Add(float64(len(samples))) | ||||
| 	} | ||||
| 
 | ||||
| 	// These counters are used to calculate the dynamic sharding, and as such
 | ||||
|  | @ -680,10 +686,10 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti | |||
| 		begin := time.Now() | ||||
| 		err := s.qm.client.Store(ctx, req) | ||||
| 
 | ||||
| 		sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(time.Since(begin).Seconds()) | ||||
| 		s.qm.sentBatchDuration.Observe(time.Since(begin).Seconds()) | ||||
| 
 | ||||
| 		if err == nil { | ||||
| 			succeededSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples))) | ||||
| 			s.qm.succeededSamplesTotal.Add(float64(len(samples))) | ||||
| 			s.qm.highestSentTimestampMetric.Set(float64(highest / 1000)) | ||||
| 			return nil | ||||
| 		} | ||||
|  | @ -691,7 +697,7 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti | |||
| 		if _, ok := err.(recoverableError); !ok { | ||||
| 			return err | ||||
| 		} | ||||
| 		retriedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples))) | ||||
| 		s.qm.retriedSamplesTotal.Add(float64(len(samples))) | ||||
| 		level.Debug(s.qm.logger).Log("msg", "failed to send batch, retrying", "err", err) | ||||
| 
 | ||||
| 		time.Sleep(time.Duration(backoff)) | ||||
|  |  | |||
|  | @ -35,7 +35,7 @@ type startTimeCallback func() (int64, error) | |||
| // storage.Storage.
 | ||||
| type Storage struct { | ||||
| 	logger log.Logger | ||||
| 	mtx    sync.RWMutex | ||||
| 	mtx    sync.Mutex | ||||
| 
 | ||||
| 	// For writes
 | ||||
| 	walDir        string | ||||
|  | @ -112,8 +112,7 @@ func (s *Storage) ApplyConfig(conf *config.Config) error { | |||
| 	} | ||||
| 
 | ||||
| 	// Update read clients
 | ||||
| 
 | ||||
| 	s.queryables = make([]storage.Queryable, 0, len(conf.RemoteReadConfigs)) | ||||
| 	queryables := make([]storage.Queryable, 0, len(conf.RemoteReadConfigs)) | ||||
| 	for i, rrConf := range conf.RemoteReadConfigs { | ||||
| 		c, err := NewClient(i, &ClientConfig{ | ||||
| 			URL:              rrConf.URL, | ||||
|  | @ -132,8 +131,9 @@ func (s *Storage) ApplyConfig(conf *config.Config) error { | |||
| 		if !rrConf.ReadRecent { | ||||
| 			q = PreferLocalStorageFilter(q, s.localStartTimeCallback) | ||||
| 		} | ||||
| 		s.queryables = append(s.queryables, q) | ||||
| 		queryables = append(queryables, q) | ||||
| 	} | ||||
| 	s.queryables = queryables | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
|  |  | |||
|  | @ -169,12 +169,7 @@ func (w *WALWatcher) loop() { | |||
| } | ||||
| 
 | ||||
| func (w *WALWatcher) run() error { | ||||
| 	nw, err := wal.New(nil, nil, w.walDir) | ||||
| 	if err != nil { | ||||
| 		return errors.Wrap(err, "wal.New") | ||||
| 	} | ||||
| 
 | ||||
| 	_, lastSegment, err := nw.Segments() | ||||
| 	_, lastSegment, err := w.firstAndLast() | ||||
| 	if err != nil { | ||||
| 		return errors.Wrap(err, "wal.Segments") | ||||
| 	} | ||||
|  | @ -200,10 +195,11 @@ func (w *WALWatcher) run() error { | |||
| 	level.Debug(w.logger).Log("msg", "tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment, "lastSegment", lastSegment) | ||||
| 	for !isClosed(w.quit) { | ||||
| 		w.currentSegmentMetric.Set(float64(currentSegment)) | ||||
| 		level.Debug(w.logger).Log("msg", "processing segment", "currentSegment", currentSegment) | ||||
| 
 | ||||
| 		// On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment.
 | ||||
| 		// On subsequent calls to this function, currentSegment will have been incremented and we should open that segment.
 | ||||
| 		if err := w.watch(nw, currentSegment, currentSegment >= lastSegment); err != nil { | ||||
| 		if err := w.watch(currentSegment, currentSegment >= lastSegment); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| 
 | ||||
|  | @ -220,26 +216,11 @@ func (w *WALWatcher) run() error { | |||
| 
 | ||||
| // findSegmentForIndex finds the first segment greater than or equal to index.
 | ||||
| func (w *WALWatcher) findSegmentForIndex(index int) (int, error) { | ||||
| 	files, err := fileutil.ReadDir(w.walDir) | ||||
| 	refs, err := w.segments() | ||||
| 	if err != nil { | ||||
| 		return -1, err | ||||
| 		return -1, nil | ||||
| 	} | ||||
| 
 | ||||
| 	var refs []int | ||||
| 	var last int | ||||
| 	for _, fn := range files { | ||||
| 		k, err := strconv.Atoi(fn) | ||||
| 		if err != nil { | ||||
| 			continue | ||||
| 		} | ||||
| 		if len(refs) > 0 && k > last+1 { | ||||
| 			return -1, errors.New("segments are not sequential") | ||||
| 		} | ||||
| 		refs = append(refs, k) | ||||
| 		last = k | ||||
| 	} | ||||
| 	sort.Ints(refs) | ||||
| 
 | ||||
| 	for _, r := range refs { | ||||
| 		if r >= index { | ||||
| 			return r, nil | ||||
|  | @ -249,10 +230,48 @@ func (w *WALWatcher) findSegmentForIndex(index int) (int, error) { | |||
| 	return -1, errors.New("failed to find segment for index") | ||||
| } | ||||
| 
 | ||||
| func (w *WALWatcher) firstAndLast() (int, int, error) { | ||||
| 	refs, err := w.segments() | ||||
| 	if err != nil { | ||||
| 		return -1, -1, nil | ||||
| 	} | ||||
| 
 | ||||
| 	if len(refs) == 0 { | ||||
| 		return -1, -1, nil | ||||
| 	} | ||||
| 	return refs[0], refs[len(refs)-1], nil | ||||
| } | ||||
| 
 | ||||
| // Copied from tsdb/wal/wal.go so we do not have to open a WAL.
 | ||||
| // Plan is to move WAL watcher to TSDB and dedupe these implementations.
 | ||||
| func (w *WALWatcher) segments() ([]int, error) { | ||||
| 	files, err := fileutil.ReadDir(w.walDir) | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 
 | ||||
| 	var refs []int | ||||
| 	var last int | ||||
| 	for _, fn := range files { | ||||
| 		k, err := strconv.Atoi(fn) | ||||
| 		if err != nil { | ||||
| 			continue | ||||
| 		} | ||||
| 		if len(refs) > 0 && k > last+1 { | ||||
| 			return nil, errors.New("segments are not sequential") | ||||
| 		} | ||||
| 		refs = append(refs, k) | ||||
| 		last = k | ||||
| 	} | ||||
| 	sort.Ints(refs) | ||||
| 
 | ||||
| 	return refs, nil | ||||
| } | ||||
| 
 | ||||
| // Use tail true to indicate that the reader is currently on a segment that is
 | ||||
| // actively being written to. If false, assume it's a full segment and we're
 | ||||
| // replaying it on start to cache the series records.
 | ||||
| func (w *WALWatcher) watch(wl *wal.WAL, segmentNum int, tail bool) error { | ||||
| func (w *WALWatcher) watch(segmentNum int, tail bool) error { | ||||
| 	segment, err := wal.OpenReadSegment(wal.SegmentName(w.walDir, segmentNum)) | ||||
| 	if err != nil { | ||||
| 		return err | ||||
|  | @ -297,7 +316,7 @@ func (w *WALWatcher) watch(wl *wal.WAL, segmentNum int, tail bool) error { | |||
| 			} | ||||
| 
 | ||||
| 		case <-segmentTicker.C: | ||||
| 			_, last, err := wl.Segments() | ||||
| 			_, last, err := w.firstAndLast() | ||||
| 			if err != nil { | ||||
| 				return errors.Wrap(err, "segments") | ||||
| 			} | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue