Unified storage search: Introduce min index update interval (#111978)

* Don't update index more often than specified index_min_update_interval.

* Add artificial sleep at the end of write operations.

* Improve test: check for number of update calls, make diff check less flaky.

* Make test less flaky by allowing for higher diff variance.

* Make test less flaky by allowing for expected update calls variance.
This commit is contained in:
Peter Štibraný 2025-10-06 10:02:03 +02:00 committed by GitHub
parent cd889fef9b
commit a44af81082
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 244 additions and 31 deletions

View File

@ -582,6 +582,7 @@ type Cfg struct {
IndexMinCount int
IndexRebuildInterval time.Duration
IndexCacheTTL time.Duration
IndexMinUpdateInterval time.Duration // Don't update index if it was updated less than this interval ago.
MaxFileIndexAge time.Duration // Max age of file-based indexes. Index older than this will be rebuilt asynchronously.
MinFileIndexBuildVersion string // Minimum version of Grafana that built the file-based index. If index was built with older Grafana, it will be rebuilt asynchronously.
EnableSharding bool

View File

@ -73,6 +73,7 @@ func (cfg *Cfg) setUnifiedStorageConfig() {
// default to 24 hours because usage insights summarizes the data every 24 hours
cfg.IndexRebuildInterval = section.Key("index_rebuild_interval").MustDuration(24 * time.Hour)
cfg.IndexCacheTTL = section.Key("index_cache_ttl").MustDuration(10 * time.Minute)
cfg.IndexMinUpdateInterval = section.Key("index_min_update_interval").MustDuration(0)
cfg.SprinklesApiServer = section.Key("sprinkles_api_server").String()
cfg.SprinklesApiServerPageLimit = section.Key("sprinkles_api_server_page_limit").MustInt(10000)
cfg.CACertPath = section.Key("ca_cert_path").String()

View File

@ -22,6 +22,7 @@ import (
claims "github.com/grafana/authlib/types"
"github.com/grafana/dskit/backoff"
"github.com/grafana/grafana/pkg/apimachinery/utils"
"github.com/grafana/grafana/pkg/apimachinery/validation"
secrets "github.com/grafana/grafana/pkg/registry/apis/secret/contracts"
@ -194,6 +195,10 @@ type SearchOptions struct {
// Number of workers to use for index rebuilds.
IndexRebuildWorkers int
// Minimum time between index updates. This is also used as a delay after a successful write operation, to guarantee
// that subsequent search will observe the effect of the writing.
IndexMinUpdateInterval time.Duration
}
type ResourceServerOptions struct {
@ -336,6 +341,8 @@ func NewResourceServer(opts ResourceServerOptions) (*server, error) {
reg: opts.Reg,
queue: opts.QOSQueue,
queueConfig: opts.QOSConfig,
artificialSuccessfulWriteDelay: opts.Search.IndexMinUpdateInterval,
}
if opts.Search.Resources != nil {
@ -386,6 +393,11 @@ type server struct {
reg prometheus.Registerer
queue QOSEnqueuer
queueConfig QueueConfig
// This value is used by storage server to artificially delay returning response after successful
// write operations to make sure that subsequent search by the same client will return up-to-date results.
// Set from SearchOptions.IndexMinUpdateInterval.
artificialSuccessfulWriteDelay time.Duration
}
// Init implements ResourceServer.
@ -661,6 +673,8 @@ func (s *server) Create(ctx context.Context, req *resourcepb.CreateRequest) (*re
})
}
s.sleepAfterSuccessfulWriteOperation(res, err)
return res, err
}
@ -684,6 +698,37 @@ func (s *server) create(ctx context.Context, user claims.AuthInfo, req *resource
return rsp, nil
}
type responseWithErrorResult interface {
GetError() *resourcepb.ErrorResult
}
// sleepAfterSuccessfulWriteOperation will sleep for a specified time if the operation was successful.
// Returns boolean indicating whether the sleep was performed or not (used in testing).
//
// This sleep is performed to guarantee search-after-write consistency, when rate-limiting updates to search index.
func (s *server) sleepAfterSuccessfulWriteOperation(res responseWithErrorResult, err error) bool {
if s.artificialSuccessfulWriteDelay <= 0 {
return false
}
if err != nil {
// No sleep necessary if operation failed.
return false
}
// We expect that non-nil interface values with typed nils can still handle GetError() call.
if res != nil {
errRes := res.GetError()
if errRes != nil {
// No sleep necessary if operation failed.
return false
}
}
time.Sleep(s.artificialSuccessfulWriteDelay)
return true
}
func (s *server) Update(ctx context.Context, req *resourcepb.UpdateRequest) (*resourcepb.UpdateResponse, error) {
ctx, span := s.tracer.Start(ctx, "storage_server.Update")
defer span.End()
@ -715,6 +760,8 @@ func (s *server) Update(ctx context.Context, req *resourcepb.UpdateRequest) (*re
})
}
s.sleepAfterSuccessfulWriteOperation(res, err)
return res, err
}
@ -787,6 +834,8 @@ func (s *server) Delete(ctx context.Context, req *resourcepb.DeleteRequest) (*re
})
}
s.sleepAfterSuccessfulWriteOperation(res, err)
return res, err
}

View File

@ -3,6 +3,7 @@ package resource
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"net/http"
@ -21,6 +22,7 @@ import (
authlib "github.com/grafana/authlib/types"
"github.com/grafana/dskit/services"
"github.com/grafana/grafana/pkg/apimachinery/identity"
"github.com/grafana/grafana/pkg/apimachinery/utils"
"github.com/grafana/grafana/pkg/infra/log"
@ -587,3 +589,30 @@ func newTestServerWithQueue(t *testing.T, maxSizePerTenant int, numWorkers int)
}
return s, q
}
func TestArtificialDelayAfterSuccessfulOperation(t *testing.T) {
s := &server{artificialSuccessfulWriteDelay: 1 * time.Millisecond}
check := func(t *testing.T, expectedSleep bool, res responseWithErrorResult, err error) {
slept := s.sleepAfterSuccessfulWriteOperation(res, err)
require.Equal(t, expectedSleep, slept)
}
// Successful responses should sleep
check(t, true, nil, nil)
check(t, true, (responseWithErrorResult)((*resourcepb.CreateResponse)(nil)), nil)
check(t, true, &resourcepb.CreateResponse{}, nil)
check(t, true, (responseWithErrorResult)((*resourcepb.UpdateResponse)(nil)), nil)
check(t, true, &resourcepb.UpdateResponse{}, nil)
check(t, true, (responseWithErrorResult)((*resourcepb.DeleteResponse)(nil)), nil)
check(t, true, &resourcepb.DeleteResponse{}, nil)
// Failed responses should return without sleeping
check(t, false, nil, errors.New("some error"))
check(t, false, &resourcepb.CreateResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
check(t, false, &resourcepb.UpdateResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
check(t, false, &resourcepb.DeleteResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
}

View File

@ -79,6 +79,9 @@ type BleveOptions struct {
UseFullNgram bool
// Minimum time between index updates.
IndexMinUpdateInterval time.Duration
// This function is called to check whether the index is owned by the current instance.
// Indexes that are not owned by current instance are eligible for cleanup.
// If nil, all indexes are owned by the current instance.
@ -689,7 +692,7 @@ func (b *bleveBackend) closeAllIndexes() {
}
type updateRequest struct {
reason string
requestTime time.Time
callback chan updateResult
}
@ -705,6 +708,10 @@ type bleveIndex struct {
// RV returned by last List/ListModifiedSince operation. Updated when updating index.
resourceVersion int64
// Timestamp when the last update to the index was done (started).
// Subsequent update requests only trigger new update if minUpdateInterval has elapsed.
nextUpdateTime time.Time
standard resource.SearchableDocumentFields
fields resource.SearchableDocumentFields
@ -720,6 +727,7 @@ type bleveIndex struct {
logger *slog.Logger
updaterFn resource.UpdateFn
minUpdateInterval time.Duration
updaterMu sync.Mutex
updaterCond *sync.Cond // Used to signal the updater goroutine that there is work to do, or updater is no longer enabled and should stop. Also used by updater itself to stop early if there's no work to be done.
@ -755,6 +763,7 @@ func (b *bleveBackend) newBleveIndex(
tracing: b.tracer,
logger: logger,
updaterFn: updaterFn,
minUpdateInterval: b.opts.IndexMinUpdateInterval,
}
bi.updaterCond = sync.NewCond(&bi.updaterMu)
if b.indexMetrics != nil {
@ -1356,7 +1365,7 @@ func (b *bleveIndex) UpdateIndex(ctx context.Context, reason string) (int64, err
}
// Use chan with buffer size 1 to ensure that we can always send the result back, even if there's no reader anymore.
req := updateRequest{reason: reason, callback: make(chan updateResult, 1)}
req := updateRequest{requestTime: time.Now(), callback: make(chan updateResult, 1)}
// Make sure that the updater goroutine is running.
b.updaterMu.Lock()
@ -1413,7 +1422,7 @@ func (b *bleveIndex) runUpdater(ctx context.Context) {
b.updaterMu.Lock()
for !b.updaterShutdown && ctx.Err() == nil && len(b.updaterQueue) == 0 && time.Since(start) < maxWait {
// Cond is signalled when updaterShutdown changes, updaterQueue gets new element or when timeout occurs.
// Cond is signaled when updaterShutdown changes, updaterQueue gets new element or when timeout occurs.
b.updaterCond.Wait()
}
@ -1436,6 +1445,26 @@ func (b *bleveIndex) runUpdater(ctx context.Context) {
return
}
// Check if requests arrived before minUpdateInterval since the last update has elapsed, and remove such requests.
for ix := 0; ix < len(batch); {
req := batch[ix]
if req.requestTime.Before(b.nextUpdateTime) {
req.callback <- updateResult{rv: b.resourceVersion}
batch = append(batch[:ix], batch[ix+1:]...)
} else {
// Keep in the batch
ix++
}
}
// If all requests are now handled, don't perform update.
if len(batch) == 0 {
continue
}
// Bump next update time
b.nextUpdateTime = time.Now().Add(b.minUpdateInterval)
var rv int64
var err = ctx.Err()
if err == nil {

View File

@ -833,6 +833,12 @@ func withOwnsIndexFn(fn func(key resource.NamespacedResource) (bool, error)) set
}
}
func withIndexMinUpdateInterval(d time.Duration) setupOption {
return func(options *BleveOptions) {
options.IndexMinUpdateInterval = d
}
}
func TestBuildIndexExpiration(t *testing.T) {
ns := resource.NamespacedResource{
Namespace: "test",
@ -1132,6 +1138,37 @@ func updateTestDocs(ns resource.NamespacedResource, docs int) resource.UpdateFn
}
}
func updateTestDocsReturningMillisTimestamp(ns resource.NamespacedResource, docs int) (resource.UpdateFn, *atomic.Int64) {
cnt := 0
updateCalls := atomic.NewInt64(0)
return func(context context.Context, index resource.ResourceIndex, sinceRV int64) (newRV int64, updatedDocs int, _ error) {
now := time.Now()
updateCalls.Inc()
cnt++
var items []*resource.BulkIndexItem
for i := 0; i < docs; i++ {
items = append(items, &resource.BulkIndexItem{
Action: resource.ActionIndex,
Doc: &resource.IndexableDocument{
Key: &resourcepb.ResourceKey{
Namespace: ns.Namespace,
Group: ns.Group,
Resource: ns.Resource,
Name: fmt.Sprintf("doc%d", i),
},
Title: fmt.Sprintf("Document %d (gen_%d)", i, cnt),
},
})
}
err := index.BulkIndex(&resource.BulkIndexRequest{Items: items})
return now.UnixMilli(), docs, err
}, updateCalls
}
func TestCleanOldIndexes(t *testing.T) {
dir := t.TempDir()
@ -1353,7 +1390,7 @@ func TestConcurrentIndexUpdateSearchAndRebuild(t *testing.T) {
cancel()
wg.Wait()
fmt.Println("Updates:", updates.Load(), "searches:", searches.Load(), "rebuilds:", rebuilds.Load())
t.Log("Updates:", updates.Load(), "searches:", searches.Load(), "rebuilds:", rebuilds.Load())
}
// Verify concurrent updates and searches work as expected.
@ -1415,7 +1452,72 @@ func TestConcurrentIndexUpdateAndSearch(t *testing.T) {
require.Greater(t, rvUpdatedByMultipleGoroutines, int64(0))
}
// Verify concurrent updates and searches work as expected.
func TestConcurrentIndexUpdateAndSearchWithIndexMinUpdateInterval(t *testing.T) {
ns := resource.NamespacedResource{
Namespace: "test",
Group: "group",
Resource: "resource",
}
const minInterval = 100 * time.Millisecond
be, _ := setupBleveBackend(t, withIndexMinUpdateInterval(minInterval))
updateFn, updateCalls := updateTestDocsReturningMillisTimestamp(ns, 5)
idx, err := be.BuildIndex(t.Context(), ns, 10 /* file based */, nil, "test", indexTestDocs(ns, 10, 100), updateFn, false)
require.NoError(t, err)
wg := sync.WaitGroup{}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
attemptedUpdates := atomic.NewInt64(0)
// Verify that each returned RV (unix timestamp in millis) is either the same as before, or at least minInterval later.
const searchConcurrency = 25
for i := 0; i < searchConcurrency; i++ {
wg.Add(1)
go func() {
defer wg.Done()
prevRV := int64(0)
for ctx.Err() == nil {
attemptedUpdates.Inc()
// We use t.Context() here to avoid getting errors from context cancellation.
rv, err := idx.UpdateIndex(t.Context(), "test")
require.NoError(t, err)
// Our update function returns unix timestamp in millis. We expect it to not change at all, or change by minInterval.
if prevRV > 0 {
rvDiff := rv - prevRV
if rvDiff == 0 {
// OK
} else {
// Allow returned RV to be within 10% of minInterval.
require.InDelta(t, minInterval.Milliseconds(), rvDiff, float64(minInterval.Milliseconds())*0.10)
}
}
prevRV = rv
require.Equal(t, int64(10), searchTitle(t, idx, "Document", 10, ns).TotalHits)
}
}()
}
// Run updates and searches for this time.
testTime := 1 * time.Second
time.Sleep(testTime)
cancel()
wg.Wait()
expectedUpdateCalls := int64(testTime / minInterval)
require.InDelta(t, expectedUpdateCalls, updateCalls.Load(), float64(expectedUpdateCalls/2))
require.Greater(t, attemptedUpdates.Load(), updateCalls.Load())
t.Log("Attempted updates:", attemptedUpdates.Load(), "update calls:", updateCalls.Load())
}
func TestIndexUpdateWithErrors(t *testing.T) {
ns := resource.NamespacedResource{
Namespace: "test",

View File

@ -48,6 +48,7 @@ func NewSearchOptions(
BuildVersion: cfg.BuildVersion,
UseFullNgram: features.IsEnabledGlobally(featuremgmt.FlagUnifiedStorageUseFullNgram),
OwnsIndex: ownsIndexFn,
IndexMinUpdateInterval: cfg.IndexMinUpdateInterval,
}, tracer, indexMetrics)
if err != nil {
@ -63,6 +64,7 @@ func NewSearchOptions(
DashboardIndexMaxAge: cfg.IndexRebuildInterval,
MaxIndexAge: cfg.MaxFileIndexAge,
MinBuildVersion: minVersion,
IndexMinUpdateInterval: cfg.IndexMinUpdateInterval,
}, nil
}
return resource.SearchOptions{}, nil