mirror of https://github.com/grafana/grafana.git
Unified storage search: Introduce min index update interval (#111978)
* Don't update index more often than specified index_min_update_interval.
* Add artificial sleep at the end of write operations.
* Improve test: check the number of update calls, make the diff check less flaky.
* Make test less flaky by allowing for higher diff variance.
* Make test less flaky by allowing for expected update calls variance.
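For reference, a minimal configuration sketch using the keys that appear in this diff (`index_min_update_interval`, `index_rebuild_interval`, `index_cache_ttl`). The section name and the example values below are assumptions for illustration only; the key defaults come from `setUnifiedStorageConfig` in the first hunks.

```ini
; Hypothetical excerpt; the [unified_storage] section name is an assumption.
[unified_storage]
; Never refresh the search index more often than this (default 0 disables the limit).
; Successful writes also sleep this long so a follow-up search observes the change.
index_min_update_interval = 5s
; Related settings visible in the same hunk, with their defaults from the diff.
index_rebuild_interval = 24h
index_cache_ttl = 10m
```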
parent cd889fef9b
commit a44af81082
@@ -582,6 +582,7 @@ type Cfg struct {
     IndexMinCount int
     IndexRebuildInterval time.Duration
     IndexCacheTTL time.Duration
+    IndexMinUpdateInterval time.Duration // Don't update index if it was updated less than this interval ago.
     MaxFileIndexAge time.Duration // Max age of file-based indexes. Index older than this will be rebuilt asynchronously.
     MinFileIndexBuildVersion string // Minimum version of Grafana that built the file-based index. If index was built with older Grafana, it will be rebuilt asynchronously.
     EnableSharding bool
@@ -73,6 +73,7 @@ func (cfg *Cfg) setUnifiedStorageConfig() {
     // default to 24 hours because usage insights summarizes the data every 24 hours
     cfg.IndexRebuildInterval = section.Key("index_rebuild_interval").MustDuration(24 * time.Hour)
     cfg.IndexCacheTTL = section.Key("index_cache_ttl").MustDuration(10 * time.Minute)
+    cfg.IndexMinUpdateInterval = section.Key("index_min_update_interval").MustDuration(0)
     cfg.SprinklesApiServer = section.Key("sprinkles_api_server").String()
     cfg.SprinklesApiServerPageLimit = section.Key("sprinkles_api_server_page_limit").MustInt(10000)
     cfg.CACertPath = section.Key("ca_cert_path").String()
@@ -22,6 +22,7 @@ import (

    claims "github.com/grafana/authlib/types"
    "github.com/grafana/dskit/backoff"

    "github.com/grafana/grafana/pkg/apimachinery/utils"
    "github.com/grafana/grafana/pkg/apimachinery/validation"
    secrets "github.com/grafana/grafana/pkg/registry/apis/secret/contracts"
@@ -194,6 +195,10 @@ type SearchOptions struct {

     // Number of workers to use for index rebuilds.
     IndexRebuildWorkers int

+    // Minimum time between index updates. This is also used as a delay after a successful write operation, to guarantee
+    // that a subsequent search will observe the effect of the write.
+    IndexMinUpdateInterval time.Duration
 }

 type ResourceServerOptions struct {
@@ -336,6 +341,8 @@ func NewResourceServer(opts ResourceServerOptions) (*server, error) {
         reg: opts.Reg,
         queue: opts.QOSQueue,
         queueConfig: opts.QOSConfig,

+        artificialSuccessfulWriteDelay: opts.Search.IndexMinUpdateInterval,
     }

     if opts.Search.Resources != nil {
@@ -386,6 +393,11 @@ type server struct {
     reg prometheus.Registerer
     queue QOSEnqueuer
     queueConfig QueueConfig

+    // This value is used by the storage server to artificially delay the response after successful
+    // write operations, so that a subsequent search by the same client returns up-to-date results.
+    // Set from SearchOptions.IndexMinUpdateInterval.
+    artificialSuccessfulWriteDelay time.Duration
 }

 // Init implements ResourceServer.
@@ -661,6 +673,8 @@ func (s *server) Create(ctx context.Context, req *resourcepb.CreateRequest) (*re
         })
     }

+    s.sleepAfterSuccessfulWriteOperation(res, err)

     return res, err
 }
@@ -684,6 +698,37 @@ func (s *server) create(ctx context.Context, user claims.AuthInfo, req *resource
     return rsp, nil
 }

+type responseWithErrorResult interface {
+    GetError() *resourcepb.ErrorResult
+}
+
+// sleepAfterSuccessfulWriteOperation sleeps for the configured time if the operation was successful.
+// Returns a boolean indicating whether the sleep was performed (used in testing).
+//
+// This sleep is performed to guarantee search-after-write consistency when updates to the search index are rate-limited.
+func (s *server) sleepAfterSuccessfulWriteOperation(res responseWithErrorResult, err error) bool {
+    if s.artificialSuccessfulWriteDelay <= 0 {
+        return false
+    }
+
+    if err != nil {
+        // No sleep necessary if operation failed.
+        return false
+    }
+
+    // Non-nil interface values wrapping typed nils are still expected to handle the GetError() call.
+    if res != nil {
+        errRes := res.GetError()
+        if errRes != nil {
+            // No sleep necessary if operation failed.
+            return false
+        }
+    }
+
+    time.Sleep(s.artificialSuccessfulWriteDelay)
+    return true
+}

 func (s *server) Update(ctx context.Context, req *resourcepb.UpdateRequest) (*resourcepb.UpdateResponse, error) {
     ctx, span := s.tracer.Start(ctx, "storage_server.Update")
     defer span.End()
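The `res != nil` check above deserves a note: a typed nil pointer stored in an interface is not a nil interface value, which is exactly what the `(*resourcepb.CreateResponse)(nil)` cases in the test hunk below exercise. A minimal, self-contained sketch of that behavior, with simplified types standing in for the actual resourcepb structs:

```go
package main

import "fmt"

// Simplified stand-ins for the generated response types.
type errorResult struct{ Message string }

type createResponse struct{ Error *errorResult }

// GetError tolerates a nil receiver, mirroring how protobuf-generated getters behave.
func (r *createResponse) GetError() *errorResult {
	if r == nil {
		return nil
	}
	return r.Error
}

type responseWithError interface{ GetError() *errorResult }

func main() {
	var typed *createResponse // typed nil pointer
	var res responseWithError = typed

	fmt.Println(res == nil)     // false: the interface holds a (*createResponse)(nil)
	fmt.Println(res.GetError()) // <nil>: safe because GetError handles a nil receiver
}
```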
@@ -715,6 +760,8 @@ func (s *server) Update(ctx context.Context, req *resourcepb.UpdateRequest) (*re
         })
     }

+    s.sleepAfterSuccessfulWriteOperation(res, err)

     return res, err
 }
@@ -787,6 +834,8 @@ func (s *server) Delete(ctx context.Context, req *resourcepb.DeleteRequest) (*re
         })
     }

+    s.sleepAfterSuccessfulWriteOperation(res, err)

     return res, err
 }
@@ -3,6 +3,7 @@ package resource
import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "log/slog"
    "net/http"
@@ -21,6 +22,7 @@ import (

    authlib "github.com/grafana/authlib/types"
    "github.com/grafana/dskit/services"

    "github.com/grafana/grafana/pkg/apimachinery/identity"
    "github.com/grafana/grafana/pkg/apimachinery/utils"
    "github.com/grafana/grafana/pkg/infra/log"
@@ -587,3 +589,30 @@ func newTestServerWithQueue(t *testing.T, maxSizePerTenant int, numWorkers int)
     }
     return s, q
 }

+func TestArtificialDelayAfterSuccessfulOperation(t *testing.T) {
+    s := &server{artificialSuccessfulWriteDelay: 1 * time.Millisecond}
+
+    check := func(t *testing.T, expectedSleep bool, res responseWithErrorResult, err error) {
+        slept := s.sleepAfterSuccessfulWriteOperation(res, err)
+        require.Equal(t, expectedSleep, slept)
+    }
+
+    // Successful responses should sleep
+    check(t, true, nil, nil)
+
+    check(t, true, (responseWithErrorResult)((*resourcepb.CreateResponse)(nil)), nil)
+    check(t, true, &resourcepb.CreateResponse{}, nil)
+
+    check(t, true, (responseWithErrorResult)((*resourcepb.UpdateResponse)(nil)), nil)
+    check(t, true, &resourcepb.UpdateResponse{}, nil)
+
+    check(t, true, (responseWithErrorResult)((*resourcepb.DeleteResponse)(nil)), nil)
+    check(t, true, &resourcepb.DeleteResponse{}, nil)
+
+    // Failed responses should return without sleeping
+    check(t, false, nil, errors.New("some error"))
+    check(t, false, &resourcepb.CreateResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
+    check(t, false, &resourcepb.UpdateResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
+    check(t, false, &resourcepb.DeleteResponse{Error: AsErrorResult(errors.New("some error"))}, nil)
+}
@@ -79,6 +79,9 @@ type BleveOptions struct {

     UseFullNgram bool

+    // Minimum time between index updates.
+    IndexMinUpdateInterval time.Duration

     // This function is called to check whether the index is owned by the current instance.
     // Indexes that are not owned by current instance are eligible for cleanup.
     // If nil, all indexes are owned by the current instance.
@@ -689,8 +692,8 @@ func (b *bleveBackend) closeAllIndexes() {
 }

 type updateRequest struct {
-    reason string
-    callback chan updateResult
+    requestTime time.Time
+    callback chan updateResult
 }

 type updateResult struct {
@@ -705,6 +708,10 @@ type bleveIndex struct {
     // RV returned by last List/ListModifiedSince operation. Updated when updating index.
     resourceVersion int64

+    // Time before which incoming update requests will not trigger a new index update.
+    // Bumped to "now + minUpdateInterval" whenever an update starts.
+    nextUpdateTime time.Time

     standard resource.SearchableDocumentFields
     fields resource.SearchableDocumentFields
@@ -719,7 +726,8 @@ type bleveIndex struct {
     tracing trace.Tracer
     logger *slog.Logger

     updaterFn resource.UpdateFn
+    minUpdateInterval time.Duration

     updaterMu sync.Mutex
     updaterCond *sync.Cond // Used to signal the updater goroutine that there is work to do, or updater is no longer enabled and should stop. Also used by updater itself to stop early if there's no work to be done.
@@ -746,15 +754,16 @@ func (b *bleveBackend) newBleveIndex(
     logger *slog.Logger,
 ) *bleveIndex {
     bi := &bleveIndex{
         key: key,
         index: index,
         indexStorage: newIndexType,
         fields: fields,
         allFields: allFields,
         standard: standardSearchFields,
         tracing: b.tracer,
         logger: logger,
         updaterFn: updaterFn,
+        minUpdateInterval: b.opts.IndexMinUpdateInterval,
     }
     bi.updaterCond = sync.NewCond(&bi.updaterMu)
     if b.indexMetrics != nil {
@@ -1356,7 +1365,7 @@ func (b *bleveIndex) UpdateIndex(ctx context.Context, reason string) (int64, err
     }

     // Use chan with buffer size 1 to ensure that we can always send the result back, even if there's no reader anymore.
-    req := updateRequest{reason: reason, callback: make(chan updateResult, 1)}
+    req := updateRequest{requestTime: time.Now(), callback: make(chan updateResult, 1)}

     // Make sure that the updater goroutine is running.
     b.updaterMu.Lock()
@@ -1413,7 +1422,7 @@ func (b *bleveIndex) runUpdater(ctx context.Context) {

     b.updaterMu.Lock()
     for !b.updaterShutdown && ctx.Err() == nil && len(b.updaterQueue) == 0 && time.Since(start) < maxWait {
-        // Cond is signalled when updaterShutdown changes, updaterQueue gets new element or when timeout occurs.
+        // Cond is signaled when updaterShutdown changes, updaterQueue gets a new element, or when a timeout occurs.
         b.updaterCond.Wait()
     }
@@ -1436,6 +1445,26 @@ func (b *bleveIndex) runUpdater(ctx context.Context) {
             return
         }

+        // Drop requests that arrived before minUpdateInterval had elapsed since the last update,
+        // and answer them with the current resource version.
+        for ix := 0; ix < len(batch); {
+            req := batch[ix]
+            if req.requestTime.Before(b.nextUpdateTime) {
+                req.callback <- updateResult{rv: b.resourceVersion}
+                batch = append(batch[:ix], batch[ix+1:]...)
+            } else {
+                // Keep in the batch
+                ix++
+            }
+        }
+
+        // If all requests are now handled, don't perform the update.
+        if len(batch) == 0 {
+            continue
+        }
+
+        // Bump the next update time.
+        b.nextUpdateTime = time.Now().Add(b.minUpdateInterval)

         var rv int64
         var err = ctx.Err()
         if err == nil {
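The gist of the new updater behavior, condensed into a standalone sketch (names and structure are simplified and hypothetical, not the Grafana API): requests that arrive while the index is still "fresh" are answered immediately with the last known resource version, and only a request arriving after the interval has expired triggers a real update.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// rateLimitedUpdater is a simplified model of the behavior added in this commit:
// update requests arriving before nextUpdateTime are answered with the cached
// resource version; otherwise a real update runs and nextUpdateTime is bumped.
type rateLimitedUpdater struct {
	mu                sync.Mutex
	minUpdateInterval time.Duration
	nextUpdateTime    time.Time
	resourceVersion   int64

	// update performs the real (expensive) index update and returns the new RV.
	update func() int64
}

func (u *rateLimitedUpdater) UpdateIndex(requestTime time.Time) int64 {
	u.mu.Lock()
	defer u.mu.Unlock()

	// Request arrived while the index is still considered fresh: reuse the cached RV.
	if requestTime.Before(u.nextUpdateTime) {
		return u.resourceVersion
	}

	// Perform the update and rate-limit subsequent requests.
	u.resourceVersion = u.update()
	u.nextUpdateTime = time.Now().Add(u.minUpdateInterval)
	return u.resourceVersion
}

func main() {
	updates := 0
	u := &rateLimitedUpdater{
		minUpdateInterval: 100 * time.Millisecond,
		update: func() int64 {
			updates++
			return time.Now().UnixMilli()
		},
	}

	// Hammer the updater for ~1 second; only ~10 real updates should happen.
	deadline := time.Now().Add(time.Second)
	for time.Now().Before(deadline) {
		u.UpdateIndex(time.Now())
		time.Sleep(5 * time.Millisecond)
	}
	fmt.Println("real updates:", updates) // roughly testTime / minUpdateInterval
}
```

Using the request's arrival time (rather than the time the updater gets around to it) matters: a request queued before the last update finished was already satisfied by that update, so it can safely be answered with the cached resource version.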
@@ -833,6 +833,12 @@ func withOwnsIndexFn(fn func(key resource.NamespacedResource) (bool, error)) set
     }
 }

+func withIndexMinUpdateInterval(d time.Duration) setupOption {
+    return func(options *BleveOptions) {
+        options.IndexMinUpdateInterval = d
+    }
+}

 func TestBuildIndexExpiration(t *testing.T) {
     ns := resource.NamespacedResource{
         Namespace: "test",
@@ -1132,6 +1138,37 @@ func updateTestDocs(ns resource.NamespacedResource, docs int) resource.UpdateFn
     }
 }

+func updateTestDocsReturningMillisTimestamp(ns resource.NamespacedResource, docs int) (resource.UpdateFn, *atomic.Int64) {
+    cnt := 0
+    updateCalls := atomic.NewInt64(0)
+
+    return func(context context.Context, index resource.ResourceIndex, sinceRV int64) (newRV int64, updatedDocs int, _ error) {
+        now := time.Now()
+        updateCalls.Inc()
+
+        cnt++
+
+        var items []*resource.BulkIndexItem
+        for i := 0; i < docs; i++ {
+            items = append(items, &resource.BulkIndexItem{
+                Action: resource.ActionIndex,
+                Doc: &resource.IndexableDocument{
+                    Key: &resourcepb.ResourceKey{
+                        Namespace: ns.Namespace,
+                        Group: ns.Group,
+                        Resource: ns.Resource,
+                        Name: fmt.Sprintf("doc%d", i),
+                    },
+                    Title: fmt.Sprintf("Document %d (gen_%d)", i, cnt),
+                },
+            })
+        }
+
+        err := index.BulkIndex(&resource.BulkIndexRequest{Items: items})
+        return now.UnixMilli(), docs, err
+    }, updateCalls
+}

 func TestCleanOldIndexes(t *testing.T) {
     dir := t.TempDir()
@@ -1353,7 +1390,7 @@ func TestConcurrentIndexUpdateSearchAndRebuild(t *testing.T) {
     cancel()
     wg.Wait()

-    fmt.Println("Updates:", updates.Load(), "searches:", searches.Load(), "rebuilds:", rebuilds.Load())
+    t.Log("Updates:", updates.Load(), "searches:", searches.Load(), "rebuilds:", rebuilds.Load())
 }

 // Verify concurrent updates and searches work as expected.
@@ -1415,7 +1452,72 @@ func TestConcurrentIndexUpdateAndSearch(t *testing.T) {
     require.Greater(t, rvUpdatedByMultipleGoroutines, int64(0))
 }

+// Verify that concurrent updates and searches respect the minimum index update interval.
+func TestConcurrentIndexUpdateAndSearchWithIndexMinUpdateInterval(t *testing.T) {
+    ns := resource.NamespacedResource{
+        Namespace: "test",
+        Group: "group",
+        Resource: "resource",
+    }
+
+    const minInterval = 100 * time.Millisecond
+    be, _ := setupBleveBackend(t, withIndexMinUpdateInterval(minInterval))
+
+    updateFn, updateCalls := updateTestDocsReturningMillisTimestamp(ns, 5)
+    idx, err := be.BuildIndex(t.Context(), ns, 10 /* file based */, nil, "test", indexTestDocs(ns, 10, 100), updateFn, false)
+    require.NoError(t, err)
+
+    wg := sync.WaitGroup{}
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+
+    attemptedUpdates := atomic.NewInt64(0)
+
+    // Verify that each returned RV (unix timestamp in millis) is either the same as before, or at least minInterval later.
+    const searchConcurrency = 25
+    for i := 0; i < searchConcurrency; i++ {
+        wg.Add(1)
+        go func() {
+            defer wg.Done()
+
+            prevRV := int64(0)
+            for ctx.Err() == nil {
+                attemptedUpdates.Inc()
+
+                // We use t.Context() here to avoid getting errors from context cancellation.
+                rv, err := idx.UpdateIndex(t.Context(), "test")
+                require.NoError(t, err)
+
+                // Our update function returns a unix timestamp in millis. We expect it either not to change at all, or to change by roughly minInterval.
+                if prevRV > 0 {
+                    rvDiff := rv - prevRV
+                    if rvDiff == 0 {
+                        // OK
+                    } else {
+                        // Allow the returned RV difference to be within 10% of minInterval.
+                        require.InDelta(t, minInterval.Milliseconds(), rvDiff, float64(minInterval.Milliseconds())*0.10)
+                    }
+                }
+
+                prevRV = rv
+                require.Equal(t, int64(10), searchTitle(t, idx, "Document", 10, ns).TotalHits)
+            }
+        }()
+    }
+
+    // Run updates and searches for this time.
+    testTime := 1 * time.Second
+
+    time.Sleep(testTime)
+    cancel()
+    wg.Wait()
+
+    expectedUpdateCalls := int64(testTime / minInterval)
+    require.InDelta(t, expectedUpdateCalls, updateCalls.Load(), float64(expectedUpdateCalls/2))
+    require.Greater(t, attemptedUpdates.Load(), updateCalls.Load())
+
+    t.Log("Attempted updates:", attemptedUpdates.Load(), "update calls:", updateCalls.Load())
+}

 func TestIndexUpdateWithErrors(t *testing.T) {
     ns := resource.NamespacedResource{
         Namespace: "test",
@@ -41,13 +41,14 @@ func NewSearchOptions(
     }

     bleve, err := NewBleveBackend(BleveOptions{
         Root: root,
         FileThreshold: int64(cfg.IndexFileThreshold), // fewer than X items will use a memory index
         BatchSize: cfg.IndexMaxBatchSize, // This is the batch size for how many objects to add to the index at once
         IndexCacheTTL: cfg.IndexCacheTTL, // How long to keep the index cache in memory
         BuildVersion: cfg.BuildVersion,
         UseFullNgram: features.IsEnabledGlobally(featuremgmt.FlagUnifiedStorageUseFullNgram),
         OwnsIndex: ownsIndexFn,
+        IndexMinUpdateInterval: cfg.IndexMinUpdateInterval,
     }, tracer, indexMetrics)

     if err != nil {
@@ -55,14 +56,15 @@ func NewSearchOptions(
     }

     return resource.SearchOptions{
         Backend: bleve,
         Resources: docs,
         InitWorkerThreads: cfg.IndexWorkers,
         IndexRebuildWorkers: cfg.IndexRebuildWorkers,
         InitMinCount: cfg.IndexMinCount,
         DashboardIndexMaxAge: cfg.IndexRebuildInterval,
         MaxIndexAge: cfg.MaxFileIndexAge,
         MinBuildVersion: minVersion,
+        IndexMinUpdateInterval: cfg.IndexMinUpdateInterval,
     }, nil
 }
 return resource.SearchOptions{}, nil