Alerting: Keep the latest version of deleted rule in version table (#101481)

* add feature toggle alertRuleRestore
* Update delete rule to require UserUID, remove all versions and create "delete" version that holds information about who and when deleted the rule
This commit is contained in:
Yuri Tseretyan 2025-03-05 09:15:26 -05:00 committed by GitHub
parent da2c382d80
commit 374380d1f6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 215 additions and 39 deletions

View File

@ -117,6 +117,7 @@ Most [generally available](https://grafana.com/docs/release-life-cycle/#general-
| `improvedExternalSessionHandling` | Enables improved support for OAuth external sessions. After enabling this feature, users might need to re-authenticate themselves. |
| `elasticsearchCrossClusterSearch` | Enables cross cluster search in the Elasticsearch datasource |
| `improvedExternalSessionHandlingSAML` | Enables improved support for SAML external sessions. Ensure the NameID format is correctly configured in Grafana for SAML Single Logout to function properly. |
| `alertRuleRestore` | Enables the alert rule restore feature |
## Experimental feature toggles

View File

@ -255,4 +255,5 @@ export interface FeatureToggles {
newShareReportDrawer?: boolean;
rendererDisableAppPluginsPreload?: boolean;
assetSriChecks?: boolean;
alertRuleRestore?: boolean;
}

View File

@ -1782,6 +1782,13 @@ var (
Owner: grafanaFrontendOpsWG,
FrontendOnly: true,
},
{
Name: "alertRuleRestore",
Description: "Enables the alert rule restore feature",
Stage: FeatureStagePublicPreview,
Owner: grafanaAlertingSquad,
Expression: "true", // enabled by default
},
}
)

View File

@ -236,3 +236,4 @@ alertingRuleVersionHistoryRestore,GA,@grafana/alerting-squad,false,false,true
newShareReportDrawer,experimental,@grafana/sharing-squad,false,false,false
rendererDisableAppPluginsPreload,experimental,@grafana/sharing-squad,false,false,true
assetSriChecks,experimental,@grafana/frontend-ops,false,false,true
alertRuleRestore,preview,@grafana/alerting-squad,false,false,false

1 Name Stage Owner requiresDevMode RequiresRestart FrontendOnly
236 newShareReportDrawer experimental @grafana/sharing-squad false false false
237 rendererDisableAppPluginsPreload experimental @grafana/sharing-squad false false true
238 assetSriChecks experimental @grafana/frontend-ops false false true
239 alertRuleRestore preview @grafana/alerting-squad false false false

View File

@ -954,4 +954,8 @@ const (
// FlagAssetSriChecks
// Enables SRI checks for Grafana JavaScript assets
FlagAssetSriChecks = "assetSriChecks"
// FlagAlertRuleRestore
// Enables the alert rule restore feature
FlagAlertRuleRestore = "alertRuleRestore"
)

View File

@ -108,6 +108,22 @@
"frontend": true
}
},
{
"metadata": {
"name": "alertRuleRestore",
"resourceVersion": "1741127758142",
"creationTimestamp": "2025-03-04T22:29:36Z",
"annotations": {
"grafana.app/updatedTimestamp": "2025-03-04 22:35:58.1421143 +0000 UTC"
}
},
"spec": {
"description": "Enables the alert rule restore feature",
"stage": "preview",
"codeowner": "@grafana/alerting-squad",
"expression": "true"
}
},
{
"metadata": {
"name": "alertStateHistoryLokiOnly",

View File

@ -1930,8 +1930,9 @@ func createTestEnv(t *testing.T, testConfig string) testEnvironment {
Cfg: setting.UnifiedAlertingSettings{
BaseInterval: time.Second * 10,
},
FolderService: folderService,
Bus: bus.ProvideBus(tracing.InitializeTracerForTest()),
FolderService: folderService,
Bus: bus.ProvideBus(tracing.InitializeTracerForTest()),
FeatureToggles: featuremgmt.WithFeatures(),
}
user := &user.SignedInUser{
OrgID: 1,

View File

@ -157,7 +157,7 @@ func (srv RulerSrv) RouteDeleteAlertRules(c *contextmodel.ReqContext, namespaceU
rulesToDelete = append(rulesToDelete, uid...)
}
if len(rulesToDelete) > 0 {
err := srv.store.DeleteAlertRulesByUID(ctx, c.SignedInUser.GetOrgID(), rulesToDelete...)
err := srv.store.DeleteAlertRulesByUID(ctx, c.SignedInUser.GetOrgID(), ngmodels.NewUserUID(c.SignedInUser), rulesToDelete...)
if err != nil {
return err
}
@ -461,7 +461,7 @@ func (srv RulerSrv) updateAlertRulesInGroup(c *contextmodel.ReqContext, groupKey
UIDs = append(UIDs, rule.UID)
}
if err = srv.store.DeleteAlertRulesByUID(tranCtx, c.SignedInUser.GetOrgID(), UIDs...); err != nil {
if err = srv.store.DeleteAlertRulesByUID(tranCtx, c.SignedInUser.GetOrgID(), ngmodels.NewUserUID(c.SignedInUser), UIDs...); err != nil {
return fmt.Errorf("failed to delete rules: %w", err)
}
}

View File

@ -60,7 +60,7 @@ func TestRouteDeleteAlertRules(t *testing.T) {
deleteCommands := getRecordedCommand(ruleStore)
require.Len(t, deleteCommands, 1)
cmd := deleteCommands[0]
actualUIDs := cmd.Params[1].([]string)
actualUIDs := cmd.Params[2].([]string)
require.Len(t, actualUIDs, len(expectedRules))
for _, rule := range expectedRules {
require.Containsf(t, actualUIDs, rule.UID, "Rule %s was expected to be deleted but it wasn't", rule.UID)

View File

@ -28,7 +28,7 @@ type RuleStore interface {
// and return the map of uuid to id.
InsertAlertRules(ctx context.Context, user *ngmodels.UserUID, rules []ngmodels.AlertRule) ([]ngmodels.AlertRuleKeyWithId, error)
UpdateAlertRules(ctx context.Context, user *ngmodels.UserUID, rules []ngmodels.UpdateRule) error
DeleteAlertRulesByUID(ctx context.Context, orgID int64, ruleUID ...string) error
DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *ngmodels.UserUID, ruleUID ...string) error
// IncreaseVersionForAllRulesInNamespaces Increases version for all rules that have specified namespace uids
IncreaseVersionForAllRulesInNamespaces(ctx context.Context, orgID int64, namespaceUIDs []string) ([]ngmodels.AlertRuleKeyWithVersion, error)

View File

@ -556,7 +556,7 @@ func (service *AlertRuleService) persistDelta(ctx context.Context, user identity
})
}
}
if err := service.deleteRules(ctx, user.GetOrgID(), delta.Delete...); err != nil {
if err := service.deleteRules(ctx, user, delta.Delete...); err != nil {
return err
}
}
@ -749,7 +749,7 @@ func (service *AlertRuleService) DeleteAlertRule(ctx context.Context, user ident
// This is different from deleting groups. We delete the rules directly rather than persisting a delta here to keep the semantics the same.
// TODO: Either persist a delta here as a breaking change, or deprecate this endpoint in favor of the group endpoint.
return service.xact.InTransaction(ctx, func(ctx context.Context) error {
return service.deleteRules(ctx, user.GetOrgID(), rule)
return service.deleteRules(ctx, user, rule)
})
}
@ -775,18 +775,18 @@ func (service *AlertRuleService) checkLimitsTransactionCtx(ctx context.Context,
}
// deleteRules deletes a set of target rules and associated data, while checking for database consistency.
func (service *AlertRuleService) deleteRules(ctx context.Context, orgID int64, targets ...*models.AlertRule) error {
func (service *AlertRuleService) deleteRules(ctx context.Context, user identity.Requester, targets ...*models.AlertRule) error {
uids := make([]string, 0, len(targets))
for _, tgt := range targets {
if tgt != nil {
uids = append(uids, tgt.UID)
}
}
if err := service.ruleStore.DeleteAlertRulesByUID(ctx, orgID, uids...); err != nil {
if err := service.ruleStore.DeleteAlertRulesByUID(ctx, user.GetOrgID(), models.NewUserUID(user), uids...); err != nil {
return err
}
for _, uid := range uids {
if err := service.provenanceStore.DeleteProvenance(ctx, &models.AlertRule{UID: uid}, orgID); err != nil {
if err := service.provenanceStore.DeleteProvenance(ctx, &models.AlertRule{UID: uid}, user.GetOrgID()); err != nil {
// We failed to clean up the record, but this doesn't break things. Log it and move on.
service.log.Warn("Failed to delete provenance record for rule: %w", err)
}

View File

@ -1726,7 +1726,7 @@ func TestDeleteRuleGroup(t *testing.T) {
func TestDeleteRuleGroups(t *testing.T) {
orgID1 := rand.Int63()
orgID2 := rand.Int63()
u := &user.SignedInUser{OrgID: orgID1}
u := &user.SignedInUser{OrgID: orgID1, UserUID: "test-test"}
// Create groups across different orgs and namespaces
groupKey1 := models.AlertRuleGroupKey{
@ -1805,6 +1805,7 @@ func TestDeleteRuleGroups(t *testing.T) {
// Verify only rules from group1 in org1 were deleted
deletes := getDeletedRules(t, ruleStore)
require.Len(t, deletes, 1)
require.Equal(t, "test-test", deletes[0].userID)
require.ElementsMatch(t, getUIDs(rules1), deletes[0].uids)
})
@ -2045,8 +2046,9 @@ func getDeleteQueries(ruleStore *fakes.RuleStore) []fakes.GenericRecordedQuery {
}
type deleteRuleOperation struct {
orgID int64
uids []string
orgID int64
userID string
uids []string
}
func getDeletedRules(t *testing.T, ruleStore *fakes.RuleStore) []deleteRuleOperation {
@ -2058,12 +2060,20 @@ func getDeletedRules(t *testing.T, ruleStore *fakes.RuleStore) []deleteRuleOpera
orgID, ok := q.Params[0].(int64)
require.True(t, ok, "orgID parameter should be int64")
uids, ok := q.Params[1].([]string)
uid := ""
userUID, ok := q.Params[1].(*models.UserUID)
require.True(t, ok, "parameter should be UserUID")
if userUID != nil {
uid = string(*userUID)
}
uids, ok := q.Params[2].([]string)
require.True(t, ok, "uids parameter should be []string")
operations = append(operations, deleteRuleOperation{
orgID: orgID,
uids: uids,
orgID: orgID,
userID: uid,
uids: uids,
})
}
return operations
@ -2077,9 +2087,10 @@ func createAlertRuleService(t *testing.T, folderService folder.Service) AlertRul
Cfg: setting.UnifiedAlertingSettings{
BaseInterval: time.Second * 10,
},
Logger: log.NewNopLogger(),
FolderService: folderService,
Bus: bus.ProvideBus(tracing.InitializeTracerForTest()),
Logger: log.NewNopLogger(),
FolderService: folderService,
Bus: bus.ProvideBus(tracing.InitializeTracerForTest()),
FeatureToggles: featuremgmt.WithFeatures(),
}
// store := fakes.NewRuleStore(t)
quotas := MockQuotaChecker{}

View File

@ -35,7 +35,7 @@ type RuleStore interface {
GetRuleGroupInterval(ctx context.Context, orgID int64, namespaceUID string, ruleGroup string) (int64, error)
InsertAlertRules(ctx context.Context, user *models.UserUID, rule []models.AlertRule) ([]models.AlertRuleKeyWithId, error)
UpdateAlertRules(ctx context.Context, user *models.UserUID, rule []models.UpdateRule) error
DeleteAlertRulesByUID(ctx context.Context, orgID int64, ruleUID ...string) error
DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *models.UserUID, ruleUID ...string) error
GetAlertRulesGroupByRuleUID(ctx context.Context, query *models.GetAlertRulesGroupByRuleUIDQuery) ([]*models.AlertRule, error)
}

View File

@ -39,7 +39,10 @@ var (
)
// DeleteAlertRulesByUID is a handler for deleting an alert rule.
func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, ruleUID ...string) error {
func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *ngmodels.UserUID, ruleUID ...string) error {
if len(ruleUID) == 0 {
return nil
}
logger := st.Logger.New("org_id", orgID, "rule_uids", ruleUID)
return st.SQLStore.WithTransactionalDbSession(ctx, func(sess *db.Session) error {
rows, err := sess.Table(alertRule{}).Where("org_id = ?", orgID).In("uid", ruleUID).Delete(alertRule{})
@ -57,12 +60,6 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, ruleUI
})
}
rows, err = sess.Table(alertRuleVersion{}).Where("rule_org_id = ?", orgID).In("rule_uid", ruleUID).Delete(alertRule{})
if err != nil {
return err
}
logger.Debug("Deleted alert rule versions", "count", rows)
rows, err = sess.Table("alert_instance").Where("rule_org_id = ?", orgID).In("rule_uid", ruleUID).Delete(alertRule{})
if err != nil {
return err
@ -75,10 +72,78 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, ruleUI
}
logger.Debug("Deleted alert rule state", "count", rows)
var versions []alertRuleVersion
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) {
versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID)
if err != nil {
logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err)
}
for idx := range versions {
version := &versions[idx]
version.ID = 0
version.RuleUID = ""
version.Created = TimeNow()
version.CreatedBy = nil
if user != nil {
version.CreatedBy = util.Pointer(string(*user))
}
}
}
rows, err = sess.Table(alertRuleVersion{}).Where("rule_org_id = ?", orgID).In("rule_uid", ruleUID).Delete(alertRule{})
if err != nil {
return err
}
logger.Debug("Deleted alert rule versions", "count", rows)
if len(versions) > 0 {
_, err = sess.Insert(versions)
if err != nil {
return fmt.Errorf("failed to persist deleted rule for recovery: %w", err)
}
logger.Debug("Inserted alert rule versions for recovery", "count", len(versions))
}
return nil
})
}
func (st DBstore) getLatestVersionOfRulesByUID(ctx context.Context, orgID int64, ruleUIDs []string) ([]alertRuleVersion, error) {
var result []alertRuleVersion
err := st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
args, in := getINSubQueryArgs(ruleUIDs)
// take only the latest versions of each rule by GUID
rows, err := sess.SQL(fmt.Sprintf(`
SELECT v1.* FROM alert_rule_version AS v1
INNER JOIN (
SELECT rule_guid, MAX(id) AS id
FROM alert_rule_version
WHERE rule_org_id = ?
AND rule_uid IN (%s)
GROUP BY rule_guid
) AS v2 ON v1.rule_guid = v2.rule_guid AND v1.id = v2.id
`, strings.Join(in, ",")), append([]any{orgID}, args...)...).Rows(new(alertRuleVersion))
if err != nil {
return err
}
result = make([]alertRuleVersion, 0, len(ruleUIDs))
for rows.Next() {
rule := new(alertRuleVersion)
err = rows.Scan(rule)
if err != nil {
st.Logger.Error("Invalid rule version found in DB store, ignoring it", "func", "getLatestVersionOfRulesByUID", "error", err)
continue
}
result = append(result, *rule)
}
return nil
})
if err != nil {
return nil, err
}
return result, nil
}
// IncreaseVersionForAllRulesInNamespaces Increases version for all rules that have specified namespace. Returns all rules that belong to the namespaces
func (st DBstore) IncreaseVersionForAllRulesInNamespaces(ctx context.Context, orgID int64, namespaceUIDs []string) ([]ngmodels.AlertRuleKeyWithVersion, error) {
var keys []ngmodels.AlertRuleKeyWithVersion
@ -820,7 +885,7 @@ func (st DBstore) DeleteInFolders(ctx context.Context, orgID int64, folderUIDs [
}
}
if err := st.DeleteAlertRulesByUID(ctx, orgID, uids...); err != nil {
if err := st.DeleteAlertRulesByUID(ctx, orgID, ngmodels.NewUserUID(user), uids...); err != nil {
return err
}
}

View File

@ -27,6 +27,7 @@ import (
"github.com/grafana/grafana/pkg/services/folder/folderimpl"
"github.com/grafana/grafana/pkg/services/ngalert/testutil"
"github.com/grafana/grafana/pkg/services/org"
"github.com/grafana/grafana/pkg/services/sqlstore"
"github.com/grafana/grafana/pkg/services/user"
"github.com/grafana/grafana/pkg/infra/db"
@ -685,7 +686,6 @@ func TestIntegration_DeleteInFolder(t *testing.T) {
b := &fakeBus{}
logger := log.New("test-dbstore")
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
rule := createRule(t, store, nil)
t.Run("should not be able to delete folder without permissions to delete rules", func(t *testing.T) {
@ -714,6 +714,9 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
sqlStore := db.InitTestDB(t)
cfg := setting.NewCfg()
cfg.UnifiedAlerting.BaseInterval = 1 * time.Second
cfg.UnifiedAlerting.RuleVersionRecordLimit = -1
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
logger := log.New("test-dbstore")
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, &fakeBus{})
@ -742,7 +745,7 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
called = true
return nil
}
err := store.DeleteAlertRulesByUID(context.Background(), rule.OrgID, rule.UID)
err := store.DeleteAlertRulesByUID(context.Background(), rule.OrgID, &models.AlertingUserUID, rule.UID)
require.NoError(t, err)
require.True(t, called)
})
@ -769,7 +772,7 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
require.Len(t, savedInstances, 1)
// Delete the rule
err = store.DeleteAlertRulesByUID(context.Background(), rule.OrgID, rule.UID)
err = store.DeleteAlertRulesByUID(context.Background(), rule.OrgID, &models.AlertingUserUID, rule.UID)
require.NoError(t, err)
// Now there should be no alert rule state
@ -780,6 +783,71 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
require.NoError(t, err)
require.Empty(t, savedInstances)
})
t.Run("should remove all version and insert one with empty rule_uid", func(t *testing.T) {
orgID := int64(rand.Intn(1000))
gen = gen.With(gen.WithOrgID(orgID))
// Create a new store to pass the custom bus to check the signal
b := &fakeBus{}
logger := log.New("test-dbstore")
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
uids := make([]string, 0, len(result))
for _, rule := range result {
uids = append(uids, rule.UID)
}
require.NoError(t, err)
rules, err := store.ListAlertRules(context.Background(), &models.ListAlertRulesQuery{OrgID: orgID, RuleUIDs: uids})
require.NoError(t, err)
updates := make([]models.UpdateRule, 0, len(rules))
for _, rule := range rules {
rule2 := models.CopyRule(rule, gen.WithTitle(util.GenerateShortUID()))
updates = append(updates, models.UpdateRule{
Existing: rule,
New: *rule2,
})
}
err = store.UpdateAlertRules(context.Background(), &models.AlertingUserUID, updates)
require.NoError(t, err)
versions, err := store.GetAlertRuleVersions(context.Background(), orgID, rules[0].GUID)
require.NoError(t, err)
require.Len(t, versions, 2)
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uids...)
require.NoError(t, err)
guids := make([]string, 0, len(rules))
for _, rule := range rules {
guids = append(guids, rule.GUID)
}
_ = sqlStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
var versions []alertRuleVersion
err = sess.Table(alertRuleVersion{}).Where(`rule_uid = ''`).In("rule_guid", guids).Find(&versions)
require.NoError(t, err)
require.Len(t, versions, len(rules)) // should be one version per GUID
for _, version := range versions {
assert.Equal(t, "", version.RuleUID)
assert.Equal(t, "test", *version.CreatedBy)
// Remove the GUID from guids
for i, guid := range guids {
if guid == version.RuleGUID {
guids = append(guids[:i], guids[i+1:]...)
break
}
}
}
// Ensure that guids is empty
assert.Empty(t, guids, "Some rules are left unrecoverable")
return nil
})
})
}
func TestIntegrationInsertAlertRules(t *testing.T) {
@ -1894,11 +1962,12 @@ func createTestStore(
bus bus.Bus,
) *DBstore {
return &DBstore{
SQLStore: sqlStore,
FolderService: folderService,
Logger: logger,
Cfg: cfg,
Bus: bus,
SQLStore: sqlStore,
FolderService: folderService,
Logger: logger,
Cfg: cfg,
Bus: bus,
FeatureToggles: featuremgmt.WithFeatures(),
}
}

View File

@ -102,10 +102,10 @@ func (f *RuleStore) GetRecordedCommands(predicate func(cmd any) (any, bool)) []a
return result
}
func (f *RuleStore) DeleteAlertRulesByUID(_ context.Context, orgID int64, UIDs ...string) error {
func (f *RuleStore) DeleteAlertRulesByUID(_ context.Context, orgID int64, user *models.UserUID, UIDs ...string) error {
f.RecordedOps = append(f.RecordedOps, GenericRecordedQuery{
Name: "DeleteAlertRulesByUID",
Params: []any{orgID, UIDs},
Params: []any{orgID, user, UIDs},
})
rules := f.Rules[orgID]