From 2fef8e6f2ce8fd55ca7e3f2e0944b906b9c18e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Philippe=20Qu=C3=A9m=C3=A9ner?= Date: Wed, 17 Aug 2022 18:53:36 +0200 Subject: [PATCH] Alerting: add docs for file provisioning (#53101) --- conf/provisioning/alerting/sample.yaml | 186 ++++++ .../administration/provisioning/_index.md | 582 ++++++++++++++++++ docs/sources/developers/http_api/admin.md | 3 + packaging/deb/control/postinst | 1 + packaging/rpm/control/postinst | 1 + 5 files changed, 773 insertions(+) create mode 100644 conf/provisioning/alerting/sample.yaml diff --git a/conf/provisioning/alerting/sample.yaml b/conf/provisioning/alerting/sample.yaml new file mode 100644 index 00000000000..ae13cf68274 --- /dev/null +++ b/conf/provisioning/alerting/sample.yaml @@ -0,0 +1,186 @@ +# # config file version +apiVersion: 1 + +# # List of rule groups to import or update +# groups: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the rule group +# name: my_rule_group +# # name of the folder the rule group will be stored in +# folder: my_first_folder +# # interval of the rule group evaluation +# interval: 60s +# # list of rules that are part of the rule group +# rules: +# # unique identifier for the rule +# - uid: my_id_1 +# # title of the rule, will be displayed in the UI +# title: my_first_rule +# # query used for the condition +# condition: A +# # list of query objects that should be executed on each +# # evaluation - should be obtained via the API +# data: +# - refId: A +# datasourceUid: "-100" +# model: +# conditions: +# - evaluator: +# params: +# - 3 +# type: gt +# operator: +# type: and +# query: +# params: +# - A +# reducer: +# type: last +# type: query +# datasource: +# type: __expr__ +# uid: "-100" +# expression: 1==0 +# intervalMs: 1000 +# maxDataPoints: 43200 +# refId: A +# type: math +# # UID of a dashboard that the alert rule should be linked to +# dashboardUid: my_dashboard +# # ID of the panel that the alert rule should be linked to +# panelId: 
123 +# # state of the alert rule when no data is returned +# # possible values: "NoData", "Alerting", "OK", default = NoData +# noDataState: Alerting +# # state of the alert rule when the query execution +# # fails - possible values: "Error", "Alerting", "OK" +# # default = Alerting +# # how long the alert condition should be breached before Firing. Before this time has elapsed, the alert is considered to be Pending +# for: 60s +# # > map of strings to attach arbitrary custom data +# annotations: +# some_key: some_value +# # map of strings to filter and +# # route alerts +# labels: +# team: sre_team_1 + +# # List of alert rule UIDs that should be deleted +# deleteRules: +# # organization ID, default = 1 +# - orgId: 1 +# # unique identifier for the rule +# uid: my_id_1 + +# # List of contact points to import or update +# contactPoints: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the contact point +# name: cp_1 +# receivers: +# # unique identifier for the receiver +# - uid: first_uid +# # type of the receiver +# type: prometheus-alertmanager +# # settings for the specific receiver type +# settings: +# url: http://test:9000 + +# # List of receivers that should be deleted +# deleteContactPoints: +# - orgId: 1 +# uid: first_uid + +# # List of notification policies to import or update +# policies: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the receiver that should be used for this route +# receiver: grafana-default-email +# # > The labels by which incoming alerts are grouped together. For example, +# # multiple alerts coming in for cluster=A and alertname=LatencyHigh would +# # be batched into a single group. +# # +# # To aggregate by all possible labels, use the special value '...' as +# # the sole label name, for example: +# # group_by: ['...'] +# # This effectively disables aggregation entirely, passing through all +# # alerts as-is. 
This is unlikely to be what you want, unless you have +# # a very low alert volume or your upstream notification system performs +# # its own grouping. +# group_by: +# - grafana_folder +# - alertname +# # a list of matchers that an alert has to fulfill to match the node +# matchers: +# - alertname = Watchdog +# - severity =~ "warning|critical" +# # Times when the route should be muted. These must match the name of a +# # mute time interval. +# # Additionally, the root node cannot have any mute times. +# # When a route is muted it will not send any notifications, but +# # otherwise acts normally (including ending the route-matching process +# # if the `continue` option is not set) +# mute_time_intervals: +# - abc +# # How long to initially wait to send a notification for a group +# # of alerts. Allows to collect more initial alerts for the same group. +# # (Usually ~0s to a few minutes), default = 30s +# group_wait: 30s +# # How long to wait before sending a notification about new alerts that +# # are added to a group of alerts for which an initial notification has +# # already been sent. (Usually ~5m or more), default = 5m +# group_interval: 5m +# # How long to wait before sending a notification again if it has already +# # been sent successfully for an alert. (Usually ~3h or more), default = 4h +# repeat_interval: 4h +# # Zero or more child routes +# routes: +# ... 
+ +# # List of orgIds that should be reset to the default policy +# resetPolicies: +# - 1 + +# # List of templates to import or update +# templates: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the template, must be unique +# name: my_first_template +# # content of the template +# template: Alerting with a custom text template + +# # List of templates that should be deleted +# deleteTemplates: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the template, must be unique +# name: my_first_template + + +# # List of mute time intervals to import or update +# muteTimes: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the mute time interval, must be unique +# name: mti_1 +# # time intervals that should trigger the muting +# # refer to https://prometheus.io/docs/alerting/latest/configuration/#time_interval-0 +# time_intervals: +# - times: +# - start_time: '06:00' +# end_time: '23:59' +# weekdays: ['monday:wednesday', 'saturday', 'sunday'] +# months: ['1:3', 'may:august', 'december'] +# years: ['2020:2022', '2030'] +# days_of_month: ['1:5', '-3:-1'] + +# # List of mute time intervals that should be deleted +# deleteMuteTimes: +# # organization ID, default = 1 +# - orgId: 1 +# # name of the mute time interval, must be unique +# name: mti_1 diff --git a/docs/sources/administration/provisioning/_index.md b/docs/sources/administration/provisioning/_index.md index 08d3eb5ba7a..2d6cd741338 100644 --- a/docs/sources/administration/provisioning/_index.md +++ b/docs/sources/administration/provisioning/_index.md @@ -364,8 +364,590 @@ providers: > **Note:** To provision dashboards to the General folder, store them in the root of your `path`. +## Alerting + +You can manage alert objects in Grafana by adding one or more YAML or JSON +configuration files in the [`provisioning/alerting`]({{< relref "../../setup-grafana/configure-grafana/" >}}) +directory. Those files will be applied when starting Grafana. 
When Grafana +is running, it's possible to do a hot reload using the +[Admin API]({{< relref "../../developers/http_api/admin/#reload-provisioning-configurations" >}}). + +### Rules + +Creation + +```yaml +# config file version +apiVersion: 1 + +# List of rule groups to import or update +groups: + # organization ID, default = 1 + - orgId: 1 + # name of the rule group + name: my_rule_group + # name of the folder the rule group will be stored in + folder: my_first_folder + # interval that the rule group should be evaluated at + interval: 60s + # list of rules that are part of the rule group + rules: + # unique identifier for the rule + - uid: my_id_1 + # title of the rule that will be displayed in the UI + title: my_first_rule + # which query should be used for the condition + condition: A + # list of query objects that should be executed on each + # evaluation - should be obtained through the API + data: + - refId: A + datasourceUid: '-100' + model: + conditions: + - evaluator: + params: + - 3 + type: gt + operator: + type: and + query: + params: + - A + reducer: + type: last + type: query + datasource: + type: __expr__ + uid: '-100' + expression: 1==0 + intervalMs: 1000 + maxDataPoints: 43200 + refId: A + type: math + # UID of a dashboard that the alert rule should be linked to + dashboardUid: my_dashboard + # ID of the panel that the alert rule should be linked to + panelId: 123 + # the state the alert rule will have when no data is returned + # possible values: "NoData", "Alerting", "OK", default = NoData + noDataState: Alerting + # the state the alert rule will have when the query execution + # failed - possible values: "Error", "Alerting", "OK" + # default = Alerting + # how long the alert condition must be breached before the alert fires + for: 60s + # a map of strings to attach arbitrary custom data + annotations: + some_key: some_value + # a map of strings that can be used to filter and + # route alerts + labels: + team: sre_team_1 +``` + +Deletion + +```yaml +# config file version 
+apiVersion: 1 + +# List of alert rule UIDs that should be deleted +deleteRules: + # organization ID, default = 1 + - orgId: 1 + # unique identifier for the rule + uid: my_id_1 +``` + +### Contact points + +Creation + +```yaml +# config file version +apiVersion: 1 + +# List of contact points to import or update +contactPoints: + # organization ID, default = 1 + - orgId: 1 + # name of the contact point + name: cp_1 + receivers: + # unique identifier for the receiver + - uid: first_uid + # type of the receiver + type: prometheus-alertmanager + # settings for the specific receiver type + settings: + url: http://test:9000 +``` + +Deletion + +```yaml +# config file version +apiVersion: 1 + +# List of receivers that should be deleted +deleteContactPoints: + # organization ID, default = 1 + - orgId: 1 + # unique identifier for the receiver + uid: first_uid +``` + +#### Settings + +Here we showcase what kind of settings you can have for the different +contact point types. + +##### Alertmanager + +```yaml +type: prometheus-alertmanager +settings: + # + url: http://localhost:9093 + # + basicAuthUser: abc + # + basicAuthPassword: abc123 +``` + +##### DingDing + +```yaml +type: dingding +settings: + # + url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxx + # options: link, actionCard + msgType: link + # + message: | + {{ template "default.message" . }} +``` + +##### Discord + +```yaml +type: discord +settings: + # + url: https://discord/webhook + # + avatar_url: https://my_avatar + # + use_discord_username: Grafana + # + message: | + {{ template "default.message" . }} +``` + +##### E-Mail + +```yaml +type: email +settings: + # + addresses: me@example.com;you@example.com + # + singleEmail: false + # + message: my optional message to include + # + subject: | + {{ template "default.title" . }} +``` + +##### Google Hangouts Chat + +```yaml +type: googlechat +settings: + # + url: https://google/webhook + # + message: | + {{ template "default.message" . 
}} +``` + +##### Kafka + +```yaml +type: kafka +settings: + # + kafkaRestProxy: http://localhost:8082 + # + kafkaTopic: topic1 +``` + +##### LINE + +```yaml +type: line +settings: + # + token: xxx +``` + +##### Microsoft Teams + +```yaml +type: teams +settings: + # + url: https://ms_teams_url + # + title: | + {{ template "default.title" . }} + # + sectiontitle: '' + # + message: | + {{ template "default.message" . }} +``` + +##### OpsGenie + +```yaml +type: opsgenie +settings: + # + apiKey: xxx + # + apiUrl: https://api.opsgenie.com/v2/alerts + # + message: | + {{ template "default.title" . }} + # + description: some descriptive description + # + autoClose: false + # + overridePriority: false + # options: tags, details, both + sendTagsAs: both +``` + +##### PagerDuty + +```yaml +type: pagerduty +settings: + # + integrationKey: XXX + # options: critical, error, warning, info + severity: critical + # + class: ping failure + # + component: Grafana + # + group: app-stack + # + summary: | + {{ template "default.message" . }} +``` + +##### Pushover + +```yaml +type: pushover +settings: + # + apiToken: XXX + # + userKey: user1,user2 + # + device: device1,device2 + # options (high to low): 2,1,0,-1,-2 + priority: '2' + # + retry: '30' + # + expire: '120' + # + sound: siren + # + okSound: magic + # + message: | + {{ template "default.message" . }} +``` + +##### Slack + +```yaml +type: slack +settings: + # + recipient: alerting-dev + # + token: xxx + # + username: grafana_bot + # + icon_emoji: heart + # + icon_url: https://icon_url + # + mentionUsers: user_1,user_2 + # + mentionGroups: group_1,group_2 + # options: here, channel + mentionChannel: here + # Optionally provide a Slack incoming webhook URL for sending messages, in this case the token isn't necessary + url: https://some_webhook_url + # + endpointUrl: https://custom_url/api/chat.postMessage + # + title: | + {{ template "slack.default.title" . }} + text: | + {{ template "slack.default.text" . 
}} +``` + +##### Sensu Go + +```yaml +type: sensugo +settings: + # + url: http://sensu-api.local:8080 + # + apikey: xxx + # + entity: default + # + check: default + # + handler: some_handler + # + namespace: default + # + message: | + {{ template "default.message" . }} +``` + +##### Telegram + +```yaml +type: telegram +settings: + # + bottoken: xxx + # + chatid: some_chat_id + # + message: | + {{ template "default.message" . }} +``` + +##### Threema Gateway + +```yaml +type: threema +settings: + # + api_secret: xxx + # + gateway_id: A5K94S9 + # + recipient_id: A9R4KL4S +``` + +##### VictorOps + +```yaml +type: victorops +settings: + # + url: XXX + # options: CRITICAL, WARNING + messageType: CRITICAL +``` + +##### Webhook + +```yaml +type: webhook +settings: + # + url: https://endpoint_url + # options: POST, PUT + httpMethod: POST + # + username: abc + # + password: abc123 + # + authorization_scheme: Bearer + # + authorization_credentials: abc123 + # + maxAlerts: '10' +``` + +##### WeCom + +```yaml +type: wecom +settings: + # + url: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxxx + # + message: | + {{ template "default.message" . }} + # + title: | + {{ template "default.title" . }} +``` + +### Notification policies + +Create + +```yaml +# config file version +apiVersion: 1 + +# List of notification policies +policies: + # organization ID, default = 1 + - orgId: 1 + # name of the contact point that should be used for this route + receiver: grafana-default-email + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use the special value '...' as + # the sole label name, for example: + # group_by: ['...'] + # This effectively disables aggregation entirely, passing through all + # alerts as-is. 
This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. + group_by: ['...'] + # a list of matchers that an alert has to fulfill to match the node + matchers: + - alertname = Watchdog + - severity =~ "warning|critical" + # Times when the route should be muted. These must match the name of a + # mute time interval. + # Additionally, the root node cannot have any mute times. + # When a route is muted it will not send any notifications, but + # otherwise acts normally (including ending the route-matching process + # if the `continue` option is not set) + mute_time_intervals: + - abc + # How long to initially wait to send a notification for a group + # of alerts. Allows to collect more initial alerts for the same group. + # (Usually ~0s to a few minutes), default = 30s + group_wait: 30s + # How long to wait before sending a notification about new alerts that + # are added to a group of alerts for which an initial notification has + # already been sent. (Usually ~5m or more), default = 5m + group_interval: 5m + # How long to wait before sending a notification again if it has already + # been sent successfully for an alert. (Usually ~3h or more), default = 4h + repeat_interval: 4h + # Zero or more child routes + # routes: + # ... 
+``` + +Reset + +```yaml +# config file version +apiVersion: 1 + +# List of orgIds that should be reset to the default policy +resetPolicies: + - 1 +``` + +### Templates + +Creation + +```yaml +# config file version +apiVersion: 1 + +# List of templates to import or update +templates: + # organization ID, default = 1 + - orgId: 1 + # name of the template, must be unique + name: my_first_template + # content of the template + template: Alerting with a custom text template +``` + +Deletion + +```yaml +# config file version +apiVersion: 1 + +# List of templates that should be deleted +deleteTemplates: + # organization ID, default = 1 + - orgId: 1 + # name of the template, must be unique + name: my_first_template +``` + +### Mute timings + +Creation + +```yaml +# config file version +apiVersion: 1 + +# List of mute time intervals to import or update +muteTimes: + # organization ID, default = 1 + - orgId: 1 + # name of the mute time interval, must be unique + name: mti_1 + # time intervals that should trigger the muting + # refer to https://prometheus.io/docs/alerting/latest/configuration/#time_interval-0 + time_intervals: + - times: + - start_time: '06:00' + end_time: '23:59' + weekdays: ['monday:wednesday', 'saturday', 'sunday'] + months: ['1:3', 'may:august', 'december'] + years: ['2020:2022', '2030'] + days_of_month: ['1:5', '-3:-1'] +``` + +Deletion + +```yaml +# config file version +apiVersion: 1 + +# List of mute time intervals that should be deleted +deleteMuteTimes: + # organization ID, default = 1 + - orgId: 1 + # name of the mute time interval, must be unique + name: mti_1 +``` + ## Alert Notification Channels +> **Note:** Alert Notification Channels are part of legacy alerting, which is deprecated and will be removed in Grafana 10. Use Contact Points in the alerting section above. 
+ Alert Notification Channels can be provisioned by adding one or more YAML config files in the [`provisioning/notifiers`](/administration/configuration/#provisioning) directory. Each config file can contain the following top-level fields: diff --git a/docs/sources/developers/http_api/admin.md b/docs/sources/developers/http_api/admin.md index f0890c5eab3..f4e5d87b419 100644 --- a/docs/sources/developers/http_api/admin.md +++ b/docs/sources/developers/http_api/admin.md @@ -650,6 +650,8 @@ Content-Type: application/json `POST /api/admin/provisioning/access-control/reload` +`POST /api/admin/provisioning/alerting/reload` + Reloads the provisioning config files for specified type and provision entities again. It won't return until the new provisioned entities are already stored in the database. In case of dashboards, it will stop polling for changes in dashboard files and then restart it with new configurations after returning. @@ -667,6 +669,7 @@ See note in the [introduction]({{< ref "#admin-api" >}}) for an explanation. | provisioning:reload | provisioners:datasources | datasources | | provisioning:reload | provisioners:plugins | plugins | | provisioning:reload | provisioners:notifications | notifications | +| provisioning:reload | provisioners:alerting | alerting | **Example Request**: diff --git a/packaging/deb/control/postinst b/packaging/deb/control/postinst index 95d7d89483a..952e71e2239 100755 --- a/packaging/deb/control/postinst +++ b/packaging/deb/control/postinst @@ -54,6 +54,7 @@ case "$1" in if [ ! 
-d $PROVISIONING_CFG_DIR/alerting ]; then mkdir -p $PROVISIONING_CFG_DIR/alerting + cp /usr/share/grafana/conf/provisioning/alerting/sample.yaml $PROVISIONING_CFG_DIR/alerting/sample.yaml fi # configuration files should not be modifiable by grafana user, as this can be a security issue diff --git a/packaging/rpm/control/postinst b/packaging/rpm/control/postinst index 739babd6a57..bde2accb9bd 100755 --- a/packaging/rpm/control/postinst +++ b/packaging/rpm/control/postinst @@ -68,6 +68,7 @@ if [ $1 -eq 1 ] ; then if [ ! -d $PROVISIONING_CFG_DIR/alerting ]; then mkdir -p $PROVISIONING_CFG_DIR/alerting + cp /usr/share/grafana/conf/provisioning/alerting/sample.yaml $PROVISIONING_CFG_DIR/alerting/sample.yaml fi # Set user permissions on /var/log/grafana, /var/lib/grafana