From 7a2fbad0c85bbdaec55c91c033d099afd47d99df Mon Sep 17 00:00:00 2001 From: Fayzal Ghantiwala <114010985+fayzal-g@users.noreply.github.com> Date: Tue, 14 May 2024 14:21:42 +0100 Subject: [PATCH] Alerting: Add options to configure TLS for HA using Redis (#87567) * Add Alerting HA Redis Client TLS configs * Add test to ping miniredis with mTLS * Update .ini files and docs * Add tests for unified alerting ha redis TLS settings * Fix malformed go.sum * Add modowner * Fix lint error * Update docs and use dstls config --- conf/defaults.ini | 26 ++++++ conf/sample.ini | 29 ++++++ .../configure-high-availability/_index.md | 7 +- go.mod | 1 + go.sum | 2 + go.work.sum | 5 ++ .../ngalert/notifier/multiorg_alertmanager.go | 16 ++-- pkg/services/ngalert/notifier/redis_peer.go | 21 ++++- .../ngalert/notifier/redis_peer_test.go | 89 +++++++++++++++++++ pkg/setting/setting_unified_alerting.go | 11 +++ pkg/setting/setting_unified_alerting_test.go | 47 ++++++++++ 11 files changed, 242 insertions(+), 12 deletions(-) create mode 100644 pkg/services/ngalert/notifier/redis_peer_test.go diff --git a/conf/defaults.ini b/conf/defaults.ini index a3f53f4357a..cecf8934acf 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1190,6 +1190,32 @@ ha_redis_peer_name = # The maximum number of simultaneous redis connections. ha_redis_max_conns = 5 +# Enable TLS on the client used to communicate with the redis server. This should be set to true +# if using any of the other ha_redis_tls_* fields. +ha_redis_tls_enabled = false + +# Path to the PEM-encoded TLS client certificate file used to authenticate with the redis server. +ha_redis_tls_cert_path = + +# Path to the PEM-encoded TLS private key file. Also requires the client certificate to be configured. +ha_redis_tls_key_path = + +# Path to the PEM-encoded CA certificates file. +ha_redis_tls_ca_path = + +# Overrides the expected name of the redis server certificate. 
+ha_redis_tls_server_name = + +# Skips validating the redis server certificate. +ha_redis_tls_insecure_skip_verify = + +# Overrides the default TLS cipher suite list. +ha_redis_tls_cipher_suites = + +# Overrides the default minimum TLS version. +# Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 +ha_redis_tls_min_version = + # Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. It is assumed other Grafana instances are also running on the same port. ha_listen_address = "0.0.0.0:9094" diff --git a/conf/sample.ini b/conf/sample.ini index c9d5b89e50a..6336e427f4b 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1101,6 +1101,35 @@ # provided, a random one will be generated. ;ha_redis_peer_name = +# The maximum number of simultaneous redis connections. +# ha_redis_max_conns = 5 + +# Enable TLS on the client used to communicate with the redis server. This should be set to true +# if using any of the other ha_redis_tls_* fields. +# ha_redis_tls_enabled = false + +# Path to the PEM-encoded TLS client certificate file used to authenticate with the redis server. +# ha_redis_tls_cert_path = + +# Path to the PEM-encoded TLS private key file. Also requires the client certificate to be configured. +# ha_redis_tls_key_path = + +# Path to the PEM-encoded CA certificates file. +# ha_redis_tls_ca_path = + +# Overrides the expected name of the redis server certificate. +# ha_redis_tls_server_name = + +# Skips validating the redis server certificate. +# ha_redis_tls_insecure_skip_verify = + +# Overrides the default TLS cipher suite list. +# ha_redis_tls_cipher_suites = + +# Overrides the default minimum TLS version. +# Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 +# ha_redis_tls_min_version = + # Listen address/hostname and port to receive unified alerting messages for other Grafana instances. The port is used for both TCP and UDP. 
It is assumed other Grafana instances are also running on the same port. The default value is `0.0.0.0:9094`. ;ha_listen_address = "0.0.0.0:9094" diff --git a/docs/sources/alerting/set-up/configure-high-availability/_index.md b/docs/sources/alerting/set-up/configure-high-availability/_index.md index 04594a87554..99c2c76c8b3 100644 --- a/docs/sources/alerting/set-up/configure-high-availability/_index.md +++ b/docs/sources/alerting/set-up/configure-high-availability/_index.md @@ -61,11 +61,12 @@ Since gossiping of notifications and silences uses both TCP and UDP port `9094`, As an alternative to Memberlist, you can use Redis for high availability. This is useful if you want to have a central database for HA and cannot support the meshing of all Grafana servers. -1. Make sure you have a redis server that supports pub/sub. If you use a proxy in front of your Redis cluster, make sure the proxy supports pub/sub. +1. Make sure you have a Redis server that supports pub/sub. If you use a proxy in front of your Redis cluster, make sure the proxy supports pub/sub. 1. In your custom configuration file ($WORKING_DIR/conf/custom.ini), go to the [unified_alerting] section. 1. Set `ha_redis_address` to the Redis server address Grafana should connect to. -1. [Optional] Set the username and password if authentication is enabled on the redis server using `ha_redis_username` and `ha_redis_password`. -1. [Optional] Set `ha_redis_prefix` to something unique if you plan to share the redis server with multiple Grafana instances. +1. [Optional] Set the username and password if authentication is enabled on the Redis server using `ha_redis_username` and `ha_redis_password`. +1. [Optional] Set `ha_redis_prefix` to something unique if you plan to share the Redis server with multiple Grafana instances. +1. [Optional] Set `ha_redis_tls_enabled` to `true` and configure the corresponding `ha_redis_tls_*` fields to secure communications between Grafana and Redis with Transport Layer Security (TLS). 
The following metrics can be used for meta monitoring, exposed by the `/metrics` endpoint in Grafana: diff --git a/go.mod b/go.mod index ce39e1c84a5..082a6bda69b 100644 --- a/go.mod +++ b/go.mod @@ -127,6 +127,7 @@ require ( github.com/lib/pq v1.10.9 // @grafana/grafana-backend-group github.com/linkedin/goavro/v2 v2.10.0 // @grafana/grafana-backend-group github.com/m3db/prometheus_remote_client_golang v0.4.4 // @grafana/grafana-backend-group + github.com/madflojo/testcerts v1.1.1 // @grafana/alerting-squad-backend github.com/magefile/mage v1.15.0 // @grafana/grafana-release-guild github.com/matryer/is v1.4.0 // @grafana/grafana-as-code github.com/mattn/go-isatty v0.0.20 // @grafana/grafana-backend-group diff --git a/go.sum b/go.sum index a461970f552..347d92d9368 100644 --- a/go.sum +++ b/go.sum @@ -2519,6 +2519,8 @@ github.com/lyft/protoc-gen-star/v2 v2.0.3/go.mod h1:amey7yeodaJhXSbf/TlLvWiqQfLO github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/m3db/prometheus_remote_client_golang v0.4.4 h1:DsAIjVKoCp7Ym35tAOFL1OuMLIdIikAEHeNPHY+yyM8= github.com/m3db/prometheus_remote_client_golang v0.4.4/go.mod h1:wHfVbA3eAK6dQvKjCkHhusWYegCk3bDGkA15zymSHdc= +github.com/madflojo/testcerts v1.1.1 h1:YsSHWV79nMNZK0mJtwXjKoYHjJEbLPFefR8TxmmWupY= +github.com/madflojo/testcerts v1.1.1/go.mod h1:MW8sh39gLnkKh4K0Nc55AyHEDl9l/FBLDUsQhpmkuo0= github.com/magefile/mage v1.11.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg= github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= diff --git a/go.work.sum b/go.work.sum index e49fb05bd46..83acdc45795 100644 --- a/go.work.sum +++ b/go.work.sum @@ -670,6 +670,7 @@ github.com/hamba/avro/v2 v2.17.2/go.mod h1:Q9YK+qxAhtVrNqOhwlZTATLgLA8qxG2vtvkhK github.com/hanwen/go-fuse v1.0.0 h1:GxS9Zrn6c35/BnfiVsZVWmsG803xwE7eVRDvcf/BEVc= github.com/hanwen/go-fuse/v2 v2.1.0 
h1:+32ffteETaLYClUj0a3aHjZ1hOPxxaNEHiZiujuDaek= github.com/hashicorp/consul/sdk v0.15.0 h1:2qK9nDrr4tiJKRoxPGhm6B7xJjLVIQqkjiab2M4aKjU= +github.com/hamba/avro/v2 v2.17.2/go.mod h1:Q9YK+qxAhtVrNqOhwlZTATLgLA8qxG2vtvkhK8fJ7Jo= github.com/hashicorp/go-hclog v0.16.1 h1:IVQwpTGNRRIHafnTs2dQLIk4ENtneRIEEJWOVDqz99o= github.com/hashicorp/go-hclog v0.16.1/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ= github.com/hashicorp/go-syslog v1.0.0 h1:KaodqZuhUoZereWVIYmpUgZysurB1kBLX2j0MwMrUAE= @@ -944,6 +945,9 @@ github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JT github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs= github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= +github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= +github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= +github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/tinylib/msgp v1.1.8/go.mod h1:qkpG+2ldGg4xRFmx+jfTvZPxfGFhi64BcnL9vkCm/Tw= github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM= @@ -1101,6 +1105,7 @@ k8s.io/component-base v0.0.0-20240417101527-62c04b35eff6/go.mod h1:l0ukbPS0lwFxO k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01 h1:pWEwq4Asjm4vjW7vcsmijwBhOr1/shsbSYiWXmNGlks= k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= +k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= k8s.io/klog v1.0.0 
h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= k8s.io/kms v0.29.0/go.mod h1:mB0f9HLxRXeXUfHfn1A7rpwOlzXI1gIWu86z6buNoYA= diff --git a/pkg/services/ngalert/notifier/multiorg_alertmanager.go b/pkg/services/ngalert/notifier/multiorg_alertmanager.go index 2132e7093bd..26ee2f13e12 100644 --- a/pkg/services/ngalert/notifier/multiorg_alertmanager.go +++ b/pkg/services/ngalert/notifier/multiorg_alertmanager.go @@ -169,13 +169,15 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error { // Redis setup. if cfg.UnifiedAlerting.HARedisAddr != "" { redisPeer, err := newRedisPeer(redisConfig{ - addr: cfg.UnifiedAlerting.HARedisAddr, - name: cfg.UnifiedAlerting.HARedisPeerName, - prefix: cfg.UnifiedAlerting.HARedisPrefix, - password: cfg.UnifiedAlerting.HARedisPassword, - username: cfg.UnifiedAlerting.HARedisUsername, - db: cfg.UnifiedAlerting.HARedisDB, - maxConns: cfg.UnifiedAlerting.HARedisMaxConns, + addr: cfg.UnifiedAlerting.HARedisAddr, + name: cfg.UnifiedAlerting.HARedisPeerName, + prefix: cfg.UnifiedAlerting.HARedisPrefix, + password: cfg.UnifiedAlerting.HARedisPassword, + username: cfg.UnifiedAlerting.HARedisUsername, + db: cfg.UnifiedAlerting.HARedisDB, + maxConns: cfg.UnifiedAlerting.HARedisMaxConns, + tlsEnabled: cfg.UnifiedAlerting.HARedisTLSEnabled, + tls: cfg.UnifiedAlerting.HARedisTLSConfig, }, clusterLogger, moa.metrics.Registerer, cfg.UnifiedAlerting.HAPushPullInterval) if err != nil { return fmt.Errorf("unable to initialize redis: %w", err) diff --git a/pkg/services/ngalert/notifier/redis_peer.go b/pkg/services/ngalert/notifier/redis_peer.go index 0918e79c0a4..f934effe045 100644 --- a/pkg/services/ngalert/notifier/redis_peer.go +++ b/pkg/services/ngalert/notifier/redis_peer.go @@ -12,6 +12,7 @@ import ( "github.com/google/uuid" alertingCluster "github.com/grafana/alerting/cluster" alertingClusterPB "github.com/grafana/alerting/cluster/clusterpb" + dstls "github.com/grafana/dskit/crypto/tls" "github.com/prometheus/client_golang/prometheus" 
"github.com/redis/go-redis/v9" @@ -27,6 +28,9 @@ type redisConfig struct { name string prefix string maxConns int + + tlsEnabled bool + tls dstls.ClientConfig } const ( @@ -90,13 +94,26 @@ func newRedisPeer(cfg redisConfig, logger log.Logger, reg prometheus.Registerer, if cfg.maxConns >= 0 { poolSize = cfg.maxConns } - rdb := redis.NewClient(&redis.Options{ + + opts := &redis.Options{ Addr: cfg.addr, Username: cfg.username, Password: cfg.password, DB: cfg.db, PoolSize: poolSize, - }) + } + + if cfg.tlsEnabled { + tlsClientConfig, err := cfg.tls.GetTLSConfig() + if err != nil { + logger.Error("Failed to get TLS config", "err", err) + return nil, err + } else { + opts.TLSConfig = tlsClientConfig + } + } + + rdb := redis.NewClient(opts) cmd := rdb.Ping(context.Background()) if cmd.Err() != nil { logger.Error("Failed to ping redis - redis-based alertmanager clustering may not be available", "err", cmd.Err()) diff --git a/pkg/services/ngalert/notifier/redis_peer_test.go b/pkg/services/ngalert/notifier/redis_peer_test.go new file mode 100644 index 00000000000..a5fe2c4e13c --- /dev/null +++ b/pkg/services/ngalert/notifier/redis_peer_test.go @@ -0,0 +1,89 @@ +package notifier + +import ( + "context" + "crypto/tls" + "crypto/x509" + "os" + "testing" + "time" + + "github.com/alicebob/miniredis/v2" + dstls "github.com/grafana/dskit/crypto/tls" + "github.com/grafana/grafana/pkg/infra/log" + "github.com/madflojo/testcerts" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" +) + +func TestNewRedisPeerWithTLS(t *testing.T) { + // Write client and server certificates/keys to tempDir, both issued by the same CA + certPaths := createX509TestDir(t) + + // Set up tls.Config and start miniredis with server-side TLS + x509Cert, err := tls.LoadX509KeyPair(certPaths.serverCert, certPaths.serverKey) + require.NoError(t, err) + clientCAPool := x509.NewCertPool() + clientCAFile, err := os.ReadFile(certPaths.ca) + require.NoError(t, err) +
clientCAPool.AppendCertsFromPEM(clientCAFile) + + mr, err := miniredis.RunTLS(&tls.Config{ + Certificates: []tls.Certificate{x509Cert}, + ClientCAs: clientCAPool, + }) + require.NoError(t, err) + defer mr.Close() + + // Create redis peer with client-side TLS + redisPeer, err := newRedisPeer(redisConfig{ + addr: mr.Addr(), + tlsEnabled: true, + tls: dstls.ClientConfig{ + CertPath: certPaths.clientCert, + KeyPath: certPaths.clientKey, + CAPath: certPaths.ca, + ServerName: "localhost", + }}, log.NewNopLogger(), prometheus.DefaultRegisterer, time.Second*60) + require.NoError(t, err) + + ping := redisPeer.redis.Ping(context.Background()) + require.NoError(t, ping.Err()) +} + +type certPaths struct { + clientCert string + clientKey string + serverCert string + serverKey string + ca string +} + +func createX509TestDir(t *testing.T) certPaths { + t.Helper() + + tmpDir := t.TempDir() + + ca := testcerts.NewCA() + caCertFile, _, err := ca.ToTempFile(tmpDir) + require.NoError(t, err) + + serverKp, err := ca.NewKeyPair("localhost") + require.NoError(t, err) + + serverCertFile, serverKeyFile, err := serverKp.ToTempFile(tmpDir) + require.NoError(t, err) + + clientKp, err := ca.NewKeyPair() + require.NoError(t, err) + clientCertFile, clientKeyFile, err := clientKp.ToTempFile(tmpDir) + require.NoError(t, err) + + return certPaths{ + clientCert: clientCertFile.Name(), + clientKey: clientKeyFile.Name(), + serverCert: serverCertFile.Name(), + serverKey: serverKeyFile.Name(), + ca: caCertFile.Name(), + } +} diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index a85a8762b6c..bfdc46dc4dc 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -7,6 +7,7 @@ import ( "time" alertingCluster "github.com/grafana/alerting/cluster" + dstls "github.com/grafana/dskit/crypto/tls" "github.com/grafana/grafana-plugin-sdk-go/backend/gtime" "gopkg.in/ini.v1" @@ -79,6 +80,8 @@ type UnifiedAlertingSettings struct 
{ HARedisPassword string HARedisDB int HARedisMaxConns int + HARedisTLSEnabled bool + HARedisTLSConfig dstls.ClientConfig MaxAttempts int64 MinInterval time.Duration EvaluationTimeout time.Duration @@ -234,6 +237,14 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { uaCfg.HAPeers = append(uaCfg.HAPeers, peer) } } + uaCfg.HARedisTLSEnabled = ua.Key("ha_redis_tls_enabled").MustBool(false) + uaCfg.HARedisTLSConfig.CertPath = ua.Key("ha_redis_tls_cert_path").MustString("") + uaCfg.HARedisTLSConfig.KeyPath = ua.Key("ha_redis_tls_key_path").MustString("") + uaCfg.HARedisTLSConfig.CAPath = ua.Key("ha_redis_tls_ca_path").MustString("") + uaCfg.HARedisTLSConfig.ServerName = ua.Key("ha_redis_tls_server_name").MustString("") + uaCfg.HARedisTLSConfig.InsecureSkipVerify = ua.Key("ha_redis_tls_insecure_skip_verify").MustBool(false) + uaCfg.HARedisTLSConfig.CipherSuites = ua.Key("ha_redis_tls_cipher_suites").MustString("") + uaCfg.HARedisTLSConfig.MinVersion = ua.Key("ha_redis_tls_min_version").MustString("") // TODO load from ini file uaCfg.DefaultConfiguration = alertmanagerDefaultConfiguration diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go index 4e7285203f6..4f4a540b401 100644 --- a/pkg/setting/setting_unified_alerting_test.go +++ b/pkg/setting/setting_unified_alerting_test.go @@ -298,3 +298,50 @@ func TestMinInterval(t *testing.T) { }) } } + +func TestHARedisTLSSettings(t *testing.T) { + // Initialize .ini file with new HA Redis TLS Settings + f := ini.Empty() + section, err := f.NewSection("unified_alerting") + require.NoError(t, err) + + const ( + tlsEnabled = true + certPath = "path/to/cert" + keyPath = "path/to/key" + caPath = "path/to/ca" + serverName = "server_name" + insecureSkipVerify = true + cipherSuites = "TLS_AES_128_GCM_SHA256" + minVersion = "VersionTLS13" + ) + _, err = section.NewKey("ha_redis_tls_enabled", strconv.FormatBool(tlsEnabled)) + require.NoError(t, err) + _, err = 
section.NewKey("ha_redis_tls_cert_path", certPath) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_key_path", keyPath) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_ca_path", caPath) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_server_name", serverName) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_insecure_skip_verify", strconv.FormatBool(insecureSkipVerify)) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_cipher_suites", cipherSuites) + require.NoError(t, err) + _, err = section.NewKey("ha_redis_tls_min_version", minVersion) + require.NoError(t, err) + + cfg := NewCfg() + err = cfg.ReadUnifiedAlertingSettings(f) + require.Nil(t, err) + + require.Equal(t, tlsEnabled, cfg.UnifiedAlerting.HARedisTLSEnabled) + require.Equal(t, certPath, cfg.UnifiedAlerting.HARedisTLSConfig.CertPath) + require.Equal(t, keyPath, cfg.UnifiedAlerting.HARedisTLSConfig.KeyPath) + require.Equal(t, caPath, cfg.UnifiedAlerting.HARedisTLSConfig.CAPath) + require.Equal(t, serverName, cfg.UnifiedAlerting.HARedisTLSConfig.ServerName) + require.Equal(t, insecureSkipVerify, cfg.UnifiedAlerting.HARedisTLSConfig.InsecureSkipVerify) + require.Equal(t, cipherSuites, cfg.UnifiedAlerting.HARedisTLSConfig.CipherSuites) + require.Equal(t, minVersion, cfg.UnifiedAlerting.HARedisTLSConfig.MinVersion) +}