Visualise Ra entry commit latency & members with many log entries

Also made a few improvements to all the other panels; the dashboard now feels close to an MVP.

[#166819045]
This commit is contained in:
Gerhard Lazu 2019-06-24 18:28:01 +01:00
parent 0a5b355ee3
commit 3cdf507c63
5 changed files with 358 additions and 69 deletions

View File

@ -30,6 +30,7 @@ services:
RABBITMQ_ERLANG_COOKIE: rabbitmq-qq RABBITMQ_ERLANG_COOKIE: rabbitmq-qq
volumes: volumes:
- ./rabbitmq-qq.conf:/etc/rabbitmq/rabbitmq.conf:ro - ./rabbitmq-qq.conf:/etc/rabbitmq/rabbitmq.conf:ro
- ./rabbitmq-qq-env.conf:/etc/rabbitmq/rabbitmq-env.conf:ro
- ./rabbitmq-qq-definitions.json:/etc/rabbitmq/rabbitmq-definitions.json:ro - ./rabbitmq-qq-definitions.json:/etc/rabbitmq/rabbitmq-definitions.json:ro
rmq1-qq: rmq1-qq:
<< : *rabbitmq << : *rabbitmq
@ -50,7 +51,7 @@ services:
networks: networks:
- "rabbitmq-prometheus" - "rabbitmq-prometheus"
environment: environment:
URIS: "amqp://guest:guest@rmq0-qq:5672/%2f,amqp://guest:guest@rmq1-qq:5672/%2f,amqp://guest:guest@rmq1-qq:5672/%2f" URIS: "amqp://guest:guest@rmq0-qq:5672/%2f,amqp://guest:guest@rmq1-qq:5672/%2f,amqp://guest:guest@rmq2-qq:5672/%2f"
QUEUE_PATTERN: "qq%d" QUEUE_PATTERN: "qq%d"
QUEUE_PATTERN_FROM: 1 QUEUE_PATTERN_FROM: 1
QUEUE_PATTERN_TO: 10 QUEUE_PATTERN_TO: 10
@ -59,6 +60,6 @@ services:
QUEUE_ARGS: x-queue-type=quorum,x-max-length=1000 QUEUE_ARGS: x-queue-type=quorum,x-max-length=1000
FLAG: persistent FLAG: persistent
AUTO_DELETE: "false" AUTO_DELETE: "false"
RATE: 10 RATE: 200
AUTOACK: "false" AUTOACK: "false"
SERVERS_STARTUP_TIMEOUT: &startup_timeout 30 SERVERS_STARTUP_TIMEOUT: &startup_timeout 30

View File

@ -16,60 +16,97 @@
"editable": true, "editable": true,
"gnetId": null, "gnetId": null,
"graphTooltip": 1, "graphTooltip": 1,
"id": 3, "iteration": 1561396911983,
"iteration": 1561057199575,
"links": [], "links": [],
"panels": [ "panels": [
{ {
"aliasColors": {}, "aliasColors": {},
"bars": true, "bars": false,
"cacheTimeout": null, "cacheTimeout": null,
"dashLength": 10, "dashLength": 10,
"dashes": false, "dashes": false,
"description": "", "description": "Number of operations committed by a majority of nodes in the Raft cluster.",
"fill": 10, "fill": 0,
"gridPos": { "gridPos": {
"h": 11, "h": 9,
"w": 24, "w": 12,
"x": 0, "x": 0,
"y": 0 "y": 0
}, },
"id": 62, "id": 64,
"legend": { "legend": {
"alignAsTable": true, "alignAsTable": true,
"avg": false, "avg": false,
"current": true, "current": true,
"hideEmpty": false,
"hideZero": true,
"max": true, "max": true,
"min": false, "min": false,
"rightSide": false,
"show": true, "show": true,
"sort": "current", "sort": "total",
"sortDesc": true, "sortDesc": true,
"total": true, "total": true,
"values": true "values": true
}, },
"lines": false, "lines": true,
"linewidth": 0, "linewidth": 1,
"links": [], "links": [],
"nullPointMode": "null as zero", "nullPointMode": "null",
"options": {}, "options": {},
"percentage": false, "percentage": false,
"pointradius": 2, "pointradius": 2,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
"seriesOverrides": [], "seriesOverrides": [
{
"alias": "/^rabbit@\\w+0/",
"color": "#56A64B"
},
{
"alias": "/^rabbit@\\w+1/",
"color": "#F2CC0C"
},
{
"alias": "/^rabbit@\\w+2/",
"color": "#3274D9"
},
{
"alias": "/^rabbit@\\w+3/",
"color": "#A352CC"
},
{
"alias": "/^rabbit@\\w+4/",
"color": "#FF780A"
},
{
"alias": "/^rabbit@\\w+5/",
"color": "#96D98D"
},
{
"alias": "/^rabbit@\\w+6/",
"color": "#FFEE52"
},
{
"alias": "/^rabbit@\\w+7/",
"color": "#8AB8FF"
},
{
"alias": "/^rabbit@\\w+8/",
"color": "#CA95E5"
},
{
"alias": "/^rabbit@\\w+9/",
"color": "#FFB357"
}
],
"spaceLength": 10, "spaceLength": 10,
"stack": true, "stack": false,
"steppedLine": false, "steppedLine": false,
"targets": [ "targets": [
{ {
"expr": "sum(rabbitmq_raft_log_last_written_index{cluster=\"$cluster\"} - rabbitmq_raft_log_commit_index{cluster=\"$cluster\"}) by(queue)", "expr": "sum(rate(rabbitmq_raft_log_commit_index{cluster=\"$cluster\", node=~\"$node\"}[$__interval])) by(node)",
"format": "time_series", "format": "time_series",
"instant": false, "instant": false,
"intervalFactor": 1, "intervalFactor": 1,
"legendFormat": "{{queue}}", "legendFormat": "{{node}}",
"refId": "A" "refId": "A"
} }
], ],
@ -77,10 +114,10 @@
"timeFrom": null, "timeFrom": null,
"timeRegions": [], "timeRegions": [],
"timeShift": null, "timeShift": null,
"title": "Uncommitted entries", "title": "Entries committed / s",
"tooltip": { "tooltip": {
"shared": true, "shared": true,
"sort": 1, "sort": 2,
"value_type": "individual" "value_type": "individual"
}, },
"type": "graph", "type": "graph",
@ -95,7 +132,7 @@
{ {
"decimals": null, "decimals": null,
"format": "short", "format": "short",
"label": "", "label": null,
"logBase": 1, "logBase": 1,
"max": null, "max": null,
"min": null, "min": null,
@ -122,33 +159,33 @@
"cardRound": null "cardRound": null
}, },
"color": { "color": {
"cardColor": "#37872D", "cardColor": "rgb(255, 255, 255)",
"colorScale": "sqrt", "colorScale": "sqrt",
"colorScheme": "interpolateRdYlGn", "colorScheme": "interpolateRdYlGn",
"exponent": 0.3, "exponent": 0.5,
"mode": "opacity" "mode": "opacity"
}, },
"dataFormat": "timeseries", "dataFormat": "timeseries",
"description": "", "description": "Time taken for an entry to be committed",
"gridPos": { "gridPos": {
"h": 7, "h": 9,
"w": 12, "w": 12,
"x": 0, "x": 12,
"y": 11 "y": 0
}, },
"heatmap": {}, "heatmap": {},
"hideZeroBuckets": false, "hideZeroBuckets": false,
"highlightCards": true, "highlightCards": true,
"id": 64, "id": 65,
"legend": { "legend": {
"show": false "show": true
}, },
"links": [], "links": [],
"options": {}, "options": {},
"reverseYBuckets": false, "reverseYBuckets": false,
"targets": [ "targets": [
{ {
"expr": "rate(rabbitmq_raft_log_commit_index{cluster=\"$cluster\"}[$__interval])", "expr": "max(rabbitmq_raft_entry_commit_latency{cluster=\"$cluster\", node=~\"$node\"}) by(queue, node)",
"format": "time_series", "format": "time_series",
"instant": false, "instant": false,
"intervalFactor": 1, "intervalFactor": 1,
@ -158,7 +195,7 @@
], ],
"timeFrom": null, "timeFrom": null,
"timeShift": null, "timeShift": null,
"title": "Entries committed / s", "title": "Entry commit latency",
"tooltip": { "tooltip": {
"show": true, "show": true,
"showHistogram": true "showHistogram": true
@ -171,30 +208,166 @@
"xBucketSize": null, "xBucketSize": null,
"yAxis": { "yAxis": {
"decimals": null, "decimals": null,
"format": "short", "format": "ms",
"logBase": 1, "logBase": 1,
"max": null, "max": null,
"min": null, "min": null,
"show": true, "show": true,
"splitFactor": null "splitFactor": null
}, },
"yBucketBound": "auto", "yBucketBound": "lower",
"yBucketNumber": null, "yBucketNumber": null,
"yBucketSize": null "yBucketSize": null
}, },
{ {
"aliasColors": {}, "aliasColors": {},
"bars": true, "bars": false,
"cacheTimeout": null, "cacheTimeout": null,
"dashLength": 10, "dashLength": 10,
"dashes": false, "dashes": false,
"description": "", "description": "",
"fill": 10, "fill": 0,
"gridPos": { "gridPos": {
"h": 7, "h": 9,
"w": 12,
"x": 0,
"y": 9
},
"id": 62,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": true,
"max": true,
"min": false,
"rightSide": false,
"show": true,
"sort": "total",
"sortDesc": true,
"total": true,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/^rabbit@\\w+0/",
"color": "#56A64B"
},
{
"alias": "/^rabbit@\\w+1/",
"color": "#F2CC0C"
},
{
"alias": "/^rabbit@\\w+2/",
"color": "#3274D9"
},
{
"alias": "/^rabbit@\\w+3/",
"color": "#A352CC"
},
{
"alias": "/^rabbit@\\w+4/",
"color": "#FF780A"
},
{
"alias": "/^rabbit@\\w+5/",
"color": "#96D98D"
},
{
"alias": "/^rabbit@\\w+6/",
"color": "#FFEE52"
},
{
"alias": "/^rabbit@\\w+7/",
"color": "#8AB8FF"
},
{
"alias": "/^rabbit@\\w+8/",
"color": "#CA95E5"
},
{
"alias": "/^rabbit@\\w+9/",
"color": "#FFB357"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rabbitmq_raft_log_last_written_index{cluster=\"$cluster\", node=~\"$node\"} - rabbitmq_raft_log_commit_index{cluster=\"$cluster\", node=~\"$node\"}) by(node)",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
"legendFormat": "{{node}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Uncommitted entries",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"cacheTimeout": null,
"dashLength": 10,
"dashes": false,
"description": "",
"fill": 0,
"gridPos": {
"h": 9,
"w": 12, "w": 12,
"x": 12, "x": 12,
"y": 11 "y": 9
}, },
"id": 63, "id": 63,
"legend": { "legend": {
@ -204,30 +377,73 @@
"max": true, "max": true,
"min": false, "min": false,
"rightSide": false, "rightSide": false,
"show": false, "show": true,
"total": false, "sort": "total",
"sortDesc": true,
"total": true,
"values": true "values": true
}, },
"lines": false, "lines": true,
"linewidth": 1, "linewidth": 1,
"links": [], "links": [],
"nullPointMode": "null as zero", "nullPointMode": "null",
"options": {}, "options": {},
"percentage": false, "percentage": false,
"pointradius": 2, "pointradius": 2,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
"seriesOverrides": [], "seriesOverrides": [
{
"alias": "/^rabbit@\\w+0/",
"color": "#56A64B"
},
{
"alias": "/^rabbit@\\w+1/",
"color": "#F2CC0C"
},
{
"alias": "/^rabbit@\\w+2/",
"color": "#3274D9"
},
{
"alias": "/^rabbit@\\w+3/",
"color": "#A352CC"
},
{
"alias": "/^rabbit@\\w+4/",
"color": "#FF780A"
},
{
"alias": "/^rabbit@\\w+5/",
"color": "#96D98D"
},
{
"alias": "/^rabbit@\\w+6/",
"color": "#FFEE52"
},
{
"alias": "/^rabbit@\\w+7/",
"color": "#8AB8FF"
},
{
"alias": "/^rabbit@\\w+8/",
"color": "#CA95E5"
},
{
"alias": "/^rabbit@\\w+9/",
"color": "#FFB357"
}
],
"spaceLength": 10, "spaceLength": 10,
"stack": true, "stack": false,
"steppedLine": false, "steppedLine": false,
"targets": [ "targets": [
{ {
"expr": "sum(rate(rabbitmq_raft_term[30s])) by(queue)", "expr": "sum(rate(rabbitmq_raft_term{cluster=\"$cluster\", node=~\"$node\"}[30s])) by(node)",
"format": "time_series", "format": "time_series",
"instant": false, "instant": false,
"intervalFactor": 1, "intervalFactor": 1,
"legendFormat": "{{queue}}", "legendFormat": "{{node}}",
"refId": "A" "refId": "A"
} }
], ],
@ -235,10 +451,10 @@
"timeFrom": null, "timeFrom": null,
"timeRegions": [], "timeRegions": [],
"timeShift": null, "timeShift": null,
"title": "Number of elections / s", "title": "Leader elections / s",
"tooltip": { "tooltip": {
"shared": true, "shared": true,
"sort": 0, "sort": 2,
"value_type": "individual" "value_type": "individual"
}, },
"type": "graph", "type": "graph",
@ -275,14 +491,14 @@
}, },
{ {
"aliasColors": {}, "aliasColors": {},
"bars": true, "bars": false,
"cacheTimeout": null, "cacheTimeout": null,
"dashLength": 10, "dashLength": 10,
"dashes": false, "dashes": false,
"description": "", "description": "",
"fill": 0, "fill": 0,
"gridPos": { "gridPos": {
"h": 9, "h": 13,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 18 "y": 18
@ -293,13 +509,15 @@
"avg": false, "avg": false,
"current": true, "current": true,
"max": true, "max": true,
"min": true, "min": false,
"rightSide": false, "rightSide": false,
"show": false, "show": true,
"total": false, "sort": "total",
"sortDesc": true,
"total": true,
"values": true "values": true
}, },
"lines": false, "lines": true,
"linewidth": 1, "linewidth": 1,
"links": [], "links": [],
"nullPointMode": "null as zero", "nullPointMode": "null as zero",
@ -308,17 +526,58 @@
"pointradius": 2, "pointradius": 2,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
"seriesOverrides": [], "seriesOverrides": [
{
"alias": "/^rabbit@\\w+0/",
"color": "#56A64B"
},
{
"alias": "/^rabbit@\\w+1/",
"color": "#F2CC0C"
},
{
"alias": "/^rabbit@\\w+2/",
"color": "#3274D9"
},
{
"alias": "/^rabbit@\\w+3/",
"color": "#A352CC"
},
{
"alias": "/^rabbit@\\w+4/",
"color": "#FF780A"
},
{
"alias": "/^rabbit@\\w+5/",
"color": "#96D98D"
},
{
"alias": "/^rabbit@\\w+6/",
"color": "#FFEE52"
},
{
"alias": "/^rabbit@\\w+7/",
"color": "#8AB8FF"
},
{
"alias": "/^rabbit@\\w+8/",
"color": "#CA95E5"
},
{
"alias": "/^rabbit@\\w+9/",
"color": "#FFB357"
}
],
"spaceLength": 10, "spaceLength": 10,
"stack": false, "stack": false,
"steppedLine": false, "steppedLine": false,
"targets": [ "targets": [
{ {
"expr": "", "expr": "sum(rabbitmq_raft_log_last_written_index{cluster=\"$cluster\", node=~\"$node\"} -rabbitmq_raft_log_snapshot_index{cluster=\"$cluster\", node=~\"$node\"}) by(queue, node) > 5000",
"format": "time_series", "format": "time_series",
"instant": false, "instant": false,
"intervalFactor": 1, "intervalFactor": 1,
"legendFormat": "", "legendFormat": "{{node}} {{queue}}",
"refId": "A" "refId": "A"
} }
], ],
@ -326,10 +585,10 @@
"timeFrom": null, "timeFrom": null,
"timeRegions": [], "timeRegions": [],
"timeShift": null, "timeShift": null,
"title": "Number of entries in the log", "title": "Raft member with >5k entries in the log",
"tooltip": { "tooltip": {
"shared": true, "shared": true,
"sort": 0, "sort": 2,
"value_type": "individual" "value_type": "individual"
}, },
"type": "graph", "type": "graph",
@ -347,7 +606,7 @@
"label": "", "label": "",
"logBase": 1, "logBase": 1,
"max": null, "max": null,
"min": "0", "min": null,
"show": true "show": true
}, },
{ {
@ -378,19 +637,18 @@
{ {
"allValue": null, "allValue": null,
"current": { "current": {
"selected": true,
"text": "rabbitmq-qq", "text": "rabbitmq-qq",
"value": "rabbitmq-qq" "value": "rabbitmq-qq"
}, },
"datasource": "prometheus", "datasource": "prometheus",
"definition": "label_values(rabbitmq_memory_used_bytes,cluster)", "definition": "label_values(rabbitmq_raft_term,cluster)",
"hide": 0, "hide": 0,
"includeAll": false, "includeAll": false,
"label": "Cluster", "label": "Cluster",
"multi": false, "multi": false,
"name": "cluster", "name": "cluster",
"options": [], "options": [],
"query": "label_values(rabbitmq_memory_used_bytes,cluster)", "query": "label_values(rabbitmq_raft_term,cluster)",
"refresh": 1, "refresh": 1,
"regex": "", "regex": "",
"skipUrlSync": false, "skipUrlSync": false,
@ -400,11 +658,36 @@
"tagsQuery": "", "tagsQuery": "",
"type": "query", "type": "query",
"useTags": false "useTags": false
},
{
"allValue": null,
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "prometheus",
"definition": "label_values(rabbitmq_raft_term,node)",
"hide": 0,
"includeAll": true,
"label": "Node",
"multi": false,
"name": "node",
"options": [],
"query": "label_values(rabbitmq_raft_term,node)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
} }
] ]
}, },
"time": { "time": {
"from": "now-3h", "from": "now-15m",
"to": "now" "to": "now"
}, },
"timepicker": { "timepicker": {
@ -432,5 +715,5 @@
"timezone": "", "timezone": "",
"title": "RabbitMQ-Raft", "title": "RabbitMQ-Raft",
"uid": "f1Mee9nZz", "uid": "f1Mee9nZz",
"version": 18 "version": 30
} }

View File

@ -0,0 +1,2 @@
export RA="-ra wal_max_size_bytes 536870912"
export SERVER_START_ARGS="$RA"

View File

@ -4,7 +4,9 @@ listeners.tcp.default = 5672
management.listener.port = 15672 management.listener.port = 15672
management.listener.ssl = false management.listener.ssl = false
vm_memory_high_watermark.absolute = 256MiB # Raft WAL defaults to 512MB
# We want the node to have more memory available than the 512MB WAL, ideally 3x (1536MB)
vm_memory_high_watermark.absolute = 1536MB
cluster_formation.peer_discovery_backend = rabbit_peer_discovery_classic_config cluster_formation.peer_discovery_backend = rabbit_peer_discovery_classic_config
cluster_formation.classic_config.nodes.1 = rabbit@rmq0-qq cluster_formation.classic_config.nodes.1 = rabbit@rmq0-qq

View File

@ -152,7 +152,8 @@
{3, raft_log_snapshot_index, counter, "Raft log snapshot index"}, {3, raft_log_snapshot_index, counter, "Raft log snapshot index"},
{4, raft_log_last_applied_index, counter, "Raft log last applied index"}, {4, raft_log_last_applied_index, counter, "Raft log last applied index"},
{5, raft_log_commit_index, counter, "Raft log commit index"}, {5, raft_log_commit_index, counter, "Raft log commit index"},
{6, raft_log_last_written_index, counter, "Raft log last written index"} {6, raft_log_last_written_index, counter, "Raft log last written index"},
{7, raft_entry_commit_latency, gauge, "Time taken for an entry to be committed"}
]}, ]},
{queue_coarse_metrics, [ {queue_coarse_metrics, [