Convert raft_entry_commit_latency to seconds & be explicit about unit

This is a follow-up to https://github.com/rabbitmq/ra/pull/160

Had to introduce mf_convert/3 so that the METRICS_REQUIRING_CONVERSIONS
proplist does not clash with the METRICS_RAW proplists that have the same
number of elements: both contain 5-element tuples, so a single
comprehension in mf/3 cannot tell them apart. This is begging to be
refactored, but I know that @dcorbacho is working on
https://github.com/rabbitmq/rabbitmq-prometheus/issues/26.

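Also updated the RabbitMQ-Quorum-Queues-Raft dashboard to query the renamed
rabbitmq_raft_entry_commit_latency_seconds metric and to display the y-axis
in seconds instead of milliseconds.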

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
This commit is contained in:
Gerhard Lazu 2020-01-07 16:12:31 +00:00
parent 5602a9eb4c
commit 89efb964d9
3 changed files with 30 additions and 10 deletions

View File

@ -43,7 +43,7 @@
"gnetId": null, "gnetId": null,
"graphTooltip": 1, "graphTooltip": 1,
"id": null, "id": null,
"iteration": 1575376605605, "iteration": 1578410270904,
"links": [ "links": [
{ {
"icon": "doc", "icon": "doc",
@ -227,7 +227,7 @@
"reverseYBuckets": false, "reverseYBuckets": false,
"targets": [ "targets": [
{ {
"expr": "rabbitmq_raft_entry_commit_latency * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}", "expr": "rabbitmq_raft_entry_commit_latency_seconds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"format": "time_series", "format": "time_series",
"instant": false, "instant": false,
"intervalFactor": 1, "intervalFactor": 1,
@ -250,7 +250,7 @@
"xBucketSize": null, "xBucketSize": null,
"yAxis": { "yAxis": {
"decimals": null, "decimals": null,
"format": "ms", "format": "s",
"logBase": 1, "logBase": 1,
"max": null, "max": null,
"min": "0", "min": "0",

View File

@ -129,7 +129,7 @@
{2, disk_space_available_limit_bytes, gauge, "Free disk space low watermark in bytes", disk_free_limit}, {2, disk_space_available_limit_bytes, gauge, "Free disk space low watermark in bytes", disk_free_limit},
{2, erlang_processes_limit, gauge, "Erlang processes limit", proc_total}, {2, erlang_processes_limit, gauge, "Erlang processes limit", proc_total},
{2, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue}, {2, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue},
{2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval in seconds", net_ticktime} {2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval", net_ticktime}
]}, ]},
{node_persister_metrics, [ {node_persister_metrics, [
@ -155,8 +155,7 @@
{3, raft_log_snapshot_index, gauge, "Raft log snapshot index"}, {3, raft_log_snapshot_index, gauge, "Raft log snapshot index"},
{4, raft_log_last_applied_index, gauge, "Raft log last applied index"}, {4, raft_log_last_applied_index, gauge, "Raft log last applied index"},
{5, raft_log_commit_index, gauge, "Raft log commit index"}, {5, raft_log_commit_index, gauge, "Raft log commit index"},
{6, raft_log_last_written_index, gauge, "Raft log last written index"}, {6, raft_log_last_written_index, gauge, "Raft log last written index"}
{7, raft_entry_commit_latency, gauge, "Time taken for an entry to be committed"}
]}, ]},
{queue_coarse_metrics, [ {queue_coarse_metrics, [
@ -201,11 +200,13 @@
{2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time}, {2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
{2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time}, {2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
{2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time} {2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
]},
{ra_metrics, [
{7, 1000, raft_entry_commit_latency_seconds, gauge, "Time taken for a log entry to be committed"}
]} ]}
]). ]).
-define(METRICS, ?METRICS_RAW ++ ?METRICS_REQUIRING_CONVERSIONS).
-define(TOTALS, [ -define(TOTALS, [
%% ordering differs from metrics above, refer to list comprehension %% ordering differs from metrics above, refer to list comprehension
{connection_created, connections, gauge, "Connections currently open"}, {connection_created, connections, gauge, "Connections currently open"},
@ -227,7 +228,11 @@ collect_mf(_Registry, Callback) ->
[begin [begin
Data = ets:tab2list(Table), Data = ets:tab2list(Table),
mf(Callback, Contents, Data) mf(Callback, Contents, Data)
end || {Table, Contents} <- ?METRICS], end || {Table, Contents} <- ?METRICS_RAW],
[begin
Data = ets:tab2list(Table),
mf_convert(Callback, Contents, Data)
end || {Table, Contents} <- ?METRICS_REQUIRING_CONVERSIONS],
[begin [begin
Size = ets:info(Table, size), Size = ets:info(Table, size),
mf_totals(Callback, Name, Type, Help, Size) mf_totals(Callback, Name, Type, Help, Size)
@ -295,7 +300,21 @@ mf(Callback, Contents, Data) ->
{Type, Fun, Data} {Type, Fun, Data}
) )
) )
end || {Index, Name, Type, Help, Key} <- Contents], end || {Index, Name, Type, Help, Key} <- Contents].
mf_convert(Callback, Contents, Data) ->
[begin
Fun = fun(D) -> element(Index, D) / BaseUnitConversionFactor end,
Callback(
create_mf(
?METRIC_NAME(Name),
Help,
catch_boolean(Type),
?MODULE,
{Type, Fun, Data}
)
)
end || {Index, BaseUnitConversionFactor, Name, Type, Help} <- Contents],
[begin [begin
Fun = fun(D) -> proplists:get_value(Key, element(Index, D)) / BaseUnitConversionFactor end, Fun = fun(D) -> proplists:get_value(Key, element(Index, D)) / BaseUnitConversionFactor end,
Callback( Callback(

View File

@ -198,6 +198,7 @@ metrics_test(Config) ->
%% Checking the first metric value in each ETS table that requires converting %% Checking the first metric value in each ETS table that requires converting
?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])), ?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])), ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds{", [{capture, none}, multiline])),
%% Checking the first TOTALS metric value %% Checking the first TOTALS metric value
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])). ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).