Convert raft_entry_commit_latency to seconds & be explicit about unit

This is a follow-up to https://github.com/rabbitmq/ra/pull/160

Had to introduce mf_convert/3 so that entries in the
METRICS_REQUIRING_CONVERSIONS proplist do not clash with METRICS_RAW
entries that have the same number of elements: both use 5-element
tuples, so a single function's list comprehension patterns cannot tell
them apart. This is begging to be refactored, but I know that
@dcorbacho is working on https://github.com/rabbitmq/rabbitmq-prometheus/issues/26

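Also modified the RabbitMQ-Quorum-Queues-Raft dashboard to query the
renamed rabbitmq_raft_entry_commit_latency_seconds metric and to
display the Y axis in seconds instead of milliseconds.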

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
This commit is contained in:
Gerhard Lazu 2020-01-07 16:12:31 +00:00
parent 5602a9eb4c
commit 89efb964d9
3 changed files with 30 additions and 10 deletions

View File

@ -43,7 +43,7 @@
"gnetId": null,
"graphTooltip": 1,
"id": null,
"iteration": 1575376605605,
"iteration": 1578410270904,
"links": [
{
"icon": "doc",
@ -227,7 +227,7 @@
"reverseYBuckets": false,
"targets": [
{
"expr": "rabbitmq_raft_entry_commit_latency * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"expr": "rabbitmq_raft_entry_commit_latency_seconds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
"format": "time_series",
"instant": false,
"intervalFactor": 1,
@ -250,7 +250,7 @@
"xBucketSize": null,
"yAxis": {
"decimals": null,
"format": "ms",
"format": "s",
"logBase": 1,
"max": null,
"min": "0",

View File

@ -129,7 +129,7 @@
{2, disk_space_available_limit_bytes, gauge, "Free disk space low watermark in bytes", disk_free_limit},
{2, erlang_processes_limit, gauge, "Erlang processes limit", proc_total},
{2, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue},
{2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval in seconds", net_ticktime}
{2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval", net_ticktime}
]},
{node_persister_metrics, [
@ -155,8 +155,7 @@
{3, raft_log_snapshot_index, gauge, "Raft log snapshot index"},
{4, raft_log_last_applied_index, gauge, "Raft log last applied index"},
{5, raft_log_commit_index, gauge, "Raft log commit index"},
{6, raft_log_last_written_index, gauge, "Raft log last written index"},
{7, raft_entry_commit_latency, gauge, "Time taken for an entry to be committed"}
{6, raft_log_last_written_index, gauge, "Raft log last written index"}
]},
{queue_coarse_metrics, [
@ -201,11 +200,13 @@
{2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
{2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
{2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
]},
{ra_metrics, [
{7, 1000, raft_entry_commit_latency_seconds, gauge, "Time taken for a log entry to be committed"}
]}
]).
-define(METRICS, ?METRICS_RAW ++ ?METRICS_REQUIRING_CONVERSIONS).
-define(TOTALS, [
%% ordering differs from metrics above, refer to list comprehension
{connection_created, connections, gauge, "Connections currently open"},
@ -227,7 +228,11 @@ collect_mf(_Registry, Callback) ->
[begin
Data = ets:tab2list(Table),
mf(Callback, Contents, Data)
end || {Table, Contents} <- ?METRICS],
end || {Table, Contents} <- ?METRICS_RAW],
[begin
Data = ets:tab2list(Table),
mf_convert(Callback, Contents, Data)
end || {Table, Contents} <- ?METRICS_REQUIRING_CONVERSIONS],
[begin
Size = ets:info(Table, size),
mf_totals(Callback, Name, Type, Help, Size)
@ -295,7 +300,21 @@ mf(Callback, Contents, Data) ->
{Type, Fun, Data}
)
)
end || {Index, Name, Type, Help, Key} <- Contents],
end || {Index, Name, Type, Help, Key} <- Contents].
mf_convert(Callback, Contents, Data) ->
[begin
Fun = fun(D) -> element(Index, D) / BaseUnitConversionFactor end,
Callback(
create_mf(
?METRIC_NAME(Name),
Help,
catch_boolean(Type),
?MODULE,
{Type, Fun, Data}
)
)
end || {Index, BaseUnitConversionFactor, Name, Type, Help} <- Contents],
[begin
Fun = fun(D) -> proplists:get_value(Key, element(Index, D)) / BaseUnitConversionFactor end,
Callback(

View File

@ -198,6 +198,7 @@ metrics_test(Config) ->
%% Checking the first metric value in each ETS table that requires converting
?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds{", [{capture, none}, multiline])),
%% Checking the first TOTALS metric value
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).