Convert raft_entry_commit_latency to seconds & be explicit about unit

This is a follow-up to https://github.com/rabbitmq/ra/pull/160. Had to introduce mf_convert/3 so that the METRICS_REQUIRING_CONVERSIONS proplist does not clash with the METRICS_RAW proplists that have the same number of elements. This is begging to be refactored, but I know that @dcorbacho is working on https://github.com/rabbitmq/rabbitmq-prometheus/issues/26. Also modified the RabbitMQ-Quorum-Queues-Raft dashboard. Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
2020-01-07 16:12:31 +00:00 · 2020-01-07 16:12:31 +00:00 · 89efb964d9
parent 5602a9eb4c
commit 89efb964d9
3 changed files with 30 additions and 10 deletions
--- a/deps/rabbitmq_prometheus/docker/grafana/dashboards/RabbitMQ-Quorum-Queues-Raft.json
+++ b/deps/rabbitmq_prometheus/docker/grafana/dashboards/RabbitMQ-Quorum-Queues-Raft.json
@ -43,7 +43,7 @@
  "gnetId": null,
  "graphTooltip": 1,
  "id": null,
-  "iteration": 1575376605605,
+  "iteration": 1578410270904,
  "links": [
    {
      "icon": "doc",
@ -227,7 +227,7 @@
      "reverseYBuckets": false,
      "targets": [
        {
-          "expr": "rabbitmq_raft_entry_commit_latency * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
+          "expr": "rabbitmq_raft_entry_commit_latency_seconds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$rabbitmq_cluster\"}",
          "format": "time_series",
          "instant": false,
          "intervalFactor": 1,
@ -250,7 +250,7 @@
      "xBucketSize": null,
      "yAxis": {
        "decimals": null,
-        "format": "ms",
+        "format": "s",
        "logBase": 1,
        "max": null,
        "min": "0",
--- a/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
+++ b/deps/rabbitmq_prometheus/src/collectors/prometheus_rabbitmq_core_metrics_collector.erl
@ -129,7 +129,7 @@
        {2, disk_space_available_limit_bytes, gauge, "Free disk space low watermark in bytes", disk_free_limit},
        {2, erlang_processes_limit, gauge, "Erlang processes limit", proc_total},
        {2, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue},
-        {2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval in seconds", net_ticktime}
+        {2, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval", net_ticktime}
    ]},
    {node_persister_metrics, [
@ -155,8 +155,7 @@
        {3, raft_log_snapshot_index, gauge, "Raft log snapshot index"},
        {4, raft_log_last_applied_index, gauge, "Raft log last applied index"},
        {5, raft_log_commit_index, gauge, "Raft log commit index"},
-        {6, raft_log_last_written_index, gauge, "Raft log last written index"},
+        {6, raft_log_last_written_index, gauge, "Raft log last written index"}
        {7, raft_entry_commit_latency, gauge, "Time taken for an entry to be committed"}
    ]},
    {queue_coarse_metrics, [
@ -201,11 +200,13 @@
        {2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
        {2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
        {2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
    ]},
    {ra_metrics, [
        {7, 1000, raft_entry_commit_latency_seconds, gauge, "Time taken for a log entry to be committed"}
    ]}
 ]).
 -define(METRICS, ?METRICS_RAW ++ ?METRICS_REQUIRING_CONVERSIONS).
 -define(TOTALS, [
    %% ordering differs from metrics above, refer to list comprehension
    {connection_created, connections, gauge, "Connections currently open"},
@ -227,7 +228,11 @@ collect_mf(_Registry, Callback) ->
    [begin
         Data = ets:tab2list(Table),
         mf(Callback, Contents, Data)
-     end || {Table, Contents} <- ?METRICS],
+     end || {Table, Contents} <- ?METRICS_RAW],
    [begin
         Data = ets:tab2list(Table),
         mf_convert(Callback, Contents, Data)
     end || {Table, Contents} <- ?METRICS_REQUIRING_CONVERSIONS],
    [begin
         Size = ets:info(Table, size),
         mf_totals(Callback, Name, Type, Help, Size)
@ -295,7 +300,21 @@ mf(Callback, Contents, Data) ->
                {Type, Fun, Data}
            )
        )
-    end || {Index, Name, Type, Help, Key} <- Contents],
+    end || {Index, Name, Type, Help, Key} <- Contents].
 mf_convert(Callback, Contents, Data) ->
    [begin
        Fun = fun(D) -> element(Index, D) / BaseUnitConversionFactor end,
        Callback(
            create_mf(
                ?METRIC_NAME(Name),
                Help,
                catch_boolean(Type),
                ?MODULE,
                {Type, Fun, Data}
            )
        )
    end || {Index, BaseUnitConversionFactor, Name, Type, Help} <- Contents],
    [begin
        Fun = fun(D) -> proplists:get_value(Key, element(Index, D)) / BaseUnitConversionFactor end,
        Callback(
--- a/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
+++ b/deps/rabbitmq_prometheus/test/rabbit_prometheus_http_SUITE.erl
@ -198,6 +198,7 @@ metrics_test(Config) ->
    %% Checking the first metric value in each ETS table that requires converting
    ?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
    ?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
    ?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds{", [{capture, none}, multiline])),
    %% Checking the first TOTALS metric value
    ?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).