Replace histogram type with gauge for raft_entry_commit_latency_seconds

We want to keep the same metric type regardless whether we aggregate or
don't. If we had used a histogram type, considering the ~12 buckets that
we added, it would have meant 12 extra metrics per queue which would
have resulted in an explosion of metrics. Keeping the gauge type and
aggregating latencies across all members.

re https://github.com/rabbitmq/rabbitmq-prometheus/pull/28

Signed-off-by: Gerhard Lazu <gerhard@lazu.co.uk>
This commit is contained in:
Gerhard Lazu 2020-02-06 17:37:37 +00:00
parent 8aa8d8aa3d
commit 11d676f3e1
6 changed files with 56 additions and 58 deletions

View File

@ -11,7 +11,7 @@ OTP_SHA256 := 92df7d22239b09f7580572305c862da1fb030a97cef7631ba060ac51fa3864cc
define PROJECT_ENV
[
{enable_metric_aggregation, false}
{enable_metrics_aggregation, false}
]
endef

View File

@ -1,18 +1,19 @@
# Prometheus Exporter of Core (Raw, Unaggregated) RabbitMQ Metrics
# Prometheus Exporter of Core RabbitMQ Metrics
## Getting Started
This is a Prometheus exporter of core (raw, unaggregated) RabbitMQ metrics, developed by the RabbitMQ core team.
This is a Prometheus exporter of core RabbitMQ metrics, developed by the RabbitMQ core team.
It is largely a "clean room" design that reuses some prior work from Prometheus exporters done by the community.
## Project Maturity
This plugin is new and relatively immature. It shipped in the RabbitMQ distribution starting with `3.8.0`.
This plugin is new as of RabbitMQ `3.8.0`.
## Documentation
See [Monitoring RabbitMQ with Prometheus and Grafana](https://www.rabbitmq.com/prometheus.html).
## Installation
This plugin is included into RabbitMQ 3.8.x releases. Like all [plugins](https://www.rabbitmq.com/plugins.html), it has to be
@ -22,7 +23,6 @@ To enable it with [rabbitmq-plugins](http://www.rabbitmq.com/man/rabbitmq-plugin
rabbitmq-plugins enable rabbitmq_prometheus
## Usage
See the [documentation guide](https://www.rabbitmq.com/prometheus.html).
@ -30,13 +30,17 @@ See the [documentation guide](https://www.rabbitmq.com/prometheus.html).
Default port used by the plugin is `15692`. In most environments there would be no configuration
necessary.
See the entire list of [metrics](metrics.md) exposed via the default port.
See the entire list of [metrics](metrics.md) exposed via the default port.
## Configuration
This exporter supports the following options via a set of `prometheus.*` configuration keys:
* `prometheus.path` defines a scrape endpoint. Default is `"/metrics"`.
* `prometheus.enable_metrics_aggregation` returns all metrics aggregated (default is `false`).
Nodes with over 50k objects (queues, connections, channels) can take 30 seconds or more to return metrics without this option.
See #26 for more details.
* `prometheus.path` defines a scrape endpoint (default is `"/metrics"`).
* `prometheus.tcp.*` controls HTTP listener settings that match [those used by the RabbitMQ HTTP API](https://www.rabbitmq.com/management.html#configuration)
* `prometheus.ssl.*` controls TLS (HTTPS) listener settings that match [those used by the RabbitMQ HTTP API](https://www.rabbitmq.com/management.html#single-listener-https)
@ -44,6 +48,7 @@ Sample configuration snippet:
``` ini
# these values are defaults
prometheus.enable_metrics_aggregation = false
prometheus.path = /metrics
prometheus.tcp.port = 15692
```

View File

@ -157,14 +157,14 @@
### Raft
| Metric | Description |
| --- | --- |
| rabbitmq_raft_term_total | Current Raft term number |
| rabbitmq_raft_log_snapshot_index | Raft log snapshot index |
| rabbitmq_raft_log_last_applied_index | Raft log last applied index |
| rabbitmq_raft_log_commit_index | Raft log commit index |
| rabbitmq_raft_log_last_written_index | Raft log last written index |
| rabbitmq_raft_entry_commit_latency | Time taken for an entry to be committed |
| Metric | Description |
| --- | --- |
| rabbitmq_raft_term_total | Current Raft term number |
| rabbitmq_raft_log_snapshot_index | Raft log snapshot index |
| rabbitmq_raft_log_last_applied_index | Raft log last applied index |
| rabbitmq_raft_log_commit_index | Raft log commit index |
| rabbitmq_raft_log_last_written_index | Raft log last written index |
| rabbitmq_raft_entry_commit_latency_seconds | Time taken for an entry to be committed |
## Telemetry

View File

@ -4,6 +4,10 @@
%% See https://rabbitmq.com/prometheus.html for details
%% ----------------------------------------------------------------------------
%% Option to enable metrics aggregation
{mapping, "prometheus.enable_metrics_aggregation", "rabbitmq_prometheus.enable_metrics_aggregation",
[{datatype, {enum, [true, false]}}]}.
%% Endpoint path
{mapping, "prometheus.path", "rabbitmq_prometheus.path",
[{datatype, string}]}.

View File

@ -23,8 +23,7 @@
create_mf/5,
gauge_metric/2,
counter_metric/2,
untyped_metric/2,
histogram_metric/4]).
untyped_metric/2]).
-include_lib("prometheus/include/prometheus.hrl").
-include_lib("rabbit_common/include/rabbit.hrl").
@ -57,6 +56,10 @@
% Some metrics require to be converted, mostly those that represent time.
% It is a Prometheus best practice to use specific base units: https://prometheus.io/docs/practices/naming/#base-units
% Extra context: https://github.com/prometheus/docs/pull/1414#issuecomment-522337895
-define(MILLISECOND, 1000).
-define(MICROSECOND, 1000000).
-define(METRICS_RAW, [
{channel_metrics, [
{2, undefined, channel_consumers, gauge, "Consumers on a channel", consumer_count},
@ -135,7 +138,7 @@
{2, undefined, erlang_processes_limit, gauge, "Erlang processes limit", proc_total},
{2, undefined, erlang_scheduler_run_queue, gauge, "Erlang scheduler run queue", run_queue},
{2, undefined, erlang_net_ticktime_seconds, gauge, "Inter-node heartbeat interval", net_ticktime},
{2, 1000, erlang_uptime_seconds, gauge, "Node uptime", uptime}
{2, ?MILLISECOND, erlang_uptime_seconds, gauge, "Node uptime", uptime}
]},
{node_persister_metrics, [
@ -154,11 +157,11 @@
{2, undefined, queue_index_read_ops_total, counter, "Total number of Queue Index read operations", queue_index_read_count},
{2, undefined, queue_index_write_ops_total, counter, "Total number of Queue Index write operations", queue_index_write_count},
{2, undefined, queue_index_journal_write_ops_total, counter, "Total number of Queue Index Journal write operations", queue_index_journal_write_count},
{2, 1000000, io_read_time_seconds_total, counter, "Total I/O read time", io_read_time},
{2, 1000000, io_write_time_seconds_total, counter, "Total I/O write time", io_write_time},
{2, 1000000, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
{2, 1000000, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
{2, 1000000, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
{2, ?MICROSECOND, io_read_time_seconds_total, counter, "Total I/O read time", io_read_time},
{2, ?MICROSECOND, io_write_time_seconds_total, counter, "Total I/O write time", io_write_time},
{2, ?MICROSECOND, io_sync_time_seconds_total, counter, "Total I/O sync time", io_sync_time},
{2, ?MICROSECOND, io_seek_time_seconds_total, counter, "Total I/O seek time", io_seek_time},
{2, ?MICROSECOND, io_open_attempt_time_seconds_total, counter, "Total file open attempts time", io_file_handle_open_attempt_time}
]},
{ra_metrics, [
@ -167,7 +170,7 @@
{4, undefined, raft_log_last_applied_index, gauge, "Raft log last applied index"},
{5, undefined, raft_log_commit_index, gauge, "Raft log commit index"},
{6, undefined, raft_log_last_written_index, gauge, "Raft log last written index"},
{7, undefined,raft_entry_commit_latency_seconds, histogram, "Time taken for a log entry to be committed"}
{7, ?MILLISECOND, raft_entry_commit_latency_seconds, gauge, "Time taken for a log entry to be committed"}
]},
{queue_coarse_metrics, [
@ -206,9 +209,6 @@
{queue_metrics, queues, gauge, "Queues available"}
]).
-define(BUCKETS, [100, 300, 500, 750, 1000, 2500, 5000, 7500, 10000,
30000, 60000, 90000, infinity]).
%%====================================================================
%% Collector API
%%====================================================================
@ -219,7 +219,7 @@ register() ->
deregister_cleanup(_) -> ok.
collect_mf(_Registry, Callback) ->
{ok, Enable} = application:get_env(rabbitmq_prometheus, enable_metric_aggregation),
{ok, Enable} = application:get_env(rabbitmq_prometheus, enable_metrics_aggregation),
[begin
Data = get_data(Table, Enable),
mf(Callback, Contents, Data)
@ -339,8 +339,6 @@ metric(counter, Labels, Value) ->
emit_counter_metric_if_defined(Labels, Value);
metric(gauge, Labels, Value) ->
emit_gauge_metric_if_defined(Labels, Value);
metric(histogram, Labels, Value) ->
emit_histogram_metric(Labels, Value);
metric(untyped, Labels, Value) ->
untyped_metric(Labels, Value);
metric(boolean, Labels, Value0) ->
@ -377,11 +375,6 @@ emit_gauge_metric_if_defined(Labels, Value) ->
gauge_metric(Labels, Value)
end.
emit_histogram_metric(Labels, {Buckets, Count, Sum}) ->
histogram_metric(Labels, maps:to_list(Buckets), Count, Sum);
emit_histogram_metric(Labels, Value) ->
emit_gauge_metric_if_defined(Labels, Value).
get_data(connection_metrics = Table, true) ->
{Table, A1, A2, A3, A4} = ets:foldl(fun({_, Props}, {T, A1, A2, A3, A4}) ->
{T,
@ -447,7 +440,7 @@ get_data(Table, true) when Table == channel_exchange_metrics;
Table == channel_queue_exchange_metrics;
Table == ra_metrics;
Table == channel_process_metrics ->
[ets:foldl(fun({_, V1}, {T, A1}) ->
Result = ets:foldl(fun({_, V1}, {T, A1}) ->
{T, V1 + A1};
({_, V1, _}, {T, A1}) ->
{T, V1 + A1};
@ -458,18 +451,24 @@ get_data(Table, true) when Table == channel_exchange_metrics;
({_, V1, V2, V3, V4}, {T, A1, A2, A3, A4}) ->
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4};
({_, V1, V2, V3, V4, V5, V6}, {T, A1, A2, A3, A4, A5, A6}) ->
%% ra_metrics: latency is a histogram.
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, update_histogram(V6, A6)};
%% ra_metrics: raft_entry_commit_latency_seconds needs to be an average
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, accumulate_count_and_sum(V6, A6)};
({_, V1, V2, V3, V4, V5, V6, V7, _}, {T, A1, A2, A3, A4, A5, A6, A7}) ->
{T, V1 + A1, V2 + A2, V3 + A3, V4 + A4, V5 + A5, V6 + A6, V7 + A7}
end, empty(Table), Table)];
end, empty(Table), Table),
case Table of
%% raft_entry_commit_latency_seconds needs to be an average
ra_metrics ->
{Count, Sum} = element(7, Result),
[setelement(7, Result, Sum / Count)];
_ ->
[Result]
end;
get_data(Table, _) ->
ets:tab2list(Table).
update_histogram(Value, {H, C, S}) ->
NH = maps:update_with(position(?BUCKETS, Value), fun(V) -> V + 1 end, H),
{NH, C + 1, S + Value}.
accumulate_count_and_sum(Value, {Count, Sum}) ->
{Count + 1, Sum + Value}.
empty(T) when T == channel_queue_exchange_metrics; T == channel_process_metrics ->
{T, 0};
@ -478,7 +477,7 @@ empty(T) when T == connection_coarse_metrics ->
empty(T) when T == channel_exchange_metrics; T == queue_coarse_metrics; T == connection_metrics ->
{T, 0, 0, 0, 0};
empty(T) when T == ra_metrics ->
{T, 0, 0, 0, 0, 0, empty_histogram(?BUCKETS)};
{T, 0, 0, 0, 0, 0, {0, 0}};
empty(T) when T == channel_queue_metrics; T == channel_metrics ->
{T, 0, 0, 0, 0, 0, 0, 0};
empty(queue_metrics = T) ->
@ -490,11 +489,3 @@ sum('', B) ->
B;
sum(A, B) ->
A + B.
empty_histogram(Buckets) ->
{maps:from_list([{B, 0} || B <- Buckets]), 0, 0}.
position([Bound | L], Value) when Value > Bound ->
position(L, Value);
position([Bound | _], _) ->
Bound.

View File

@ -63,7 +63,7 @@ init_per_group(config_port, Config0) ->
Config1 = rabbit_ct_helpers:merge_app_env(Config0, PathConfig),
init_per_group(config_port, Config1, [{prometheus_port, 15772}]);
init_per_group(with_aggregation, Config0) ->
PathConfig = {rabbitmq_prometheus, [{enable_metric_aggregation, true}]},
PathConfig = {rabbitmq_prometheus, [{enable_metrics_aggregation, true}]},
Config1 = rabbit_ct_helpers:merge_app_env(Config0, PathConfig),
init_per_group(with_metrics, Config1);
init_per_group(with_metrics, Config0) ->
@ -199,7 +199,6 @@ metrics_test(Config) ->
?assertEqual(match, re:run(Body, "^rabbitmq_process_max_fds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_ops_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_term_total{", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds{", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_messages_ready{", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers{", [{capture, none}, multiline])),
%% Checking the first metric value in each ETS table that requires converting
@ -229,16 +228,15 @@ aggregated_metrics_test(Config) ->
?assertEqual(match, re:run(Body, "^rabbitmq_process_max_fds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_ops_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_term_total ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_count ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_sum ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds_bucket{", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_messages_ready ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_queue_consumers ", [{capture, none}, multiline])),
%% Checking the first metric value in each ETS table that requires converting
?assertEqual(match, re:run(Body, "^rabbitmq_erlang_uptime_seconds ", [{capture, none}, multiline])),
?assertEqual(match, re:run(Body, "^rabbitmq_io_read_time_seconds_total ", [{capture, none}, multiline])),
%% Checking the first TOTALS metric value
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])).
?assertEqual(match, re:run(Body, "^rabbitmq_connections ", [{capture, none}, multiline])),
%% Checking raft_entry_commit_latency_seconds because we are aggregating it
?assertEqual(match, re:run(Body, "^rabbitmq_raft_entry_commit_latency_seconds ", [{capture, none}, multiline])).
build_info_test(Config) ->
{_Headers, Body} = http_get(Config, [], 200),