diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb index e4d47d53815..0d874f5bebc 100644 --- a/config/initializers/7_prometheus_metrics.rb +++ b/config/initializers/7_prometheus_metrics.rb @@ -4,7 +4,7 @@ PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRI require Rails.root.join('metrics_server', 'metrics_server') if PUMA_EXTERNAL_METRICS_SERVER # Keep separate directories for separate processes -def prometheus_default_multiproc_dir +def metrics_temp_dir return unless Rails.env.development? || Rails.env.test? if Gitlab::Runtime.sidekiq? @@ -16,20 +16,22 @@ def prometheus_default_multiproc_dir end end +def prometheus_metrics_dir + ENV['prometheus_multiproc_dir'] || metrics_temp_dir +end + def puma_metrics_server_process? Prometheus::PidProvider.worker_id == 'puma_master' end -def sidekiq_metrics_server_process? - Gitlab::Runtime.sidekiq? && (!ENV['SIDEKIQ_WORKER_ID'] || ENV['SIDEKIQ_WORKER_ID'] == '0') -end - -if puma_metrics_server_process? || sidekiq_metrics_server_process? +if puma_metrics_server_process? # The following is necessary to ensure stale Prometheus metrics don't accumulate over time. - # It needs to be done as early as here to ensure metrics files aren't deleted. - # After we hit our app in `warmup`, first metrics and corresponding files already being created, - # for example in `lib/gitlab/metrics/requests_rack_middleware.rb`. - Prometheus::CleanupMultiprocDirService.new.execute + # It needs to be done as early as possible to ensure new metrics aren't being deleted. + # + # Note that this should not happen for Sidekiq. Since Sidekiq workers are spawned from the + # sidekiq-cluster script, we perform this cleanup in `sidekiq_cluster/cli.rb` instead, + # since it must happen prior to any worker processes or the metrics server starting up. + Prometheus::CleanupMultiprocDirService.new(prometheus_metrics_dir).execute ::Prometheus::Client.reinitialize_on_pid_change(force: true) end @@ -37,7 +39,7 @@ end ::Prometheus::Client.configure do |config| config.logger = Gitlab::AppLogger - config.multiprocess_files_dir = ENV['prometheus_multiproc_dir'] || prometheus_default_multiproc_dir + config.multiprocess_files_dir = prometheus_metrics_dir config.pid_provider = ::Prometheus::PidProvider.method(:worker_id) end diff --git a/lib/prometheus/cleanup_multiproc_dir_service.rb b/lib/prometheus/cleanup_multiproc_dir_service.rb index 6418b4de166..b309247fa73 100644 --- a/lib/prometheus/cleanup_multiproc_dir_service.rb +++ b/lib/prometheus/cleanup_multiproc_dir_service.rb @@ -2,22 +2,17 @@ module Prometheus class CleanupMultiprocDirService - include Gitlab::Utils::StrongMemoize + def initialize(metrics_dir) + @metrics_dir = metrics_dir + end def execute - FileUtils.rm_rf(old_metrics) if old_metrics - end + return if @metrics_dir.blank? - private + files_to_delete = Dir[File.join(@metrics_dir, '*.db')] + return if files_to_delete.blank? - def old_metrics - strong_memoize(:old_metrics) do - Dir[File.join(multiprocess_files_dir, '*.db')] if multiprocess_files_dir - end - end - - def multiprocess_files_dir - ::Prometheus::Client.configuration.multiprocess_files_dir + FileUtils.rm_rf(files_to_delete) end end end diff --git a/metrics_server/metrics_server.rb b/metrics_server/metrics_server.rb index 2aadc63d65d..695f3a65930 100644 --- a/metrics_server/metrics_server.rb +++ b/metrics_server/metrics_server.rb @@ -84,7 +84,7 @@ class MetricsServer # rubocop:disable Gitlab/NamespacedClass end FileUtils.mkdir_p(@metrics_dir, mode: 0700) - ::Prometheus::CleanupMultiprocDirService.new.execute if @wipe_metrics_dir + ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute if @wipe_metrics_dir # We need to `warmup: true` since otherwise the sampler and exporter threads enter # a race where not all Prometheus db files will be visible to the exporter, resulting diff --git a/sidekiq_cluster/cli.rb b/sidekiq_cluster/cli.rb index e04f5ab1d42..59f39003429 100644 --- a/sidekiq_cluster/cli.rb +++ b/sidekiq_cluster/cli.rb @@ -118,6 +118,11 @@ module Gitlab return if @dryrun + # Make sure we reset the metrics directory prior to: + # - starting a metrics server process + # - starting new workers + ::Prometheus::CleanupMultiprocDirService.new(@metrics_dir).execute + ProcessManagement.write_pid(@pid) if @pid supervisor = SidekiqProcessSupervisor.instance( @@ -137,7 +142,7 @@ module Gitlab # and the metrics server died, restart it. if supervisor.alive && dead_pids.include?(metrics_server_pid) @logger.info('Sidekiq metrics server terminated, restarting...') - metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false) + metrics_server_pid = restart_metrics_server all_pids = worker_pids + Array(metrics_server_pid) else # If a worker process died we'll just terminate the whole cluster. @@ -154,15 +159,14 @@ module Gitlab def start_metrics_server return unless metrics_server_enabled? - restart_metrics_server(wipe_metrics_dir: true) + restart_metrics_server end - def restart_metrics_server(wipe_metrics_dir: false) + def restart_metrics_server @logger.info("Starting metrics server on port #{sidekiq_exporter_port}") MetricsServer.fork( 'sidekiq', metrics_dir: @metrics_dir, - wipe_metrics_dir: wipe_metrics_dir, reset_signals: TERMINATE_SIGNALS + FORWARD_SIGNALS ) end diff --git a/spec/commands/sidekiq_cluster/cli_spec.rb b/spec/commands/sidekiq_cluster/cli_spec.rb index 2cb3f67b03d..e2fc907fd05 100644 --- a/spec/commands/sidekiq_cluster/cli_spec.rb +++ b/spec/commands/sidekiq_cluster/cli_spec.rb @@ -41,6 +41,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo end let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) } + let(:metrics_cleanup_service) { instance_double(Prometheus::CleanupMultiprocDirService, execute: nil) } before do stub_env('RAILS_ENV', 'test') @@ -54,6 +55,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo allow(Gitlab::ProcessManagement).to receive(:write_pid) allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor) allow(supervisor).to receive(:supervise) + + allow(Prometheus::CleanupMultiprocDirService).to receive(:new).and_return(metrics_cleanup_service) end after do @@ -300,6 +303,12 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo allow(Gitlab::SidekiqCluster).to receive(:start).and_return([]) end + it 'wipes the metrics directory' do + expect(metrics_cleanup_service).to receive(:execute) + + cli.run(%w(foo)) + end + context 'when there are no sidekiq_health_checks settings set' do let(:sidekiq_exporter_enabled) { true } @@ -379,7 +388,7 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo with_them do specify do if start_metrics_server - expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, wipe_metrics_dir: true, reset_signals: trapped_signals) + expect(MetricsServer).to receive(:fork).with('sidekiq', metrics_dir: metrics_dir, reset_signals: trapped_signals) else expect(MetricsServer).not_to receive(:fork) end diff --git a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb index db77a5d42d8..bdf9673a53f 100644 --- a/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb +++ b/spec/lib/prometheus/cleanup_multiproc_dir_service_spec.rb @@ -1,32 +1,27 @@ # frozen_string_literal: true -require 'spec_helper' +require 'fast_spec_helper' RSpec.describe Prometheus::CleanupMultiprocDirService do - describe '.call' do - subject { described_class.new.execute } - + describe '#execute' do let(:metrics_multiproc_dir) { Dir.mktmpdir } let(:metrics_file_path) { File.join(metrics_multiproc_dir, 'counter_puma_master-0.db') } + subject(:service) { described_class.new(metrics_dir_arg).execute } + before do FileUtils.touch(metrics_file_path) end after do - FileUtils.rm_r(metrics_multiproc_dir) + FileUtils.rm_rf(metrics_multiproc_dir) end context 'when `multiprocess_files_dir` is defined' do - before do - expect(Prometheus::Client.configuration) - .to receive(:multiprocess_files_dir) - .and_return(metrics_multiproc_dir) - .at_least(:once) - end + let(:metrics_dir_arg) { metrics_multiproc_dir } it 'removes old metrics' do - expect { subject } + expect { service } .to change { File.exist?(metrics_file_path) } .from(true) .to(false) @@ -34,15 +29,10 @@ RSpec.describe Prometheus::CleanupMultiprocDirService do end context 'when `multiprocess_files_dir` is not defined' do - before do - expect(Prometheus::Client.configuration) - .to receive(:multiprocess_files_dir) - .and_return(nil) - .at_least(:once) - end + let(:metrics_dir_arg) { nil } it 'does not remove any files' do - expect { subject } + expect { service } .not_to change { File.exist?(metrics_file_path) } .from(true) end