140 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
			
		
		
	
	
			140 lines
		
	
	
		
			4.8 KiB
		
	
	
	
		
			Ruby
		
	
	
	
# frozen_string_literal: true

# trigger stackprof by sending a SIGUSR2 signal
#
# Docs: https://docs.gitlab.com/ee/development/performance.html#production

require 'securerandom'
require 'tmpdir'

module Gitlab
  # Toggles a StackProf profile of the current process every time the process
  # receives a SIGUSR2 signal: the first signal starts a profile, the next one
  # (or the timeout, whichever comes first) stops it and hands the output file
  # name to ::StackProf.results.
  #
  # Behaviour is tuned through environment variables:
  #
  # * STACKPROF_TIMEOUT_S:   seconds after which a running profile is stopped
  #                          automatically (default: DEFAULT_TIMEOUT_SEC)
  # * STACKPROF_FILE_PREFIX: directory prefix of the profile file name
  #                          (default: DEFAULT_FILE_PREFIX)
  # * STACKPROF_MODE:        mode passed to ::StackProf.start
  #                          (default: DEFAULT_MODE)
  # * STACKPROF_INTERVAL:    interval passed to ::StackProf.start
  #                          (default: depends on the mode, see .interval)
  # * STACKPROF_RAW:         converted with Gitlab::Utils.to_boolean and passed
  #                          as the `raw` option (default: 'true')
  class StackProf
    DEFAULT_FILE_PREFIX = Dir.tmpdir
    DEFAULT_TIMEOUT_SEC = 30
    DEFAULT_MODE = :cpu
    # Sample interval in microseconds (~99hz); appropriate for CPU profiles
    DEFAULT_INTERVAL_US = 10_100
    # Sample interval in event occurrences (n = every nth event); appropriate for allocation profiles
    DEFAULT_INTERVAL_EVENTS = 100

    # Registers .on_worker_start with the worker lifecycle of the current
    # runtime.
    #
    # this is a workaround for sidekiq, which defines its own SIGUSR2 handler.
    # by deferring to the sidekiq startup event, we get to set up our own
    # handler late enough.
    # see also: https://github.com/mperham/sidekiq/pull/4653
    def self.install
      # loaded lazily: only needed once the profiler is actually installed
      require 'stackprof'
      require 'tmpdir'

      if Gitlab::Runtime.sidekiq?
        Sidekiq.configure_server do |config|
          config.on :startup do
            on_worker_start
          end
        end
      else
        Gitlab::Cluster::LifecycleEvents.on_worker_start do
          on_worker_start
        end
      end
    end

    # Spawns the thread that toggles the profile and installs the SIGUSR2
    # handler that wakes it up.
    def self.on_worker_start
      log_event('listening for SIGUSR2 signal')

      # create a pipe in order to propagate signal out of the signal handler
      # see also: https://cr.yp.to/docs/selfpipe.html
      read, write = IO.pipe

      # create a separate thread that polls for signals on the pipe.
      #
      # this way we do not execute in signal handler context, which
      # lifts restrictions and also serializes the calls in a thread-safe
      # manner.
      #
      # it's very similar to a goroutine and channel design.
      #
      # another nice benefit of this method is that we can timeout the
      # IO.select call, allowing the profile to automatically stop after
      # a given interval (by default 30 seconds), avoiding unbounded memory
      # growth from a profile that was started and never stopped.
      t = Thread.new do
        profile_loop(read)
      rescue StandardError => e
        # a StandardError ends the polling thread; it is only logged
        log_event("stackprof failed: #{e}")
      end
      t.abort_on_exception = true

      # in the case of puma, this will override the existing SIGUSR2 signal handler
      # that can be used to trigger a restart.
      #
      # puma cluster has two types of restarts:
      # * SIGUSR1: phased restart
      # * SIGUSR2: restart
      #
      # phased restart is not supported in our configuration, because we use
      # preload_app. this means we will always perform a normal restart.
      # additionally, phased restart is not supported when sending a SIGUSR2
      # directly to a puma worker (as opposed to the master process).
      #
      # the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
      # our configuration, and we can always use a SIGUSR1 to perform a restart.
      #
      # thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
      # override the puma behaviour.
      #
      # see also:
      # * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
      # * https://github.com/mperham/sidekiq/wiki/Signals
      Signal.trap('SIGUSR2') do
        write.write('.')
      end
    end

    # Writes a structured 'stackprof' entry to the application JSON log.
    #
    # event  - the log message
    # labels - extra key/value pairs merged into the entry; nil values are dropped
    def self.log_event(event, labels = {})
      Gitlab::AppJsonLogger.info({
        event: 'stackprof',
        message: event,
        pid: Process.pid
      }.merge(labels.compact))
    end

    # Returns the default sampling interval for the given StackProf mode:
    # an event count for :object, microseconds for every other mode.
    def self.interval(mode)
      mode == :object ? DEFAULT_INTERVAL_EVENTS : DEFAULT_INTERVAL_US
    end

    # Blocks on the read end of the self-pipe and toggles the profile each time
    # a byte arrives. While a profile is running the wait is bounded by the
    # timeout, so the profile is also stopped when no signal arrives in time.
    def self.profile_loop(read)
      timeout_s = profile_timeout_s
      # nil makes IO.select wait indefinitely; only set while a profile runs
      current_timeout_s = nil

      loop do
        read.getbyte if IO.select([read], nil, nil, current_timeout_s)

        if ::StackProf.running?
          stop_profile(timeout_s)
          current_timeout_s = nil
        else
          start_profile(timeout_s)
          current_timeout_s = timeout_s
        end
      end
    end

    # Starts a profile using the mode, interval and raw settings from ENV.
    def self.start_profile(timeout_s)
      mode = profile_mode
      sample_interval = positive_integer_env('STACKPROF_INTERVAL') || interval(mode)

      log_event(
        'starting profile',
        profile_mode: mode,
        profile_interval: sample_interval,
        profile_timeout: timeout_s
      )

      ::StackProf.start(
        mode: mode,
        raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
        interval: sample_interval
      )
    end

    # Stops the running profile and passes a unique, per-process output file
    # name to ::StackProf.results.
    def self.stop_profile(timeout_s)
      file_prefix = ENV['STACKPROF_FILE_PREFIX'] || DEFAULT_FILE_PREFIX
      out_file = "#{file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"

      log_event(
        'stopping profile',
        profile_filename: out_file,
        profile_timeout_s: timeout_s
      )

      ::StackProf.stop
      ::StackProf.results(out_file)
    end

    # Profile timeout in seconds. Falls back to the default when the variable
    # is unset or not a positive number: a timeout of 0 would make IO.select
    # return immediately and stop a profile right after it was started.
    def self.profile_timeout_s
      positive_integer_env('STACKPROF_TIMEOUT_S') || DEFAULT_TIMEOUT_SEC
    end

    # StackProf mode from ENV as a symbol; a blank value counts as unset.
    def self.profile_mode
      mode = ENV['STACKPROF_MODE'].to_s.strip
      mode.empty? ? DEFAULT_MODE : mode.to_sym
    end

    # Returns ENV[name] as an Integer, or nil when the variable is unset,
    # empty, non-numeric or not greater than zero.
    def self.positive_integer_env(name)
      value = ENV[name].to_i
      value if value.positive?
    end

    private_class_method :profile_loop, :start_profile, :stop_profile,
      :profile_timeout_s, :profile_mode, :positive_integer_env
  end
end