Validate the existence of archived traces before removing live trace
Often live traces are removed even though the archived trace doesn't exist. This commit checkes the existence strictly.
This commit is contained in:
parent
1a3fda63a5
commit
c2e0e689f3
|
|
@ -531,6 +531,14 @@ module Ci
|
|||
trace.exist?
|
||||
end
|
||||
|
||||
def has_live_trace?
|
||||
trace.live_trace_exist?
|
||||
end
|
||||
|
||||
def has_archived_trace?
|
||||
trace.archived_trace_exist?
|
||||
end
|
||||
|
||||
def artifacts_file
|
||||
job_artifacts_archive&.file
|
||||
end
|
||||
|
|
|
|||
|
|
@ -176,6 +176,10 @@ module Ci
|
|||
end
|
||||
end
|
||||
|
||||
def self.archived_trace_exists_for?(job_id)
|
||||
where(job_id: job_id).trace.take&.file&.file&.exists?
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def file_format_adapter_class
|
||||
|
|
|
|||
|
|
@ -2,8 +2,25 @@
|
|||
|
||||
module Ci
|
||||
class ArchiveTraceService
|
||||
def execute(job)
|
||||
def execute(job, worker_name:)
|
||||
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
|
||||
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
|
||||
unless job.has_live_trace?
|
||||
Sidekiq.logger.warn(class: worker_name,
|
||||
message: 'The job does not have live trace but going to be archived.',
|
||||
job_id: job.id)
|
||||
return
|
||||
end
|
||||
|
||||
job.trace.archive!
|
||||
|
||||
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
|
||||
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
|
||||
unless job.has_archived_trace?
|
||||
Sidekiq.logger.warn(class: worker_name,
|
||||
message: 'The job does not have archived trace after archiving.',
|
||||
job_id: job.id)
|
||||
end
|
||||
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
|
||||
# It's already archived, thus we can safely ignore this exception.
|
||||
rescue => e
|
||||
|
|
@ -11,7 +28,7 @@ module Ci
|
|||
# If `archive!` keeps failing for over a week, that could incur data loss.
|
||||
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
|
||||
# In order to avoid interrupting the system, we do not raise an exception here.
|
||||
archive_error(e, job)
|
||||
archive_error(e, job, worker_name)
|
||||
end
|
||||
|
||||
private
|
||||
|
|
@ -22,9 +39,12 @@ module Ci
|
|||
"Counter of failed attempts of trace archiving")
|
||||
end
|
||||
|
||||
def archive_error(error, job)
|
||||
def archive_error(error, job, worker_name)
|
||||
failed_archive_counter.increment
|
||||
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
|
||||
|
||||
Sidekiq.logger.warn(class: worker_name,
|
||||
message: "Failed to archive trace. message: #{error.message}.",
|
||||
job_id: job.id)
|
||||
|
||||
Gitlab::Sentry
|
||||
.track_exception(error,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ class ArchiveTraceWorker
|
|||
# rubocop: disable CodeReuse/ActiveRecord
|
||||
def perform(job_id)
|
||||
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
|
||||
Ci::ArchiveTraceService.new.execute(job)
|
||||
Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
|
||||
end
|
||||
end
|
||||
# rubocop: enable CodeReuse/ActiveRecord
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ module Ci
|
|||
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
|
||||
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
|
||||
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
|
||||
Ci::ArchiveTraceService.new.execute(build)
|
||||
Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
|
||||
end
|
||||
end
|
||||
# rubocop: enable CodeReuse/ActiveRecord
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
title: Extra logging for new live trace architecture
|
||||
merge_request: 30892
|
||||
author:
|
||||
type: fixed
|
||||
|
|
@ -63,7 +63,15 @@ module Gitlab
|
|||
end
|
||||
|
||||
def exist?
|
||||
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present?
|
||||
archived_trace_exist? || live_trace_exist?
|
||||
end
|
||||
|
||||
def archived_trace_exist?
|
||||
trace_artifact&.exists?
|
||||
end
|
||||
|
||||
def live_trace_exist?
|
||||
job.trace_chunks.any? || current_path.present? || old_trace.present?
|
||||
end
|
||||
|
||||
def read
|
||||
|
|
@ -167,7 +175,7 @@ module Gitlab
|
|||
|
||||
def clone_file!(src_stream, temp_dir)
|
||||
FileUtils.mkdir_p(temp_dir)
|
||||
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path|
|
||||
Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
|
||||
temp_path = File.join(dir_path, "job.log")
|
||||
FileUtils.touch(temp_path)
|
||||
size = IO.copy_stream(src_stream, temp_path)
|
||||
|
|
|
|||
|
|
@ -166,6 +166,13 @@ module Gitlab
|
|||
end
|
||||
|
||||
def destroy!
|
||||
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
|
||||
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
|
||||
unless build.has_archived_trace?
|
||||
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
|
||||
job_id: build.id)
|
||||
end
|
||||
|
||||
trace_chunks.fast_destroy_all
|
||||
@tell = @size = 0
|
||||
ensure
|
||||
|
|
|
|||
|
|
@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
|
|||
|
||||
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
|
||||
end
|
||||
|
||||
context 'when the job does not have archived trace' do
|
||||
it 'leaves a message in sidekiq log' do
|
||||
expect(Sidekiq.logger).to receive(:warn).with(
|
||||
message: 'The job does not have archived trace but going to be destroyed.',
|
||||
job_id: build.id).and_call_original
|
||||
|
||||
subject
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
|||
|
|
@ -692,6 +692,34 @@ describe Ci::Build do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#has_live_trace?' do
|
||||
subject { build.has_live_trace? }
|
||||
|
||||
let(:build) { create(:ci_build, :trace_live) }
|
||||
|
||||
it { is_expected.to be_truthy }
|
||||
|
||||
context 'when build does not have live trace' do
|
||||
let(:build) { create(:ci_build) }
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
end
|
||||
|
||||
describe '#has_archived_trace?' do
|
||||
subject { build.has_archived_trace? }
|
||||
|
||||
let(:build) { create(:ci_build, :trace_artifact) }
|
||||
|
||||
it { is_expected.to be_truthy }
|
||||
|
||||
context 'when build does not have archived trace' do
|
||||
let(:build) { create(:ci_build) }
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
end
|
||||
|
||||
describe '#has_job_artifacts?' do
|
||||
subject { build.has_job_artifacts? }
|
||||
|
||||
|
|
|
|||
|
|
@ -70,6 +70,31 @@ describe Ci::JobArtifact do
|
|||
end
|
||||
end
|
||||
|
||||
describe '.archived_trace_exists_for?' do
|
||||
subject { described_class.archived_trace_exists_for?(job_id) }
|
||||
|
||||
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
|
||||
let(:job) { create(:ci_build) }
|
||||
|
||||
context 'when the specified job_id exists' do
|
||||
let(:job_id) { job.id }
|
||||
|
||||
it { is_expected.to be_truthy }
|
||||
|
||||
context 'when the job does have archived trace' do
|
||||
let!(:artifact) { }
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
end
|
||||
|
||||
context 'when the specified job_id does not exist' do
|
||||
let(:job_id) { 10000 }
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
end
|
||||
|
||||
describe 'callbacks' do
|
||||
subject { create(:ci_job_artifact, :archive) }
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
require 'spec_helper'
|
||||
|
||||
describe Ci::ArchiveTraceService, '#execute' do
|
||||
subject { described_class.new.execute(job) }
|
||||
subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
|
||||
|
||||
context 'when job is finished' do
|
||||
let(:job) { create(:ci_build, :success, :trace_live) }
|
||||
|
|
@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
|
|||
expect { subject }.not_to change { Ci::JobArtifact.trace.count }
|
||||
end
|
||||
end
|
||||
|
||||
context 'when job does not have trace' do
|
||||
let(:job) { create(:ci_build, :success) }
|
||||
|
||||
it 'leaves a warning message in sidekiq log' do
|
||||
expect(Sidekiq.logger).to receive(:warn).with(
|
||||
class: ArchiveTraceWorker.name,
|
||||
message: 'The job does not have live trace but going to be archived.',
|
||||
job_id: job.id)
|
||||
|
||||
subject
|
||||
end
|
||||
end
|
||||
|
||||
context 'when job failed to archive trace but did not raise an exception' do
|
||||
before do
|
||||
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
|
||||
end
|
||||
|
||||
it 'leaves a warning message in sidekiq log' do
|
||||
expect(Sidekiq.logger).to receive(:warn).with(
|
||||
class: ArchiveTraceWorker.name,
|
||||
message: 'The job does not have archived trace after archiving.',
|
||||
job_id: job.id)
|
||||
|
||||
subject
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'when job is running' do
|
||||
|
|
@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
|
|||
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
|
||||
extra: { job_id: job.id } ).once
|
||||
|
||||
expect(Rails.logger)
|
||||
.to receive(:error)
|
||||
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet")
|
||||
.and_call_original
|
||||
expect(Sidekiq.logger).to receive(:warn).with(
|
||||
class: ArchiveTraceWorker.name,
|
||||
message: "Failed to archive trace. message: Job is not finished yet.",
|
||||
job_id: job.id).and_call_original
|
||||
|
||||
expect(Gitlab::Metrics)
|
||||
.to receive(:counter)
|
||||
|
|
|
|||
|
|
@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
|
|||
end
|
||||
end
|
||||
|
||||
describe '#archived_trace_exist?' do
|
||||
subject { trace.archived_trace_exist? }
|
||||
|
||||
context 'when trace does not exist' do
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
|
||||
context 'when archived trace exists' do
|
||||
before do
|
||||
create(:ci_job_artifact, :trace, job: build)
|
||||
end
|
||||
|
||||
it { is_expected.to be_truthy }
|
||||
end
|
||||
|
||||
context 'when live trace exists' do
|
||||
before do
|
||||
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
|
||||
stream.write('abc')
|
||||
end
|
||||
end
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
end
|
||||
|
||||
describe '#live_trace_exist?' do
|
||||
subject { trace.live_trace_exist? }
|
||||
|
||||
context 'when trace does not exist' do
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
|
||||
context 'when archived trace exists' do
|
||||
before do
|
||||
create(:ci_job_artifact, :trace, job: build)
|
||||
end
|
||||
|
||||
it { is_expected.to be_falsy }
|
||||
end
|
||||
|
||||
context 'when live trace exists' do
|
||||
before do
|
||||
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
|
||||
stream.write('abc')
|
||||
end
|
||||
end
|
||||
|
||||
it { is_expected.to be_truthy }
|
||||
end
|
||||
end
|
||||
|
||||
describe '#archive!' do
|
||||
subject { trace.archive! }
|
||||
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
|
|||
|
||||
it 'executes service' do
|
||||
expect_any_instance_of(Ci::ArchiveTraceService)
|
||||
.to receive(:execute).with(job)
|
||||
.to receive(:execute).with(job, anything)
|
||||
|
||||
subject
|
||||
end
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
|
|||
|
||||
it 'executes service' do
|
||||
expect_any_instance_of(Ci::ArchiveTraceService)
|
||||
.to receive(:execute).with(build)
|
||||
.to receive(:execute).with(build, anything)
|
||||
|
||||
subject
|
||||
end
|
||||
|
|
@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
|
|||
end
|
||||
|
||||
it 'puts a log' do
|
||||
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error")
|
||||
expect(Sidekiq.logger).to receive(:warn).with(
|
||||
class: described_class.name,
|
||||
message: "Failed to archive trace. message: Unexpected error.",
|
||||
job_id: build.id)
|
||||
|
||||
subject
|
||||
end
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
|
|||
|
||||
let!(:runner) { create :ci_runner }
|
||||
let!(:job) { create :ci_build, runner: runner }
|
||||
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
|
||||
let(:trace_lease_uuid) { SecureRandom.uuid }
|
||||
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
|
||||
let(:worker_lease_uuid) { SecureRandom.uuid }
|
||||
|
||||
|
|
@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
|
|||
|
||||
before do
|
||||
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
|
||||
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
|
||||
job.update!(status: status, updated_at: updated_at)
|
||||
end
|
||||
|
||||
|
|
@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
|
|||
end
|
||||
|
||||
it 'cancels exclusive leases after worker perform' do
|
||||
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
|
||||
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
|
||||
|
||||
worker.perform
|
||||
|
|
|
|||
Loading…
Reference in New Issue