From 294f96da0ca676d0914a592d48bd2da11baa39e7 Mon Sep 17 00:00:00 2001 From: Pedro Pombeiro Date: Wed, 30 Aug 2023 15:46:59 +0200 Subject: [PATCH 1/2] Implement redis cache of failed builds executed on instance runners Only records builds failed due to a runner system failure EE: true --- .../models/ci/instance_runner_failed_jobs.rb | 45 +++++++ ee/app/workers/ee/ci/build_finished_worker.rb | 10 ++ .../ci/instance_runner_failed_jobs_spec.rb | 113 ++++++++++++++++++ .../ee/ci/build_finished_worker_spec.rb | 73 ++++++++++- 4 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 ee/app/models/ci/instance_runner_failed_jobs.rb create mode 100644 ee/spec/models/ci/instance_runner_failed_jobs_spec.rb diff --git a/ee/app/models/ci/instance_runner_failed_jobs.rb b/ee/app/models/ci/instance_runner_failed_jobs.rb new file mode 100644 index 00000000000000..d01cc0f63a203e --- /dev/null +++ b/ee/app/models/ci/instance_runner_failed_jobs.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +module Ci + class InstanceRunnerFailedJobs + # Safety margin for situations where there is a mismatch between the async insert order and the finished_at value + JOB_LIMIT_MARGIN = 10 + JOB_LIMIT = 2 + + class << self + def save_job(job) + raise Gitlab::Access::AccessDeniedError unless License.feature_available?(:runner_performance_insights) + + with_redis do |redis| + redis.lpush(key, job.id) + redis.ltrim(key, 0, max_admissible_job_count - 1) + end + end + + def recent_jobs + raise Gitlab::Access::AccessDeniedError unless License.feature_available?(:runner_performance_insights) + + job_ids = with_redis do |redis| + # Fetch a few more jobs in case there is a mismatch between the async insert order and the finished_at value + redis.lrange(key, 0, max_admissible_job_count - 1) + end + + Ci::Build.id_in(job_ids).order(finished_at: :desc).limit(JOB_LIMIT) + end + + private + + def key + self.class.name + end + + def with_redis(&block) + Gitlab::Redis::SharedState.with(&block) + end + + def max_admissible_job_count + JOB_LIMIT + JOB_LIMIT_MARGIN + end + end + end +end diff --git a/ee/app/workers/ee/ci/build_finished_worker.rb b/ee/app/workers/ee/ci/build_finished_worker.rb index 7be7444f250e4d..7227a00a7fde72 100644 --- a/ee/app/workers/ee/ci/build_finished_worker.rb +++ b/ee/app/workers/ee/ci/build_finished_worker.rb @@ -16,6 +16,10 @@ def process_build(build) if ::Gitlab.com? && build.has_security_reports? ::Security::TrackSecureScansWorker.perform_async(build.id) end + + if save_system_failure_job?(build) + ::Ci::InstanceRunnerFailedJobs.save_job(build) + end end private @@ -29,6 +33,12 @@ def requirements_available?(build) !build.project.requirements.empty? && Ability.allowed?(build.user, :create_requirement_test_report, build.project) end + + def save_system_failure_job?(build) + License.feature_available?(:runner_performance_insights) && + build.failure_reason == 'runner_system_failure' && + build.runner.instance_type? + end end end end diff --git a/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb b/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb new file mode 100644 index 00000000000000..52f14461368536 --- /dev/null +++ b/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb @@ -0,0 +1,113 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Ci::InstanceRunnerFailedJobs, :freeze_time, :clean_gitlab_redis_shared_state, + feature_category: :runner_fleet do + let_it_be(:job) { create(:ci_build, created_at: 10.minutes.ago, finished_at: 3.minutes.ago) } + let_it_be(:job2) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 4.minutes.ago) } + let_it_be(:job3) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 5.minutes.ago) } + + before do + stub_licensed_features(runner_performance_insights: runner_performance_insights) + end + + describe '.save_job' do + subject(:save_job) { described_class.save_job(job) } + + context 'with runner_performance_insights licensed feature' do + let(:runner_performance_insights) { true } + + it 'saves job id on redis cache' do + save_job + + expect(redis_stored_job_ids).to match_array(formatted_job_ids_for(job)) + end + end + + context 'without runner_performance_insights licensed feature' do + let(:runner_performance_insights) { false } + + it 'raises AccessDeniedError error' do + expect { save_job }.to raise_error Gitlab::Access::AccessDeniedError + end + end + end + + describe '.recent_jobs' do + subject(:recent_jobs) { described_class.recent_jobs } + + context 'with runner_performance_insights licensed feature' do + let(:runner_performance_insights) { true } + + context 'when content is not set' do + it { is_expected.to be_empty } + end + + context 'when jobs are added' do + before do + described_class.save_job(job) + end + + it 'returns 2 most recently finished jobs' do + expect(described_class.recent_jobs).to contain_exactly(an_object_having_attributes(id: job.id)) + + described_class.save_job(job2) + described_class.save_job(job3) + + expect(described_class.recent_jobs).to match([ + an_object_having_attributes(id: job.id), + an_object_having_attributes(id: job2.id) + ]) + end + + context 'when jobs are added in different order' do + it 'returns 2 most recently finished jobs' do + expect(described_class.recent_jobs).to contain_exactly(an_object_having_attributes(id: job.id)) + + described_class.save_job(job3) + described_class.save_job(job2) + + expect(described_class.recent_jobs).to match([ + an_object_having_attributes(id: job.id), + an_object_having_attributes(id: job2.id) + ]) + end + end + + context 'when trimming is required' do + before do + stub_const("#{described_class}::JOB_LIMIT", 1) + stub_const("#{described_class}::JOB_LIMIT_MARGIN", 1) + end + + it 'returns 2 most recently finished jobs and purges the rest', :aggregate_failures do + described_class.save_job(job3) + described_class.save_job(job2) + + # Only the last 2 jobs saved will be retained + expect(redis_stored_job_ids).to eq(formatted_job_ids_for(job2, job3)) + # and of those 2, the most recently finished will be returned (JOB_LIMIT) + expect(described_class.recent_jobs).to contain_exactly(an_object_having_attributes(id: job2.id)) + end + end + end + end + + context 'without runner_performance_insights licensed feature' do + let(:runner_performance_insights) { false } + + it 'raises AccessDeniedError error' do + expect { recent_jobs }.to raise_error Gitlab::Access::AccessDeniedError + end + end + end + + def redis_stored_job_ids + Gitlab::Redis::SharedState.with { |redis| redis.lrange(described_class.class.name, 0, -1) } + end + + def formatted_job_ids_for(*builds) + builds.map(&:id).map(&:to_s) + end +end diff --git a/ee/spec/workers/ee/ci/build_finished_worker_spec.rb b/ee/spec/workers/ee/ci/build_finished_worker_spec.rb index 8a0eab2b3990ec..13ea3b7de3e7eb 100644 --- a/ee/spec/workers/ee/ci/build_finished_worker_spec.rb +++ b/ee/spec/workers/ee/ci/build_finished_worker_spec.rb @@ -8,10 +8,6 @@ let_it_be(:project) { build.project } let_it_be(:namespace) { project.shared_runners_limit_namespace } - subject do - described_class.new.perform(build.id) - end - def namespace_stats namespace.namespace_statistics || namespace.create_namespace_statistics end @@ -21,6 +17,10 @@ def project_stats end describe '#perform' do + subject(:perform) do + described_class.new.perform(build.id) + end + context 'when on .com' do before do allow(Gitlab).to receive(:com?).and_return(true) @@ -90,7 +90,7 @@ def project_stats it 'does not schedule processing of requirement reports by default' do expect(RequirementsManagement::ProcessRequirementsReportsWorker).not_to receive(:perform_async) - subject + perform end context 'with requirements' do @@ -138,5 +138,68 @@ def project_stats it_behaves_like 'does not schedule processing of requirement reports' end end + + it 'does not save job on Ci::InstanceRunnerFailedJobs by default' do + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) + + perform + end + + context 'when job failed', feature_category: :runner_fleet do + let_it_be_with_reload(:build) { create(:ee_ci_build, :sast, :running, runner: ci_runner) } + + before do + stub_licensed_features(runner_performance_insights: runner_performance_insights) + + build.drop!(failure_reason) + end + + context 'with runner_performance_insights licensed feature' do + let(:runner_performance_insights) { true } + + context 'with failure_reason set to :runner_system_failure' do + let(:failure_reason) { :runner_system_failure } + + it 'saves job to Redis list', :aggregate_failures do + expect(Ci::InstanceRunnerFailedJobs).to receive(:save_job).once.and_call_original + + expect { perform }.to change { Ci::InstanceRunnerFailedJobs.recent_jobs.count }.by(1) + expect(Ci::InstanceRunnerFailedJobs.recent_jobs).to contain_exactly(build) + end + end + + context 'with failure_reason not set to :runner_system_failure' do + using RSpec::Parameterized::TableSyntax + + where(:failure_reason) { Enums::Ci::CommitStatus.failure_reasons.keys - [:runner_system_failure] } + + with_them do + context 'with runner_performance_insights licensed feature' do + let(:runner_performance_insights) { true } + + specify do + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) + + perform + end + end + end + end + + context 'without runner_performance_insights licensed feature' do + let(:runner_performance_insights) { false } + + context 'with failure_reason set to :runner_system_failure' do + let(:failure_reason) { :runner_system_failure } + + specify do + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) + + perform + end + end + end + end + end end end -- GitLab From 3a88f6c445bfc829c6aecbef580de157ffccf400 Mon Sep 17 00:00:00 2001 From: Pedro Pombeiro Date: Thu, 31 Aug 2023 10:47:43 +0200 Subject: [PATCH 2/2] Address MR review comments --- .../models/ci/instance_runner_failed_jobs.rb | 22 +++-- ee/app/workers/ee/ci/build_finished_worker.rb | 10 +- .../ci/instance_runner_failed_jobs_spec.rb | 97 +++++++++++++------ .../ee/ci/build_finished_worker_spec.rb | 85 +++++----------- 4 files changed, 108 insertions(+), 106 deletions(-) diff --git a/ee/app/models/ci/instance_runner_failed_jobs.rb b/ee/app/models/ci/instance_runner_failed_jobs.rb index d01cc0f63a203e..b7e8c6a4fd6cae 100644 --- a/ee/app/models/ci/instance_runner_failed_jobs.rb +++ b/ee/app/models/ci/instance_runner_failed_jobs.rb @@ -4,27 +4,29 @@ module Ci class InstanceRunnerFailedJobs # Safety margin for situations where there is a mismatch between the async insert order and the finished_at value JOB_LIMIT_MARGIN = 10 - JOB_LIMIT = 2 + JOB_LIMIT = 100 class << self - def save_job(job) - raise Gitlab::Access::AccessDeniedError unless License.feature_available?(:runner_performance_insights) + def track(build) + return unless track_job?(build) with_redis do |redis| - redis.lpush(key, job.id) - redis.ltrim(key, 0, max_admissible_job_count - 1) + redis.pipelined do |pipeline| + pipeline.lpush(key, build.id) + pipeline.ltrim(key, 0, max_admissible_job_count - 1) + end end end def recent_jobs - raise Gitlab::Access::AccessDeniedError unless License.feature_available?(:runner_performance_insights) + return Ci::Build.none unless License.feature_available?(:runner_performance_insights) job_ids = with_redis do |redis| # Fetch a few more jobs in case there is a mismatch between the async insert order and the finished_at value redis.lrange(key, 0, max_admissible_job_count - 1) end - Ci::Build.id_in(job_ids).order(finished_at: :desc).limit(JOB_LIMIT) + Ci::Build.id_in(job_ids).order(finished_at: :desc, id: :desc).limit(JOB_LIMIT) end private @@ -40,6 +42,12 @@ def with_redis(&block) def max_admissible_job_count JOB_LIMIT + JOB_LIMIT_MARGIN end + + def track_job?(build) + License.feature_available?(:runner_performance_insights) && + build.failure_reason == 'runner_system_failure' && + build.runner.instance_type? + end end end end diff --git a/ee/app/workers/ee/ci/build_finished_worker.rb b/ee/app/workers/ee/ci/build_finished_worker.rb index 7227a00a7fde72..a27e7894731419 100644 --- a/ee/app/workers/ee/ci/build_finished_worker.rb +++ b/ee/app/workers/ee/ci/build_finished_worker.rb @@ -17,9 +17,7 @@ def process_build(build) ::Security::TrackSecureScansWorker.perform_async(build.id) end - if save_system_failure_job?(build) - ::Ci::InstanceRunnerFailedJobs.save_job(build) - end + ::Ci::InstanceRunnerFailedJobs.track(build) if build.failed? end private @@ -33,12 +31,6 @@ def requirements_available?(build) !build.project.requirements.empty? && Ability.allowed?(build.user, :create_requirement_test_report, build.project) end - - def save_system_failure_job?(build) - License.feature_available?(:runner_performance_insights) && - build.failure_reason == 'runner_system_failure' && - build.runner.instance_type? - end end end end diff --git a/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb b/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb index 52f14461368536..99df1913b090f8 100644 --- a/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb +++ b/ee/spec/models/ci/instance_runner_failed_jobs_spec.rb @@ -4,32 +4,63 @@ RSpec.describe Ci::InstanceRunnerFailedJobs, :freeze_time, :clean_gitlab_redis_shared_state, feature_category: :runner_fleet do - let_it_be(:job) { create(:ci_build, created_at: 10.minutes.ago, finished_at: 3.minutes.ago) } - let_it_be(:job2) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 4.minutes.ago) } - let_it_be(:job3) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 5.minutes.ago) } - before do stub_licensed_features(runner_performance_insights: runner_performance_insights) end - describe '.save_job' do - subject(:save_job) { described_class.save_job(job) } + describe '.track' do + subject(:track) { described_class.track(job) } - context 'with runner_performance_insights licensed feature' do - let(:runner_performance_insights) { true } + let_it_be(:instance_runner) { create(:ci_runner, :instance) } - it 'saves job id on redis cache' do - save_job + let(:job) { create(:ci_build, runner: runner) } - expect(redis_stored_job_ids).to match_array(formatted_job_ids_for(job)) - end + before do + job.drop!(failure_reason) end - context 'without runner_performance_insights licensed feature' do - let(:runner_performance_insights) { false } + context 'when job fails with runner_system_failure' do + let(:failure_reason) { :runner_system_failure } + + context 'with runner_performance_insights licensed feature' do + let(:runner_performance_insights) { true } - it 'raises AccessDeniedError error' do - expect { save_job }.to raise_error Gitlab::Access::AccessDeniedError + context 'when job is executed in an instance runner' do + let(:runner) { instance_runner } + + it 'saves job id on redis cache' do + track + + expect(redis_stored_job_ids).to match_array(formatted_job_ids_for(job)) + end + + context 'when job fails with different failure_reason' do + let(:failure_reason) { :stuck_or_timeout_failure } + + it 'does not save job' do + expect { track }.not_to change { redis_stored_job_ids } + end + end + end + + context 'when job is executed in a project runner' do + let_it_be(:project_runner) { create(:ci_runner, :project) } + + let(:runner) { project_runner } + + it 'does not save job' do + expect { track }.not_to change { redis_stored_job_ids } + end + end + end + + context 'without runner_performance_insights licensed feature' do + let(:runner_performance_insights) { false } + let(:runner) { instance_runner } + + it 'does not save job' do + expect { track }.not_to change { redis_stored_job_ids } + end end end end @@ -37,6 +68,12 @@ describe '.recent_jobs' do subject(:recent_jobs) { described_class.recent_jobs } + let_it_be(:runner) { create(:ci_runner, :instance) } + let_it_be(:job_args) { { runner: runner, failure_reason: :runner_system_failure } } + let_it_be(:job) { create(:ci_build, created_at: 10.minutes.ago, finished_at: 3.minutes.ago, **job_args) } + let_it_be(:job2) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 4.minutes.ago, **job_args) } + let_it_be(:job3) { create(:ci_build, created_at: 5.minutes.ago, finished_at: 5.minutes.ago, **job_args) } + context 'with runner_performance_insights licensed feature' do let(:runner_performance_insights) { true } @@ -46,31 +83,33 @@ context 'when jobs are added' do before do - described_class.save_job(job) + described_class.track(job) end - it 'returns 2 most recently finished jobs' do + it 'returns 3 most recently finished jobs' do expect(described_class.recent_jobs).to contain_exactly(an_object_having_attributes(id: job.id)) - described_class.save_job(job2) - described_class.save_job(job3) + described_class.track(job2) + described_class.track(job3) expect(described_class.recent_jobs).to match([ an_object_having_attributes(id: job.id), - an_object_having_attributes(id: job2.id) + an_object_having_attributes(id: job2.id), + an_object_having_attributes(id: job3.id) ]) end context 'when jobs are added in different order' do - it 'returns 2 most recently finished jobs' do + it 'returns 3 most recently finished jobs' do expect(described_class.recent_jobs).to contain_exactly(an_object_having_attributes(id: job.id)) - described_class.save_job(job3) - described_class.save_job(job2) + described_class.track(job3) + described_class.track(job2) expect(described_class.recent_jobs).to match([ an_object_having_attributes(id: job.id), - an_object_having_attributes(id: job2.id) + an_object_having_attributes(id: job2.id), + an_object_having_attributes(id: job3.id) ]) end end @@ -82,8 +121,8 @@ end it 'returns 2 most recently finished jobs and purges the rest', :aggregate_failures do - described_class.save_job(job3) - described_class.save_job(job2) + described_class.track(job3) + described_class.track(job2) # Only the last 2 jobs saved will be retained expect(redis_stored_job_ids).to eq(formatted_job_ids_for(job2, job3)) @@ -97,9 +136,7 @@ context 'without runner_performance_insights licensed feature' do let(:runner_performance_insights) { false } - it 'raises AccessDeniedError error' do - expect { recent_jobs }.to raise_error Gitlab::Access::AccessDeniedError - end + it { is_expected.to be_empty } end end diff --git a/ee/spec/workers/ee/ci/build_finished_worker_spec.rb b/ee/spec/workers/ee/ci/build_finished_worker_spec.rb index 13ea3b7de3e7eb..5700f28bad6c5e 100644 --- a/ee/spec/workers/ee/ci/build_finished_worker_spec.rb +++ b/ee/spec/workers/ee/ci/build_finished_worker_spec.rb @@ -30,7 +30,7 @@ def project_stats it 'tracks secure scans' do expect(::Security::TrackSecureScansWorker).to receive(:perform_async) - subject + perform end context 'when exception is raised in `super`' do @@ -38,7 +38,7 @@ def project_stats allow(Ci::Build).to receive(:find_by_id).with(build.id).and_return(build) allow(build).to receive(:execute_hooks).and_raise(ArgumentError) - expect { subject }.to raise_error(ArgumentError) + expect { perform }.to raise_error(ArgumentError) expect(::Security::TrackSecureScansWorker).not_to receive(:perform_async) end @@ -50,9 +50,15 @@ def project_stats it 'does not track secure scans' do expect(::Security::TrackSecureScansWorker).not_to receive(:perform_async) - subject + perform end end + + it 'does not track job on InstanceRunnerFailedJobs' do + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:track) + + perform + end end context 'when not on .com' do @@ -63,13 +69,19 @@ def project_stats it 'does not notify the owners of Groups' do expect(::Ci::Minutes::EmailNotificationService).not_to receive(:new) - subject + perform end it 'does not track secure scans' do expect(::Security::TrackSecureScansWorker).not_to receive(:perform_async) - subject + perform + end + + it 'does not track job on InstanceRunnerFailedJobs' do + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:track) + + perform end end @@ -83,7 +95,7 @@ def project_stats it 'does not scan security reports for token revocation' do expect(ScanSecurityReportSecretsWorker).not_to receive(:perform_async) - subject + perform end end @@ -106,7 +118,7 @@ def project_stats it do expect(RequirementsManagement::ProcessRequirementsReportsWorker).not_to receive(:perform_async) - subject + perform end end @@ -118,7 +130,7 @@ def project_stats it 'schedules processing of requirement reports' do expect(RequirementsManagement::ProcessRequirementsReportsWorker).to receive(:perform_async) - subject + perform end context 'when user has insufficient permissions to create test reports' do @@ -140,65 +152,18 @@ def project_stats end it 'does not save job on Ci::InstanceRunnerFailedJobs by default' do - expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) + expect(Ci::InstanceRunnerFailedJobs).not_to receive(:track) perform end context 'when job failed', feature_category: :runner_fleet do - let_it_be_with_reload(:build) { create(:ee_ci_build, :sast, :running, runner: ci_runner) } - - before do - stub_licensed_features(runner_performance_insights: runner_performance_insights) - - build.drop!(failure_reason) - end - - context 'with runner_performance_insights licensed feature' do - let(:runner_performance_insights) { true } + let(:build) { create(:ee_ci_build, :sast, :failed, runner: ci_runner) } - context 'with failure_reason set to :runner_system_failure' do - let(:failure_reason) { :runner_system_failure } + it 'tracks job on InstanceRunnerFailedJobs' do + expect(Ci::InstanceRunnerFailedJobs).to receive(:track).once - it 'saves job to Redis list', :aggregate_failures do - expect(Ci::InstanceRunnerFailedJobs).to receive(:save_job).once.and_call_original - - expect { perform }.to change { Ci::InstanceRunnerFailedJobs.recent_jobs.count }.by(1) - expect(Ci::InstanceRunnerFailedJobs.recent_jobs).to contain_exactly(build) - end - end - - context 'with failure_reason not set to :runner_system_failure' do - using RSpec::Parameterized::TableSyntax - - where(:failure_reason) { Enums::Ci::CommitStatus.failure_reasons.keys - [:runner_system_failure] } - - with_them do - context 'with runner_performance_insights licensed feature' do - let(:runner_performance_insights) { true } - - specify do - expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) - - perform - end - end - end - end - - context 'without runner_performance_insights licensed feature' do - let(:runner_performance_insights) { false } - - context 'with failure_reason set to :runner_system_failure' do - let(:failure_reason) { :runner_system_failure } - - specify do - expect(Ci::InstanceRunnerFailedJobs).not_to receive(:save_job) - - perform - end - end - end + perform end end end -- GitLab