diff --git a/config/gitlab.yml.example b/config/gitlab.yml.example index 07be2bbf57b9d8b750a85f25f5c880c58e43fb2b..8c006b9b3170bb2c0b47726d228887e57651f33b 100644 --- a/config/gitlab.yml.example +++ b/config/gitlab.yml.example @@ -586,6 +586,10 @@ production: &base ci_runners_stale_machines_cleanup_worker: cron: "36 * * * *" + # Periodically clean up stale runner system failed builds. + ci_runners_failed_builds_cleanup_worker: + cron: "*/10 * * * *" + # GitLab EE only jobs. These jobs are automatically enabled for an EE # installation, and ignored for a CE installation. ee_cron_jobs: diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb index c87d54b9bd48101f3585c2c0bf7d050661e7177c..b97b3463610f3a579f7e40278a0527e7e2f09b4e 100644 --- a/config/initializers/1_settings.rb +++ b/config/initializers/1_settings.rb @@ -688,6 +688,9 @@ Settings.cron_jobs['service_desk_custom_email_verification_cleanup'] ||= {} Settings.cron_jobs['service_desk_custom_email_verification_cleanup']['cron'] ||= '*/2 * * * *' Settings.cron_jobs['service_desk_custom_email_verification_cleanup']['job_class'] = 'ServiceDesk::CustomEmailVerificationCleanupWorker' +Settings.cron_jobs['ci_runners_failed_builds_cleanup_worker'] ||= {} +Settings.cron_jobs['ci_runners_failed_builds_cleanup_worker']['cron'] ||= '*/10 * * * *' +Settings.cron_jobs['ci_runners_failed_builds_cleanup_worker']['job_class'] = 'Ci::Runners::FailedBuildsCleanupCronWorker' Gitlab.ee do Settings.cron_jobs['analytics_devops_adoption_create_all_snapshots_worker'] ||= {} diff --git a/ee/app/models/ci/runner_failed_build.rb b/ee/app/models/ci/runner_failed_build.rb index 85369419e0581b78f4a56968f3df0eac4309702e..a56950f9c19b575005697c620ecd09b76f58125c 100644 --- a/ee/app/models/ci/runner_failed_build.rb +++ b/ee/app/models/ci/runner_failed_build.rb @@ -14,6 +14,20 @@ class RunnerFailedBuild < Ci::ApplicationRecord validates :build, presence: true validates :finished_at, presence: true + # The `STALE_TIMEOUT` constant defines the how far past the finished_at timestamp records + # will be considered stale + STALE_TIMEOUT = 1.hour + + scope :stale, -> do + finished_some_time_ago = arel_table[:finished_at].lteq(STALE_TIMEOUT.ago) + + where(finished_some_time_ago).order(finished_at: :desc).offset(2) + end + + scope :for_build, ->(build) do + where(build_id: build) + end + class << self def upsert_from_build!(build) return if Feature.disabled?(:runner_failed_builds) diff --git a/ee/app/services/ci/runners/failed_builds_cleanup_service.rb b/ee/app/services/ci/runners/failed_builds_cleanup_service.rb new file mode 100644 index 0000000000000000000000000000000000000000..88adeac243e9de015f08c500e2f9e71ccec014b0 --- /dev/null +++ b/ee/app/services/ci/runners/failed_builds_cleanup_service.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +module Ci + module Runners + class FailedBuildsCleanupService + MAX_DELETIONS = 1000 + SUB_BATCH_LIMIT = 100 + + def execute + ServiceResponse.success(payload: delete_builds) + end + + private + + def delete_builds + batch_counts = [] + total_deleted_count = 0 + + if Feature.enabled?(:runner_failed_builds) + loop do + sub_batch_limit = [SUB_BATCH_LIMIT, MAX_DELETIONS].min + + # delete_all discards part of the `stale` scope query, + # so we explicitly wrap it with a SELECT as a workaround + deleted_count = Ci::RunnerFailedBuild + .for_build(Ci::RunnerFailedBuild.stale.select(:build_id).limit(sub_batch_limit)) + .delete_all + batch_counts << deleted_count + total_deleted_count += deleted_count + + break if deleted_count == 0 || total_deleted_count >= MAX_DELETIONS + end + end + + { + total_deleted: total_deleted_count, + batch_counts: batch_counts + } + end + end + end +end diff --git a/ee/app/workers/all_queues.yml b/ee/app/workers/all_queues.yml index ec11439a65de488654a7ab4239b39a475f0819d5..2dfdf022eb005eb11e81582ead7708d5cbcf5079 100644 --- a/ee/app/workers/all_queues.yml +++ b/ee/app/workers/all_queues.yml @@ -102,6 +102,15 @@ :weight: 1 :idempotent: true :tags: [] +- :name: cronjob:ci_runners_failed_builds_cleanup_cron + :worker_name: Ci::Runners::FailedBuildsCleanupCronWorker + :feature_category: :runner_fleet + :has_external_dependencies: false + :urgency: :low + :resource_boundary: :unknown + :weight: 1 + :idempotent: true + :tags: [] - :name: cronjob:ci_runners_stale_group_runners_prune_cron :worker_name: Ci::Runners::StaleGroupRunnersPruneCronWorker :feature_category: :runner_fleet diff --git a/ee/app/workers/ci/runners/failed_builds_cleanup_cron_worker.rb b/ee/app/workers/ci/runners/failed_builds_cleanup_cron_worker.rb new file mode 100644 index 0000000000000000000000000000000000000000..fc36a8a2a29ad87c504c2bdf0ea884e5b332da12 --- /dev/null +++ b/ee/app/workers/ci/runners/failed_builds_cleanup_cron_worker.rb @@ -0,0 +1,24 @@ +# frozen_string_literal: true + +module Ci + module Runners + class FailedBuildsCleanupCronWorker + include ApplicationWorker + + # This worker does not schedule other workers that require context. + include CronjobQueue # rubocop:disable Scalability/CronWorkerContext + + data_consistency :sticky + feature_category :runner_fleet + urgency :low + + idempotent! + + def perform + result = ::Ci::Runners::FailedBuildsCleanupService.new.execute + log_extra_metadata_on_done(:status, result.status) + log_hash_metadata_on_done(result.payload) + end + end + end +end diff --git a/ee/spec/factories/ci/runner_failed_builds.rb b/ee/spec/factories/ci/runner_failed_builds.rb index 18eec1639267aa3cee3488abee616cc07ecc7568..2f2cb679089a61ea123b24659808fd9d74fdfd64 100644 --- a/ee/spec/factories/ci/runner_failed_builds.rb +++ b/ee/spec/factories/ci/runner_failed_builds.rb @@ -10,5 +10,9 @@ finished_at: 'Di 29. Okt 09:53:28 CET 2013' finished_at { build&.finished_at } + + trait :stale do + finished_at { Ci::RunnerFailedBuild::STALE_TIMEOUT.ago } + end end end diff --git a/ee/spec/models/ci/runner_failed_build_spec.rb b/ee/spec/models/ci/runner_failed_build_spec.rb index 2c45853f9438edc8b7efd93dd2000d5567dfe94a..bf552106092b18d17bbcfb0e63e7cc90be6e30f8 100644 --- a/ee/spec/models/ci/runner_failed_build_spec.rb +++ b/ee/spec/models/ci/runner_failed_build_spec.rb @@ -74,4 +74,25 @@ end end end + + describe '.for_build', :freeze_time do + let_it_be(:build1) { create(:ci_build, finished_at: Time.current) } + let_it_be(:build2) { create(:ci_build, finished_at: Time.current) } + let_it_be(:runner_failed_build1) { create(:ci_runner_failed_build, build: build1) } + let_it_be(:runner_failed_build2) { create(:ci_runner_failed_build, build: build2) } + + subject(:scope) { described_class.for_build(builds) } + + context 'with 1 build' do + let(:builds) { [build1] } + + it { is_expected.to contain_exactly runner_failed_build1 } + end + + context 'with 2 builds specified by id' do + let(:builds) { [build1.id, build2.id] } + + it { is_expected.to contain_exactly(runner_failed_build1, runner_failed_build2) } + end + end end diff --git a/ee/spec/services/ci/runners/failed_builds_cleanup_service_spec.rb b/ee/spec/services/ci/runners/failed_builds_cleanup_service_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..2ee4746155fc45665859427b769c357ca84350ff --- /dev/null +++ b/ee/spec/services/ci/runners/failed_builds_cleanup_service_spec.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Ci::Runners::FailedBuildsCleanupService, :freeze_time, feature_category: :runner_fleet do + let(:service) { described_class.new } + let!(:runner_failed_build1) { create(:ci_runner_failed_build, :stale) } + let!(:runner_failed_build2) { create(:ci_runner_failed_build, :stale) } + let!(:runner_failed_build3) { create(:ci_runner_failed_build, finished_at: Time.current) } + + subject(:response) { service.execute } + + context 'with no stale runner failed builds' do + it 'does not clean any runner failed builds and returns :success status' do + expect do + expect(response).to be_success + expect(response.payload).to match({ total_deleted: 0, batch_counts: [0] }) + end.not_to change { Ci::RunnerFailedBuild.count }.from(3) + end + end + + context 'with some stale runner failed builds' do + let!(:runner_failed_build4) { create(:ci_runner_failed_build, :stale) } + let!(:runner_failed_build5) { create(:ci_runner_failed_build, :stale) } + + it 'only leaves non-stale builds' do + expect(Ci::RunnerFailedBuild.stale.count).to eq 2 + expect(response).to be_success + expect(response.payload).to match({ total_deleted: 2, batch_counts: [2, 0] }) + expect(Ci::RunnerFailedBuild.all.ids).to contain_exactly( + runner_failed_build3.id, runner_failed_build4.id, runner_failed_build5.id + ) + end + + context 'when runner_failed_builds FF is disabled' do + before do + stub_feature_flags(runner_failed_builds: false) + end + + it 'does not clean up stale builds', :aggregate_failures do + expect(Ci::RunnerFailedBuild.stale.count).to eq 2 + + expect { response }.not_to change { Ci::RunnerFailedBuild.count }.from(5) + end + end + + context 'with more stale runners than SUB_BATCH_LIMIT' do + before do + stub_const("#{described_class}::SUB_BATCH_LIMIT", 1) + end + + it 'only leaves non-stale runners' do + expect(response).to be_success + expect(response.payload).to match({ total_deleted: 2, batch_counts: [1, 1, 0] }) + expect(Ci::RunnerFailedBuild.all.ids).to contain_exactly( + runner_failed_build3.id, runner_failed_build4.id, runner_failed_build5.id + ) + end + end + + context 'with more stale runners than MAX_DELETIONS' do + before do + stub_const("#{described_class}::MAX_DELETIONS", 1) + end + + it 'only leaves non-stale runners' do + expect do + expect(response).to be_success + expect(response.payload).to match({ + total_deleted: described_class::MAX_DELETIONS, + batch_counts: [1] + }) + end.to change { Ci::RunnerFailedBuild.count }.by(-described_class::MAX_DELETIONS) + end + end + end +end diff --git a/ee/spec/workers/ci/runners/failed_builds_cleanup_cron_worker_spec.rb b/ee/spec/workers/ci/runners/failed_builds_cleanup_cron_worker_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..8a973ca77b3adbb9f70642401f0ab135da33e47f --- /dev/null +++ b/ee/spec/workers/ci/runners/failed_builds_cleanup_cron_worker_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Ci::Runners::FailedBuildsCleanupCronWorker, feature_category: :runner_fleet do + let(:worker) { described_class.new } + + describe '#perform', :freeze_time do + subject(:perform) { worker.perform } + + let!(:runner_failed_build1) { create(:ci_runner_failed_build, :stale) } + let!(:runner_failed_build2) { create(:ci_runner_failed_build, :stale) } + let!(:runner_failed_build3) { create(:ci_runner_failed_build, finished_at: Time.current) } + let!(:runner_failed_build4) { create(:ci_runner_failed_build, :stale) } + + it_behaves_like 'an idempotent worker' do + it 'delegates to Ci::Runners::FailedBuildsCleanupService' do + expect_next_instance_of(Ci::Runners::FailedBuildsCleanupService) do |service| + expect(service) + .to receive(:execute).and_call_original + end + + perform + + expect(worker.logging_extras).to eq({ + "extra.ci_runners_failed_builds_cleanup_cron_worker.status" => :success, + "extra.ci_runners_failed_builds_cleanup_cron_worker.total_deleted" => 1, + "extra.ci_runners_failed_builds_cleanup_cron_worker.batch_counts" => [1, 0] + }) + end + + it 'cleans up stale builds', :aggregate_failures do + expect(Ci::RunnerFailedBuild.stale.count).to eq 1 + + expect { perform }.to change { Ci::RunnerFailedBuild.count }.from(4).to(3) + + expect(Ci::RunnerFailedBuild.all.pluck(:build_id)) + .to match_array([runner_failed_build2, runner_failed_build3, runner_failed_build4].map(&:build_id)) + end + + context 'when runner_failed_builds FF is disabled' do + before do + stub_feature_flags(runner_failed_builds: false) + end + + it 'does not clean up stale builds', :aggregate_failures do + expect(Ci::RunnerFailedBuild.stale.count).to eq 1 + + expect { perform }.not_to change { Ci::RunnerFailedBuild.count }.from(4) + end + end + end + end +end