Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Execution Hanging Detection #101

Merged
1 change: 1 addition & 0 deletions lib/github/plan_execution/finished.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Finished
def initialize(payload)
@check_suite = CheckSuite.find_by(bamboo_ci_ref: payload['bamboo_ref'])
@logger = GithubLogger.instance.create('github_plan_execution_finished.log', Logger::INFO)
@hanged = payload['hanged'] || false
end

def finished
Expand Down
15 changes: 13 additions & 2 deletions lib/github/update_status.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def update_status
case @status
when 'in_progress'
@job.in_progress(@github_check)
create_timeout_worker
when 'success'
@job.success(@github_check)
@job.update_execution_time
Expand All @@ -81,6 +82,16 @@ def update_status
[500, 'Internal Server Error']
end

def create_timeout_worker
Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{@check_suite.id}%")&.delete_all

logger(Logger::INFO, "CiJobStatus::Update: TimeoutExecution for '#{@check_suite.id}'")

TimeoutExecution
.delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
.timeout(@check_suite.id)
end

def insert_new_delayed_job
queue = @job.check_suite.pull_request.github_pr_id % 10

Expand All @@ -91,7 +102,7 @@ def delete_and_create_delayed_job(queue)
fetch_delayed_job&.destroy_all

CiJobStatus
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now, queue: queue)
.delay(run_at: DELAYED_JOB_TIMER.seconds.from_now.utc, queue: queue)
.update(@job.check_suite.id, @job.id)
end

Expand All @@ -117,7 +128,7 @@ def failure
return failures_stats if @failures.is_a? Array and [email protected]?

CiJobFetchTopotestFailures
.delay(run_at: 5.minutes.from_now, queue: 'fetch_topotest_failures')
.delay(run_at: 5.minutes.from_now.utc, queue: 'fetch_topotest_failures')
.update(@job.id, 1)
end

Expand Down
1 change: 1 addition & 0 deletions lib/github_ci_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

# Workers
require_relative '../workers/ci_job_status'
require_relative '../workers/timeout_execution'
require_relative '../workers/ci_job_fetch_topotest_failures'

# Slack libs
Expand Down
4 changes: 4 additions & 0 deletions lib/models/check_suite.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,8 @@ def in_progress?
def execution_started?
ci_jobs.where(status: :in_progress).size < 2
end

def last_job_updated_at_timer
ci_jobs.max_by(&:updated_at)&.updated_at
end
end
1 change: 1 addition & 0 deletions spec/lib/github/update_status_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
before do
allow(Github::PlanExecution::Finished).to receive(:new).and_return(fake_finish_plan)
allow(fake_finish_plan).to receive(:fetch_build_status)
allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
end

describe 'Validates different Ci Job status' do
Expand Down
42 changes: 42 additions & 0 deletions spec/lib/models/check_suite_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# ci_job_spec.rb
# Part of NetDEF CI System
#
# Copyright (c) 2023 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

describe CheckSuite do
context '#execution_started?' do
let(:check_suite) { create(:check_suite) }
let(:check_suite_running) { create(:check_suite, :with_in_progress) }

it 'returns true when there are less than 2 jobs in progress' do
expect(check_suite.execution_started?).to be_truthy
end

it 'returns false' do
expect(check_suite_running.execution_started?).to be_falsey
end
end

context '#last_job_updated_at_timer? -> success' do
let(:ci_job) { create(:ci_job) }
let(:check_suite) { create(:check_suite, ci_jobs: [ci_job]) }

it 'returns false' do
expect(check_suite.last_job_updated_at_timer).not_to be_nil
end
end

context '#last_job_updated_at_timer? -> error' do
let(:ci_job) { create(:ci_job, updated_at: nil) }
let(:check_suite) { create(:check_suite, ci_jobs: []) }

it 'returns false' do
expect(check_suite.last_job_updated_at_timer).to be_nil
end
end
end
2 changes: 2 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ def app
config.include FactoryBot::Syntax::Methods
config.include WebMock::API

config.add_formatter('json', 'tmp/rspec_results.json')

pid = nil

config.before(:suite) do
Expand Down
76 changes: 76 additions & 0 deletions spec/workers/timeout_execution_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# slack_bot_spec.rb
# Part of NetDEF CI System
#
# Copyright (c) 2023 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

describe TimeoutExecution do
let(:timeout_execution) { described_class.instance }
let(:finished_instance) { Github::PlanExecution::Finished.new({}) }

before do
allow(Github::PlanExecution::Finished).to receive(:new).and_return(finished_instance)
end

context 'when timeout is called, but still running' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(true)
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end

context 'when timeout is called, but hanged' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
allow(finished_instance).to receive(:finished).and_return([200, 'Finished'])
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_truthy
end
end

context 'when timeout is called and rescheduled' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc + 3.hours)
allow(TimeoutExecution).to receive_message_chain(:delay, :timeout).and_return(true)
end

it 'calls timeout job' do
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end

context 'when timeout is called, last update in 2 hour ago' do
let(:check_suite) { create(:check_suite) }

before do
allow(CheckSuite).to receive(:find).and_return(check_suite)
allow(check_suite).to receive(:finished?).and_return(false)
allow(check_suite).to receive(:last_job_updated_at_timer).and_return(Time.now.utc - 3.hours)
end

it 'calls timeout job' do
expect(TimeoutExecution).to receive(:timeout)
expect(described_class.timeout(check_suite.id)).to be_falsey
end
end
end
49 changes: 49 additions & 0 deletions workers/timeout_execution.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# ci_job_status.rb
# Part of NetDEF CI System
#
# Copyright (c) 2024 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

require_relative '../config/setup'

class TimeoutExecution
class << self
def timeout(check_suite_id)
@logger = GithubLogger.instance.create('timeout_execution_worker.log', Logger::INFO)
check_suite = CheckSuite.find(check_suite_id)

@logger.info("Timeout execution for check_suite_id: #{check_suite_id} -> finished? #{check_suite.finished?}")

return false if check_suite.finished?
return rescheduling([], check_suite_id) if check_suite.last_job_updated_at_timer > 2.hour.ago.utc

@logger.info("Calling Github::PlanExecution::Finished.new(#{check_suite.bamboo_ci_ref}).finished")

rescheduling(finished(check_suite), check_suite_id)
end

def finished(check_suite)
Github::PlanExecution::Finished
.new({ 'bamboo_ref' => check_suite.bamboo_ci_ref, hanged: true })
.finished
end

def rescheduling(resp, check_suite_id)
return true if resp == [200, 'Finished']

@logger.info("Rescheduling check_suite_id: #{check_suite_id}")

Delayed::Job.where('handler LIKE ?', "%TimeoutExecution%args%-%#{check_suite_id}%").delete_all

TimeoutExecution
.delay(run_at: 2.hours.from_now.utc, queue: 'timeout_execution')
.timeout(check_suite_id)

false
end
end
end
Loading