From a3abdf850537bfffcb15bc8275744d18fff8b532 Mon Sep 17 00:00:00 2001 From: Xavier-Do Date: Thu, 5 Dec 2019 13:47:38 +0100 Subject: [PATCH] [IMP] runbot: add docker state docker_is_running is ambiguous since we dont know if it was started once. This new feature tries to add tools to know if a docker was started or not. The main reason of this is that sometimes docker_run may take more than 15 seconds creating unpredictable errors on build when the second step is launched and the previous one is still running. Hopefully this fix will help to solve this issue and detect late docker run. --- runbot/container.py | 16 ++++++++++++++-- runbot/models/build.py | 16 ++++++++++------ runbot/tests/test_schedule.py | 6 +++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/runbot/container.py b/runbot/container.py index e18a45a5..e58779ce 100644 --- a/runbot/container.py +++ b/runbot/container.py @@ -128,7 +128,8 @@ def docker_run(run_cmd, log_path, build_dir, container_name, exposed_ports=None, cmd_object = Command([], run_cmd.split(' '), []) _logger.debug('Docker run command: %s', run_cmd) logs = open(log_path, 'w') - run_cmd = 'cd /data/build && %s' % run_cmd + run_cmd = 'cd /data/build;touch start-%s;%s;cd /data/build;touch end-%s' % (container_name, run_cmd, container_name) + docker_clear_state(container_name, build_dir) # ensure that no state are remaining logs.write("Docker command:\n%s\n=================================================\n" % cmd_object) # create start script docker_command = [ @@ -173,10 +174,21 @@ def docker_stop(container_name): dstop = subprocess.run(['docker', 'stop', container_name]) def docker_is_running(container_name): - """Return True if container is still running""" dinspect = subprocess.run(['docker', 'container', 'inspect', container_name], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) return True if dinspect.returncode == 0 else False +def docker_state(container_name, build_dir): + started = os.path.exists(os.path.join(build_dir, 'start-%s' % container_name)) + ended = os.path.exists(os.path.join(build_dir, 'end-%s' % container_name)) + return 'END' if ended else 'RUNNING' if started else 'UNKNOWN' + +def docker_clear_state(container_name, build_dir): + """Return True if container is still running""" + if os.path.exists(os.path.join(build_dir, 'start-%s' % container_name)): + os.remove(os.path.join(build_dir, 'start-%s' % container_name)) + if os.path.exists(os.path.join(build_dir, 'end-%s' % container_name)): + os.remove(os.path.join(build_dir, 'end-%s' % container_name)) + def docker_get_gateway_ip(): """Return the host ip of the docker default bridge gateway""" docker_net_inspect = subprocess.run(['docker', 'network', 'inspect', 'bridge'], stdout=subprocess.PIPE) diff --git a/runbot/models/build.py b/runbot/models/build.py index d593de38..b3e1b14d 100644 --- a/runbot/models/build.py +++ b/runbot/models/build.py @@ -9,7 +9,7 @@ import subprocess import time import datetime from ..common import dt2time, fqdn, now, grep, uniq_list, local_pgadmin_cursor, s2human, Commit, dest_reg, os -from ..container import docker_build, docker_stop, docker_is_running, Command +from ..container import docker_build, docker_stop, docker_state, Command from odoo.addons.runbot.models.repo import RunbotException from odoo import models, fields, api from odoo.exceptions import UserError, ValidationError @@ -587,7 +587,7 @@ class runbot_build(models.Model): continue if build.requested_action == 'wake_up': - if docker_is_running(build._get_docker_name()): + if docker_state(build._get_docker_name(), build._path()) == 'RUNNING': build.write({'requested_action': False, 'local_state': 'running'}) build._log('wake_up', 'Waking up failed, docker is already running', level='SEPARATOR') elif not os.path.exists(build._path()): @@ -651,15 +651,19 @@ class runbot_build(models.Model): build.local_result = build.triggered_result build._github_status() # failfast # check if current job is finished - if docker_is_running(build._get_docker_name()): + _docker_state = docker_state(build._get_docker_name(), build._path()) + if _docker_state == 'RUNNING': timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000))) if build.local_state != 'running' and build.job_time > timeout: build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time)) build._kill(result='killed') continue - elif build.active_step._is_docker_step() and build.job_time < 15: - _logger.debug('container "%s" seems too take a while to start', build._get_docker_name()) - continue + elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step(): + if build.job_time < 60: + _logger.debug('container "%s" seems too take a while to start', build._get_docker_name()) + continue + else: + build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR') # No job running, make result and select nex job build_values = { 'job_end': now(), diff --git a/runbot/tests/test_schedule.py b/runbot/tests/test_schedule.py index 194a5d4f..f08b3642 100644 --- a/runbot/tests/test_schedule.py +++ b/runbot/tests/test_schedule.py @@ -22,8 +22,8 @@ class TestSchedule(RunbotCase): }) @patch('odoo.addons.runbot.models.build.os.path.getmtime') - @patch('odoo.addons.runbot.models.build.docker_is_running') - def test_schedule_mark_done(self, mock_running, mock_getmtime): + @patch('odoo.addons.runbot.models.build.docker_state') + def test_schedule_mark_done(self, mock_docker_state, mock_getmtime): """ Test that results are set even when job_30_run is skipped """ job_end_time = datetime.datetime.now() mock_getmtime.return_value = job_end_time.timestamp() @@ -41,7 +41,7 @@ class TestSchedule(RunbotCase): domain = [('repo_id', 'in', (self.repo.id, ))] domain_host = domain + [('host', '=', 'runbotxx')] build_ids = self.Build.search(domain_host + [('local_state', 'in', ['testing', 'running'])]) - mock_running.return_value = False + mock_docker_state.return_value = 'RUNNING' self.assertEqual(build.local_state, 'testing') build_ids._schedule() # too fast, docker not started self.assertEqual(build.local_state, 'testing')