[IMP] runbot: add docker state

docker_is_running is ambiguous since we dont know if it was started once.
This new feature tries to add tools to know if a docker was started or not.

The main reason of this is that sometimes docker_run may take more than 15 seconds
creating unpredictable errors on build when the second step is launched and
the previous one is still running. Hopefully this fix will help to solve this
issue and detect late docker run.
This commit is contained in:
Xavier-Do 2019-12-05 13:47:38 +01:00
parent 83651c408e
commit a3abdf8505
3 changed files with 27 additions and 11 deletions

View File

@ -128,7 +128,8 @@ def docker_run(run_cmd, log_path, build_dir, container_name, exposed_ports=None,
cmd_object = Command([], run_cmd.split(' '), []) cmd_object = Command([], run_cmd.split(' '), [])
_logger.debug('Docker run command: %s', run_cmd) _logger.debug('Docker run command: %s', run_cmd)
logs = open(log_path, 'w') logs = open(log_path, 'w')
run_cmd = 'cd /data/build && %s' % run_cmd run_cmd = 'cd /data/build;touch start-%s;%s;cd /data/build;touch end-%s' % (container_name, run_cmd, container_name)
docker_clear_state(container_name, build_dir) # ensure that no state are remaining
logs.write("Docker command:\n%s\n=================================================\n" % cmd_object) logs.write("Docker command:\n%s\n=================================================\n" % cmd_object)
# create start script # create start script
docker_command = [ docker_command = [
@ -173,10 +174,21 @@ def docker_stop(container_name):
dstop = subprocess.run(['docker', 'stop', container_name]) dstop = subprocess.run(['docker', 'stop', container_name])
def docker_is_running(container_name): def docker_is_running(container_name):
"""Return True if container is still running"""
dinspect = subprocess.run(['docker', 'container', 'inspect', container_name], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) dinspect = subprocess.run(['docker', 'container', 'inspect', container_name], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
return True if dinspect.returncode == 0 else False return True if dinspect.returncode == 0 else False
def docker_state(container_name, build_dir):
started = os.path.exists(os.path.join(build_dir, 'start-%s' % container_name))
ended = os.path.exists(os.path.join(build_dir, 'end-%s' % container_name))
return 'END' if ended else 'RUNNING' if started else 'UNKNOWN'
def docker_clear_state(container_name, build_dir):
"""Return True if container is still running"""
if os.path.exists(os.path.join(build_dir, 'start-%s' % container_name)):
os.remove(os.path.join(build_dir, 'start-%s' % container_name))
if os.path.exists(os.path.join(build_dir, 'end-%s' % container_name)):
os.remove(os.path.join(build_dir, 'end-%s' % container_name))
def docker_get_gateway_ip(): def docker_get_gateway_ip():
"""Return the host ip of the docker default bridge gateway""" """Return the host ip of the docker default bridge gateway"""
docker_net_inspect = subprocess.run(['docker', 'network', 'inspect', 'bridge'], stdout=subprocess.PIPE) docker_net_inspect = subprocess.run(['docker', 'network', 'inspect', 'bridge'], stdout=subprocess.PIPE)

View File

@ -9,7 +9,7 @@ import subprocess
import time import time
import datetime import datetime
from ..common import dt2time, fqdn, now, grep, uniq_list, local_pgadmin_cursor, s2human, Commit, dest_reg, os from ..common import dt2time, fqdn, now, grep, uniq_list, local_pgadmin_cursor, s2human, Commit, dest_reg, os
from ..container import docker_build, docker_stop, docker_is_running, Command from ..container import docker_build, docker_stop, docker_state, Command
from odoo.addons.runbot.models.repo import RunbotException from odoo.addons.runbot.models.repo import RunbotException
from odoo import models, fields, api from odoo import models, fields, api
from odoo.exceptions import UserError, ValidationError from odoo.exceptions import UserError, ValidationError
@ -587,7 +587,7 @@ class runbot_build(models.Model):
continue continue
if build.requested_action == 'wake_up': if build.requested_action == 'wake_up':
if docker_is_running(build._get_docker_name()): if docker_state(build._get_docker_name(), build._path()) == 'RUNNING':
build.write({'requested_action': False, 'local_state': 'running'}) build.write({'requested_action': False, 'local_state': 'running'})
build._log('wake_up', 'Waking up failed, docker is already running', level='SEPARATOR') build._log('wake_up', 'Waking up failed, docker is already running', level='SEPARATOR')
elif not os.path.exists(build._path()): elif not os.path.exists(build._path()):
@ -651,15 +651,19 @@ class runbot_build(models.Model):
build.local_result = build.triggered_result build.local_result = build.triggered_result
build._github_status() # failfast build._github_status() # failfast
# check if current job is finished # check if current job is finished
if docker_is_running(build._get_docker_name()): _docker_state = docker_state(build._get_docker_name(), build._path())
if _docker_state == 'RUNNING':
timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000))) timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000)))
if build.local_state != 'running' and build.job_time > timeout: if build.local_state != 'running' and build.job_time > timeout:
build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time)) build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time))
build._kill(result='killed') build._kill(result='killed')
continue continue
elif build.active_step._is_docker_step() and build.job_time < 15: elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step():
_logger.debug('container "%s" seems too take a while to start', build._get_docker_name()) if build.job_time < 60:
continue _logger.debug('container "%s" seems too take a while to start', build._get_docker_name())
continue
else:
build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR')
# No job running, make result and select nex job # No job running, make result and select nex job
build_values = { build_values = {
'job_end': now(), 'job_end': now(),

View File

@ -22,8 +22,8 @@ class TestSchedule(RunbotCase):
}) })
@patch('odoo.addons.runbot.models.build.os.path.getmtime') @patch('odoo.addons.runbot.models.build.os.path.getmtime')
@patch('odoo.addons.runbot.models.build.docker_is_running') @patch('odoo.addons.runbot.models.build.docker_state')
def test_schedule_mark_done(self, mock_running, mock_getmtime): def test_schedule_mark_done(self, mock_docker_state, mock_getmtime):
""" Test that results are set even when job_30_run is skipped """ """ Test that results are set even when job_30_run is skipped """
job_end_time = datetime.datetime.now() job_end_time = datetime.datetime.now()
mock_getmtime.return_value = job_end_time.timestamp() mock_getmtime.return_value = job_end_time.timestamp()
@ -41,7 +41,7 @@ class TestSchedule(RunbotCase):
domain = [('repo_id', 'in', (self.repo.id, ))] domain = [('repo_id', 'in', (self.repo.id, ))]
domain_host = domain + [('host', '=', 'runbotxx')] domain_host = domain + [('host', '=', 'runbotxx')]
build_ids = self.Build.search(domain_host + [('local_state', 'in', ['testing', 'running'])]) build_ids = self.Build.search(domain_host + [('local_state', 'in', ['testing', 'running'])])
mock_running.return_value = False mock_docker_state.return_value = 'RUNNING'
self.assertEqual(build.local_state, 'testing') self.assertEqual(build.local_state, 'testing')
build_ids._schedule() # too fast, docker not started build_ids._schedule() # too fast, docker not started
self.assertEqual(build.local_state, 'testing') self.assertEqual(build.local_state, 'testing')