[REF] runbot: refactor scheduler

2025-06-02 06:38:02 +07:00 · 2019-12-17 11:27:11 +01:00 · 2019-12-17 11:27:11 +01:00 · 426b7af2cb
commit 426b7af2cb
parent ae5f2906bf
9 changed files with 277 additions and 251 deletions
--- a/runbot/container.py
+++ b/runbot/container.py
@ -166,12 +166,13 @@ def docker_run(run_cmd, log_path, build_dir, container_name, exposed_ports=None,
    docker_command.extend(['odoo:runbot_tests', '/bin/bash', '-c', "%s" % run_cmd])
    docker_run = subprocess.Popen(docker_command, stdout=logs, stderr=logs, preexec_fn=preexec_fn, close_fds=False, cwd=build_dir)
    _logger.info('Started Docker container %s', container_name)
-    return docker_run.pid
+    return
 def docker_stop(container_name):
    """Stop the Docker container named ``container_name``.

    Logs the request, then runs ``docker stop <container_name>`` through
    ``subprocess.run``. Nothing is returned to the caller.
    """
    _logger.info('Stopping container %s', container_name)
    # NOTE(review): the CompletedProcess is stored but never inspected, so a
    # non-zero exit status of `docker stop` is not reported from here.
    dstop = subprocess.run(['docker', 'stop', container_name])
    # todo delete os.path.join(build_dir, 'end-%s' % container_name)
 def docker_is_running(container_name):
    dinspect = subprocess.run(['docker', 'container', 'inspect', container_name], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
--- a/runbot/models/build.py
+++ b/runbot/models/build.py
@ -67,7 +67,6 @@ class runbot_build(models.Model):
    nb_running = fields.Integer("Number of test slot use", default=0)
    # should we add a stored field for children results?
    pid = fields.Integer('Pid')
    active_step = fields.Many2one('runbot.build.config.step', 'Active step')
    job = fields.Char('Active step display name', compute='_compute_job')
    job_start = fields.Datetime('Job start')
@ -573,14 +572,40 @@ class runbot_build(models.Model):
        self.ensure_one()
        return '%s_%s' % (self.dest, self.active_step.name)
-    def _schedule(self):
+    def _init_pendings(self, host):
-        """schedule the build"""
+        for build in self:
-        icp = self.env['ir.config_parameter']
+            if build.local_state != 'pending':
-        # For retro-compatibility, keep this parameter in seconds
+                raise UserError("Build %s is not pending" % build.id)
-
+            if build.host != host.name:
                raise UserError("Build %s does not have correct host" % build.id)
            # allocate port and schedule first job
            values = {
                'port': self._find_port(),
                'job_start': now(),
                'build_start': now(),
                'job_end': False,
            }
            values.update(build._next_job_values())
            build.write(values)
            if not build.active_step:
                build._log('_schedule', 'No job in config, doing nothing')
                continue
            try:
                build._log('_schedule', 'Init build environment with config %s ' % build.config_id.name)
                # notify pending build - avoid confusing users by saying nothing
                build._github_status()
                os.makedirs(build._path('logs'), exist_ok=True)
                build._log('_schedule', 'Building docker image')
                docker_build(build._path('logs', 'docker_build.txt'), build._path())
            except Exception:
                _logger.exception('Failed initiating build %s', build.dest)
                build._log('_schedule', 'Failed initiating build')
                build._kill(result='ko')
                continue
            build._run_job()
    def _process_requested_actions(self):
        for build in self:
            self.env.cr.commit()  # commit between each build to minimise transactionnal errors due to state computations
            self.invalidate_cache()
            if build.requested_action == 'deathrow':
                result = None
                if build.local_state != 'running' and build.global_result not in ('warn', 'ko'):
@ -617,97 +642,76 @@ class runbot_build(models.Model):
                        build.write({'requested_action': False, 'local_state': 'done'})
                continue
-            if build.local_state == 'pending':
+    def _schedule(self):
-                # allocate port and schedule first job
+        """schedule the build"""
-                port = self._find_port()
+        icp = self.env['ir.config_parameter']
-                values = {
+        for build in self:
-                    'host': fqdn(), # or ip? of false? 
+            if build.local_state not in ['testing', 'running']:
-                    'port': port,
+                raise UserError("Build %s is not testing/running: %s" % (build.id, build.local_state))
-                    'job_start': now(),
+            if build.local_state == 'testing':
-                    'build_start': now(),
+                # failfast in case of docker error (triggered in database)
-                    'job_end': False,
+                if build.triggered_result and not build.active_step.ignore_triggered_result:
-                }
+                    worst_result = self._get_worst_result([build.triggered_result, build.local_result])
-                values.update(build._next_job_values())
+                    if  worst_result != build.local_result:
-                build.write(values)
+                        build.local_result = build.triggered_result
-                if not build.active_step:
+                        build._github_status()  # failfast
-                    build._log('_schedule', 'No job in config, doing nothing')
+            # check if current job is finished
            _docker_state = docker_state(build._get_docker_name(), build._path())
            if _docker_state == 'RUNNING':
                timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000)))
                if build.local_state != 'running' and build.job_time > timeout:
                    build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time))
                    build._kill(result='killed')
                continue
            elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step():
                if build.job_time < 60:
                    _logger.debug('container "%s" seems too take a while to start', build._get_docker_name())
                    continue
-                try:
+                else:
-                    build._log('_schedule', 'Init build environment with config %s ' % build.config_id.name)
+                    build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR')
-                    # notify pending build - avoid confusing users by saying nothing
+            # No job running, make result and select nex job
-                    build._github_status()
+            build_values = {
-                    os.makedirs(build._path('logs'), exist_ok=True)
+                'job_end': now(),
-                    build._log('_schedule', 'Building docker image')
+            }
-                    docker_build(build._path('logs', 'docker_build.txt'), build._path())
+            # make result of previous job
-                except Exception:
+            try:
-                    _logger.exception('Failed initiating build %s', build.dest)
+                results = build.active_step._make_results(build)
-                    build._log('_schedule', 'Failed initiating build')
+            except Exception as e:
-                    build._kill(result='ko')
+                if isinstance(e, RunbotException):
-                    continue
+                    message = e.args[0]
-            else:  # testing/running build
+                else:
-                if build.local_state == 'testing':
+                    message = 'An error occured while computing results of %s:\n %s' % (build.job, str(e).replace('\\n', '\n').replace("\\'", "'"))
-                    # failfast in case of docker error (triggered in database)
+                    _logger.exception(message)
-                    if build.triggered_result and not build.active_step.ignore_triggered_result:
+                build._log('_make_results', message, level='ERROR')
-                        worst_result = self._get_worst_result([build.triggered_result, build.local_result])
+                results = {'local_result': 'ko'}
                        if  worst_result != build.local_result:
                            build.local_result = build.triggered_result
                            build._github_status()  # failfast
                # check if current job is finished
                _docker_state = docker_state(build._get_docker_name(), build._path())
                if _docker_state == 'RUNNING':
                    timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000)))
                    if build.local_state != 'running' and build.job_time > timeout:
                        build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time))
                        build._kill(result='killed')
                    continue
                elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step():
                    if build.job_time < 60:
                        _logger.debug('container "%s" seems too take a while to start', build._get_docker_name())
                        continue
                    else:
                        build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR')
                # No job running, make result and select nex job
                build_values = {
                    'job_end': now(),
                }
                # make result of previous job
                try:
                    results = build.active_step._make_results(build)
                except Exception as e:
                    if isinstance(e, RunbotException):
                        message = e.args[0]
                    else:
                        message = 'An error occured while computing results of %s:\n %s' % (build.job, str(e).replace('\\n', '\n').replace("\\'", "'"))
                        _logger.exception(message)
                    build._log('_make_results', message, level='ERROR')
                    results = {'local_result': 'ko'}
-                build_values.update(results)
+            build_values.update(results)
-                build.active_step.log_end(build)
+            build.active_step.log_end(build)
-                build_values.update(build._next_job_values())  # find next active_step or set to done
+            build_values.update(build._next_job_values())  # find next active_step or set to done
-                ending_build = build.local_state not in ('done', 'running') and build_values.get('local_state') in ('done', 'running')
+            ending_build = build.local_state not in ('done', 'running') and build_values.get('local_state') in ('done', 'running')
-                if ending_build:
+            if ending_build:
-                    build.update_build_end()
+                build.update_build_end()
-                build.write(build_values)
+            build.write(build_values)
-                if ending_build:
+            if ending_build:
-                    build._github_status()
+                build._github_status()
-                    if not build.local_result:  # Set 'ok' result if no result set (no tests job on build)
+                if not build.local_result:  # Set 'ok' result if no result set (no tests job on build)
-                        build.local_result = 'ok'
+                    build.local_result = 'ok'
-                        build._logger("No result set, setting ok by default")
+                    build._logger("No result set, setting ok by default")
            build._run_job()
-            # run job
+    def _run_job(self):
-            pid = None
+        # run job
        for build in self:
            if build.local_state != 'done':
                build._logger('running %s', build.active_step.name)
                os.makedirs(build._path('logs'), exist_ok=True)
                os.makedirs(build._path('datadir'), exist_ok=True)
                try:
-                    pid = build.active_step._run(build)  # run should be on build?
+                    build.active_step._run(build)  # run should be on build?
                    build.write({'pid': pid})  # no really usefull anymore with dockers
                except Exception as e:
                    if isinstance(e, RunbotException):
                        message = e.args[0]
@ -716,10 +720,6 @@ class runbot_build(models.Model):
                    _logger.exception(message)
                    build._log("run", message, level='ERROR')
                    build._kill(result='ko')
                    continue
        self.env.cr.commit()
        self.invalidate_cache()
    def _path(self, *l, **kw):
        """Return the repo build path"""
@ -844,16 +844,6 @@ class runbot_build(models.Model):
            'line': '0',
        })
    def _reap(self):
        """Reap terminated child processes without blocking.

        Repeatedly calls ``os.wait3(os.WNOHANG)`` and logs the pid and status
        of every child collected, at debug level. The loop stops as soon as
        ``os.wait3`` raises ``OSError`` or reports pid 0. Returns ``None``.
        """
        while True:
            try:
                # WNOHANG: return immediately instead of waiting for a child.
                pid, status, rusage = os.wait3(os.WNOHANG)
            except OSError:
                # presumably raised when there is no child process left to
                # wait for -- nothing more to reap either way.
                break
            if pid == 0:
                # With WNOHANG, pid 0 means no child has exited yet.
                break
            _logger.debug('reaping: pid: %s status: %s', pid, status)
    def _kill(self, result=None):
        host = fqdn()
        for build in self:
--- a/runbot/models/host.py
+++ b/runbot/models/host.py
@ -52,6 +52,10 @@ class RunboHost(models.Model):
        icp = self.env['ir.config_parameter']
        return self.nb_worker or int(icp.sudo().get_param('runbot.runbot_workers', default=6))
    def get_running_max(self):
        """Return the ``runbot.runbot_running_max`` config parameter as an int.

        Falls back to 75 when the parameter is not set.
        """
        icp = self.env['ir.config_parameter']
        return int(icp.get_param('runbot.runbot_running_max', default=75))
    def set_psql_conn_count(self):
        self.ensure_one()
        with local_pgadmin_cursor() as local_cr:
--- a/runbot/models/repo.py
+++ b/runbot/models/repo.py
@ -17,9 +17,11 @@ from odoo.tools.misc import DEFAULT_SERVER_DATETIME_FORMAT
 from odoo import models, fields, api, registry
 from odoo.modules.module import get_module_resource
 from odoo.tools import config
 from odoo.osv import expression
 from ..common import fqdn, dt2time, Commit, dest_reg, os
 from ..container import docker_ps, docker_stop
 from psycopg2.extensions import TransactionRollbackError
 _logger = logging.getLogger(__name__)
 class RunbotException(Exception):
@ -456,97 +458,130 @@ class runbot_repo(models.Model):
            except Exception:
                _logger.exception('Fail to update repo %s', repo.name)
-    @api.multi
+    def _commit(self):
    def _scheduler(self, host=None):
        """Schedule builds for the repository"""
        ids = self.ids
        if not ids:
            return
        icp = self.env['ir.config_parameter']
        host = host or self.env['runbot.host']._get_current()
        workers = host.get_nb_worker()
        running_max = int(icp.get_param('runbot.runbot_running_max', default=75))
        assigned_only = host.assigned_only
        Build = self.env['runbot.build']
        domain = [('repo_id', 'in', ids)]
        domain_host = domain + [('host', '=', host.name)]
        # schedule jobs (transitions testing -> running, kill jobs, ...)
        build_ids = Build.search(domain_host + ['|', ('local_state', 'in', ['testing', 'running']), ('requested_action', 'in', ['wake_up', 'deathrow'])])
        build_ids._schedule()
        self.env.cr.commit()
        self.invalidate_cache()
        self.env.reset()
-        # launch new tests
+    @api.multi
    def _scheduler(self, host):
        nb_workers = host.get_nb_worker()
-        nb_testing = Build.search_count(domain_host + [('local_state', '=', 'testing')])
+        for build in self._get_builds_with_requested_actions(host):
-        available_slots = workers - nb_testing
+            build._process_requested_actions()
-        reserved_slots = Build.search_count(domain_host + [('local_state', '=', 'pending')])
+            self._commit()
-        assignable_slots = (available_slots - reserved_slots) if not assigned_only else 0
+        for build in self._get_builds_to_schedule(host):
-        if available_slots > 0:
+            build._schedule()
-            if assignable_slots > 0:  # note: slots have been addapt to be able to force host on pending build. Normally there is no pending with host.
+            self._commit()
-                # commit transaction to reduce the critical section duration
+        self._assign_pending_builds(host, nb_workers, [('build_type', '!=', 'scheduled')])
-                def allocate_builds(where_clause, limit):
+        self._commit()
-                    self.env.cr.commit()
+        self._assign_pending_builds(host, nb_workers-1 or nb_workers)
-                    self.invalidate_cache()
+        self._commit()
-                    # self-assign to be sure that another runbot instance cannot self assign the same builds
+        for build in self._get_builds_to_init(host):
-                    query = """UPDATE
+            build._init_pendings(host)
-                                    runbot_build
+            self._commit()
-                                SET
+        self._gc_running(host)
-                                    host = %%(host)s
+        self._commit()
-                                WHERE
+        self._reload_nginx()
                                    runbot_build.id IN (
                                        SELECT runbot_build.id
                                        FROM runbot_build
                                        LEFT JOIN runbot_branch
                                        ON runbot_branch.id = runbot_build.branch_id
                                        WHERE
                                            runbot_build.repo_id IN %%(repo_ids)s
                                            AND runbot_build.local_state = 'pending'
                                            AND runbot_build.host IS NULL
                                            %s
                                        ORDER BY
                                            array_position(array['normal','rebuild','indirect','scheduled']::varchar[], runbot_build.build_type) ASC,
                                            runbot_branch.sticky DESC,
                                            runbot_branch.priority DESC,
                                            runbot_build.sequence ASC
                                        FOR UPDATE OF runbot_build SKIP LOCKED
                                        LIMIT %%(limit)s
                                    )
                                RETURNING id""" % where_clause
-                    self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': host.name, 'limit': limit})
+    def build_domain_host(self, host, domain=None):
-                    return self.env.cr.fetchall()
+        domain = domain or []
        return [('repo_id', 'in', self.ids), ('host', '=', host.name)] + domain
-                allocated = allocate_builds("""AND runbot_build.build_type != 'scheduled'""", assignable_slots)
+    def _get_builds_with_requested_actions(self, host):
-                if allocated:
+        return self.env['runbot.build'].search(self.build_domain_host(host, [('requested_action', 'in', ['wake_up', 'deathrow'])]))
                    _logger.debug('Normal builds %s where allocated to runbot' % allocated)
                weak_slot = assignable_slots - len(allocated) - 1
                if weak_slot > 0:
                    allocated = allocate_builds('', weak_slot)
                    if allocated:
                        _logger.debug('Scheduled builds %s where allocated to runbot' % allocated)
-            pending_build = Build.search(domain_host + [('local_state', '=', 'pending')], limit=available_slots)
+    def _get_builds_to_schedule(self, host):
-            if pending_build:
+        return self.env['runbot.build'].search(self.build_domain_host(host, [('local_state', 'in', ['testing', 'running'])]))
                pending_build._schedule()
    def _assign_pending_builds(self, host, nb_workers, domain=None):
        """Assign unallocated pending builds to ``host`` while slots remain.

        :param host: host record; its ``assigned_only`` flag and the domain
            produced by ``build_domain_host(host)`` are used.
        :param nb_workers: number of slots to consider for this host.
        :param domain: optional extra domain, forwarded unchanged to
            ``_allocate_builds``.

        Does nothing when the recordset is empty, when the host is flagged
        ``assigned_only`` or when ``nb_workers`` is not positive.
        """
        if not self.ids or host.assigned_only or nb_workers <= 0:
            return
        domain_host = self.build_domain_host(host)
        # Builds already on this host that are testing or pending count as
        # occupied slots.
        reserved_slots = self.env['runbot.build'].search_count(domain_host + [('local_state', 'in', ('testing', 'pending'))])
        assignable_slots = (nb_workers - reserved_slots)
        if assignable_slots > 0:
            allocated = self._allocate_builds(host, assignable_slots, domain)
            if allocated:
                _logger.debug('Builds %s where allocated to runbot' % allocated)
    def _get_builds_to_init(self, host):
        """Return the pending builds of ``host`` that can be started now.

        The number of free slots is ``host.get_nb_worker()`` minus the number
        of builds of this host currently in ``testing`` state. When no slot
        is free an empty ``runbot.build`` recordset is returned; otherwise
        the search for pending builds is limited to the free slot count.
        """
        domain_host = self.build_domain_host(host)
        used_slots = self.env['runbot.build'].search_count(domain_host + [('local_state', '=', 'testing')])
        available_slots = host.get_nb_worker() - used_slots
        if available_slots <= 0:
            # Empty recordset, so callers can iterate over the result
            # unconditionally.
            return self.env['runbot.build']
        return self.env['runbot.build'].search(domain_host + [('local_state', '=', 'pending')], limit=available_slots)
    def _gc_running(self, host):
        running_max = host.get_running_max()
        # terminate and reap doomed build
-        build_ids = Build.search(domain_host + [('local_state', '=', 'running'), ('keep_running', '!=', True)], order='job_start desc').ids
+        domain_host = self.build_domain_host(host)
-        # sort builds: the last build of each sticky branch then the rest
+        Build = self.env['runbot.build']
-        sticky = {}
+        # some builds are marked as keep running
-        non_sticky = []
+        cannot_be_killed_ids = Build.search(domain_host + [('keep_running', '!=', True)]).ids
-        for build in Build.browse(build_ids):
+        # we want to keep one build running per sticky, no mather which host
-            if build.branch_id.sticky and build.branch_id.id not in sticky:
+        sticky_branches_ids = self.env['runbot.branch'].search([('sticky', '=', True)]).ids
-                sticky[build.branch_id.id] = build.id
+        # search builds on host on sticky branches, order by position in branch history
-            else:
+        if sticky_branches_ids:
-                non_sticky.append(build.id)
+            self.env.cr.execute("""
-        build_ids = list(sticky.values())
+                SELECT
-        build_ids += non_sticky
+                    id
-        # terminate extra running builds
+                FROM (
                    SELECT
                        bu.id AS id,
                        bu.host as host,
                        row_number() OVER (PARTITION BY branch_id order by bu.id desc) AS row
                    FROM
                        runbot_branch br INNER JOIN runbot_build bu ON br.id=bu.branch_id
                    WHERE
                        br.id in %s AND (bu.hidden = 'f' OR bu.hidden IS NULL)
                    ) AS br_bu
                WHERE
                    row <= 4 AND host = %s
                ORDER BY row, id desc
                """, [tuple(sticky_branches_ids), host.name]
            )
            cannot_be_killed_ids += self.env.cr.fetchall()
        cannot_be_killed_ids = cannot_be_killed_ids[:running_max]  # ensure that we don't try to keep more than we can handle
        build_ids = Build.search(domain_host + [('local_state', '=', 'running'), ('id', 'not in', cannot_be_killed_ids)], order='job_start desc').ids
        Build.browse(build_ids)[running_max:]._kill()
-        Build.browse(build_ids)._reap()
+
    def _allocate_builds(self, host, nb_slots, domain=None):
        """Set ``host`` on up to ``nb_slots`` pending builds that have no host.

        :param host: host record whose ``name`` is written on the builds.
        :param nb_slots: maximum number of builds to allocate.
        :param domain: optional extra domain AND-ed with the base filter
            (builds of these repos, ``local_state`` pending, no host).
        :return: ``[]`` when ``nb_slots`` is not positive, otherwise the rows
            fetched from the ``RETURNING id`` clause of the UPDATE.
        """
        if nb_slots <= 0:
            return []
        non_allocated_domain = [('repo_id', 'in', self.ids), ('local_state', '=', 'pending'), ('host', '=', False)]
        if domain:
            non_allocated_domain = expression.AND([non_allocated_domain, domain])
        e = expression.expression(non_allocated_domain, self.env['runbot.build'])
        # The generated WHERE clause is pasted into a hand-written query that
        # only knows runbot_build (plus the explicit branch join), so the
        # domain must not pull in any other table.
        assert e.get_tables() == ['"runbot_build"']
        where_clause, where_params = e.to_sql()

        # self-assign to be sure that another runbot instance cannot self assign the same builds
        # The template goes through Python %-formatting first: the single
        # `%s` receives where_clause, while each `%%s` collapses to `%s` and
        # stays a placeholder for cursor.execute.
        # SKIP LOCKED makes the inner SELECT ignore rows locked by another
        # transaction instead of waiting on them.
        query = """UPDATE
                        runbot_build
                    SET
                        host = %%s
                    WHERE
                        runbot_build.id IN (
                            SELECT runbot_build.id
                            FROM runbot_build
                            LEFT JOIN runbot_branch
                            ON runbot_branch.id = runbot_build.branch_id
                            WHERE
                                %s
                            ORDER BY
                                array_position(array['normal','rebuild','indirect','scheduled']::varchar[], runbot_build.build_type) ASC,
                                runbot_branch.sticky DESC,
                                runbot_branch.priority DESC,
                                runbot_build.sequence ASC
                            FOR UPDATE OF runbot_build SKIP LOCKED
                            LIMIT %%s
                        )
                    RETURNING id""" % where_clause
        # Parameter order follows placeholder order in the query: host name,
        # then the WHERE clause parameters, then the LIMIT.
        self.env.cr.execute(query, [host.name] + where_params + [nb_slots])
        return self.env.cr.fetchall()
    def _domain(self):
        """Return the ``runbot.runbot_domain`` config parameter.

        ``fqdn()`` is evaluated and passed as the default value.
        """
        return self.env.get('ir.config_parameter').get_param('runbot.runbot_domain', fqdn())
@ -613,9 +648,7 @@ class runbot_repo(models.Model):
            repos = self.search([('mode', '!=', 'disabled')])
            repos._update(force=False)
            repos._create_pending_builds()
-
+            self._commit()
            self.env.cr.commit()
            self.invalidate_cache()
            time.sleep(update_frequency)
    def _cron_fetch_and_build(self, hostname):
@ -629,7 +662,8 @@ class runbot_repo(models.Model):
        host = self.env['runbot.host']._get_current()
        host.set_psql_conn_count()
        host.last_start_loop = fields.Datetime.now()
-        self.env.cr.commit()
+        
        self._commit()
        start_time = time.time()
        # 1. source cleanup
        # -> Remove sources when no build is using them
@ -638,53 +672,41 @@ class runbot_repo(models.Model):
        # 2. db and log cleanup
        # -> Keep them as long as possible
        self.env['runbot.build']._local_cleanup()
        # 3. docker cleanup
-        docker_ps_result = docker_ps()
+        self.env['runbot.repo']._docker_cleanup()
        containers = {int(dc.split('-', 1)[0]):dc for dc in docker_ps_result if dest_reg.match(dc)}
        if containers:
            candidates = self.env['runbot.build'].search([('id', 'in', list(containers.keys())), ('local_state', '=', 'done')])
            for c in candidates:
                _logger.info('container %s found running with build state done', containers[c.id])
                docker_stop(containers[c.id])
        ignored = {dc for dc in docker_ps_result if not dest_reg.match(dc)}
        if ignored:
            _logger.debug('docker (%s) not deleted because not dest format', " ".join(list(ignored)))
        timeout = self._get_cron_period()
        icp = self.env['ir.config_parameter']
        update_frequency = int(icp.get_param('runbot.runbot_update_frequency', default=10))
        while time.time() - start_time < timeout:
-            repos = self.search([('mode', '!=', 'disabled')])
+            time.sleep(self._scheduler_loop_turn(host, update_frequency))
            try:
                repos._scheduler(host)
                host.last_success = fields.Datetime.now()
                self.env.cr.commit()
                self.env.reset()
                self = self.env()[self._name]
                self._reload_nginx()
                time.sleep(update_frequency)
            except TransactionRollbackError: # can lead to psycopg2.InternalError'>: "current transaction is aborted, commands ignored until end of transaction block
                _logger.exception('Trying to rollback')
                self.env.cr.rollback()
                self.env.reset()
                time.sleep(random.uniform(0, 3))
            except Exception as e:
                with registry(self._cr.dbname).cursor() as cr:  # user another cursor since transaction will be rollbacked
                    message = str(e)
                    chost = host.with_env(self.env(cr=cr))
                    if chost.last_exception == message:
                        chost.exception_count += 1
                    else:
                        chost.with_env(self.env(cr=cr)).last_exception = str(e)
                        chost.exception_count = 1
                raise
        if host.last_exception:
            host.last_exception = ""
            host.exception_count = 0
        host.last_end_loop = fields.Datetime.now()
    def _scheduler_loop_turn(self, host, default_sleep=1):
        """Run one scheduler pass and return how long the caller should sleep.

        :param host: host record passed to ``_scheduler`` and updated with
            the outcome (``last_success``, ``last_exception``,
            ``exception_count``).
        :param default_sleep: value returned after a successful pass.
        :return: ``default_sleep`` on success, or a random float between
            0 and 3 when the pass raised.
        """
        # Every repository whose mode is not 'disabled' takes part in the pass.
        repos = self.search([('mode', '!=', 'disabled')])
        try:
            repos._scheduler(host)
            host.last_success = fields.Datetime.now()
            self._commit()
        except Exception as e:
            # Drop whatever the failed pass wrote before recording the error.
            self.env.cr.rollback()
            self.env.reset()
            _logger.exception(e)
            message = str(e)
            # Count consecutive occurrences of the same message; a different
            # message restarts the counter at 1.
            if host.last_exception == message:
                host.exception_count += 1
            else:
                host.last_exception = str(e)
                host.exception_count = 1
            self._commit()
            return random.uniform(0, 3)
        else:
            # Successful pass: clear any previously recorded error.
            if host.last_exception:
                host.last_exception = ""
                host.exception_count = 0
            return default_sleep
    def _source_cleanup(self):
        try:
            if self.pool._init:
@ -721,23 +743,34 @@ class runbot_repo(models.Model):
                    assert 'static' in source_dir
                    shutil.rmtree(source_dir)
                _logger.info('%s/%s source folder where deleted (%s kept)' % (len(to_delete), len(to_delete+to_keep), len(to_keep)))
        except:
            _logger.error('An exception occured while cleaning sources')
            pass
-
+    def _docker_cleanup(self):
-    class RefTime(models.Model):
+        docker_ps_result = docker_ps()
-        _name = "runbot.repo.reftime"
+        containers = {int(dc.split('-', 1)[0]):dc for dc in docker_ps_result if dest_reg.match(dc)}
-        _log_access = False
+        if containers:
-
+            candidates = self.env['runbot.build'].search([('id', 'in', list(containers.keys())), ('local_state', '=', 'done')])
-        time = fields.Float('Time', index=True, required=True)
+            for c in candidates:
-        repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
+                _logger.info('container %s found running with build state done', containers[c.id])
                docker_stop(containers[c.id])
        ignored = {dc for dc in docker_ps_result if not dest_reg.match(dc)}
        if ignored:
            _logger.debug('docker (%s) not deleted because not dest format', " ".join(list(ignored)))
-    class HookTime(models.Model):
+class RefTime(models.Model):
-        _name = "runbot.repo.hooktime"
+    _name = "runbot.repo.reftime"
-        _log_access = False
+    _log_access = False
-        time = fields.Float('Time')
+    time = fields.Float('Time', index=True, required=True)
-        repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
+    repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
 # Model "runbot.repo.hooktime": one float `time` value linked to a repository.
 # NOTE(review): presumably stores when a repository hook was last received --
 # confirm against the code that writes these records.
 class HookTime(models.Model):
    _name = "runbot.repo.hooktime"
    _log_access = False
    # Plain float value, not required.
    time = fields.Float('Time')
    # Mandatory link to runbot.repo; ondelete='cascade' is set on the field.
    repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
--- a/runbot/tests/common.py
+++ b/runbot/tests/common.py
@ -36,6 +36,7 @@ class RunbotCase(TransactionCase):
        self.start_patcher('isdir', 'odoo.addons.runbot.common.os.path.isdir', True)
        self.start_patcher('isfile', 'odoo.addons.runbot.common.os.path.isfile', True)
        self.start_patcher('docker_run', 'odoo.addons.runbot.models.build_config.docker_run')
        self.start_patcher('docker_build', 'odoo.addons.runbot.models.build.docker_build')
        self.start_patcher('docker_ps', 'odoo.addons.runbot.models.repo.docker_ps', [])
        self.start_patcher('docker_stop', 'odoo.addons.runbot.models.repo.docker_stop')
--- a/runbot/tests/test_cron.py
+++ b/runbot/tests/test_cron.py
@ -56,7 +56,6 @@ class Test_Cron(RunbotCase):
        ret = self.Repo._cron_fetch_and_build(hostname)
        self.assertEqual(None, ret)
        mock_scheduler.assert_called()
        self.assertTrue(mock_reload.called)
        host = self.env['runbot.host'].search([('name', '=', hostname)])
        self.assertEqual(host.name, hostname, 'A new host should have been created')
        self.assertGreater(host.psql_conn_count, 0, 'A least one connection should exist on the current psql instance')
--- a/runbot/tests/test_repo.py
+++ b/runbot/tests/test_repo.py
@ -257,10 +257,10 @@ class Test_Repo_Scheduler(RunbotCase):
            'name': 'refs/head/foo'
        })
    @patch('odoo.addons.runbot.models.build.runbot_build._reap')
    @patch('odoo.addons.runbot.models.build.runbot_build._kill')
    @patch('odoo.addons.runbot.models.build.runbot_build._schedule')
-    def test_repo_scheduler(self, mock_schedule, mock_kill, mock_reap):
+    @patch('odoo.addons.runbot.models.build.runbot_build._init_pendings')
    def test_repo_scheduler(self, mock_init_pendings, mock_schedule, mock_kill):
        self.env['ir.config_parameter'].set_param('runbot.runbot_workers', 6)
        builds = []
        # create 6 builds that are testing on the host to verify that
@ -293,8 +293,8 @@ class Test_Repo_Scheduler(RunbotCase):
            'local_state': 'pending',
        })
        builds.append(build)
-
+        host = self.env['runbot.host']._get_current()
-        self.foo_repo._scheduler()
+        self.foo_repo._scheduler(host)
        build.invalidate_cache()
        scheduled_build.invalidate_cache()
@ -304,7 +304,7 @@ class Test_Repo_Scheduler(RunbotCase):
        # give some room for the pending build
        self.Build.search([('name', '=', 'a')]).write({'local_state': 'done'})
-        self.foo_repo._scheduler()
+        self.foo_repo._scheduler(host)
        build.invalidate_cache()
        scheduled_build.invalidate_cache()
        self.assertEqual(build.host, 'host.runbot.com')
--- a/runbot/views/build_views.xml
+++ b/runbot/views/build_views.xml
@ -22,7 +22,6 @@
                        <field name="local_result"/>
                        <field name="global_result"/>
                        <field name="triggered_result" groups="base.group_no_one"/>
                        <field name="pid"/>
                        <field name="host"/>
                        <field name="job_start" groups="base.group_no_one"/>
                        <field name="job_end" groups="base.group_no_one"/>
@ -58,7 +57,6 @@
                <field name="port"/>
                <field name="job"/>
                <field name="coverage_result"/>
                <field name="pid"/>
                <field name="host"/>
                <field name="build_time"/>
                <field name="build_age"/>
--- a/runbot_builder/builder.py
+++ b/runbot_builder/builder.py
@ -18,10 +18,8 @@ _logger = logging.getLogger(__name__)
 class RunbotClient():
-    def __init__(self, env, args):
+    def __init__(self, env):
        self.env = env
        self.args = args
        self.fqdn = socket.getfqdn()
        self.ask_interrupt = threading.Event()
    def main_loop(self):
@ -31,15 +29,17 @@ class RunbotClient():
        host = self.env['runbot.host']._get_current()
        count = 0
        while True:
            host.last_start_loop = fields.Datetime.now()
            count = count % 60
            if count == 0:
                logging.info('Host %s running with %s slots on pid %s%s', host.name, host.get_nb_worker(), os.getpid(), ' (assigned only)' if host.assigned_only else '')
                self.env['runbot.repo']._source_cleanup()
                self.env['runbot.build']._local_cleanup()
-                host.last_end_loop = host.last_start_loop = fields.Datetime.now()
+                self.env['runbot.repo']._docker_cleanup()
                host.set_psql_conn_count()
            count += 1
            sleep_time = self.env['runbot.repo']._scheduler_loop_turn(host)
            host.last_end_loop = fields.Datetime.now()
            self.env.cr.commit()
            self.env.reset()
            self.sleep(sleep_time)
@ -97,7 +97,7 @@ def run():
    with odoo.api.Environment.manage():
        with registry.cursor() as cr:
            env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {})
-            runbot_client = RunbotClient(env, args)
+            runbot_client = RunbotClient(env)
            # run main loop
            runbot_client.main_loop()