[REF] runbot: refactor scheduler

2025-05-31 21:58:00 +07:00 · 2019-12-17 11:27:11 +01:00 · 2019-12-17 11:27:11 +01:00 · 426b7af2cb
commit 426b7af2cb
parent ae5f2906bf
9 changed files with 277 additions and 251 deletions
--- a/runbot/container.py
+++ b/runbot/container.py
@ -166,12 +166,13 @@ def docker_run(run_cmd, log_path, build_dir, container_name, exposed_ports=None,
    docker_command.extend(['odoo:runbot_tests', '/bin/bash', '-c', "%s" % run_cmd])
    docker_run = subprocess.Popen(docker_command, stdout=logs, stderr=logs, preexec_fn=preexec_fn, close_fds=False, cwd=build_dir)
    _logger.info('Started Docker container %s', container_name)
-    return docker_run.pid
+    return

 def docker_stop(container_name):
    """Stops the container named container_name"""
    _logger.info('Stopping container %s', container_name)
    dstop = subprocess.run(['docker', 'stop', container_name])
+    # todo delete os.path.join(build_dir, 'end-%s' % container_name)

 def docker_is_running(container_name):
    dinspect = subprocess.run(['docker', 'container', 'inspect', container_name], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
--- a/runbot/models/build.py
+++ b/runbot/models/build.py
@ -67,7 +67,6 @@ class runbot_build(models.Model):
    nb_running = fields.Integer("Number of test slot use", default=0)

    # should we add a stored field for children results?
-    pid = fields.Integer('Pid')
    active_step = fields.Many2one('runbot.build.config.step', 'Active step')
    job = fields.Char('Active step display name', compute='_compute_job')
    job_start = fields.Datetime('Job start')
@ -573,14 +572,40 @@ class runbot_build(models.Model):
        self.ensure_one()
        return '%s_%s' % (self.dest, self.active_step.name)

-    def _schedule(self):
-        """schedule the build"""
-        icp = self.env['ir.config_parameter']
-        # For retro-compatibility, keep this parameter in seconds
-
+    def _init_pendings(self, host):
+        """Initialise pending builds assigned to `host` and start their first job.
+
+        For each build: allocate a port, reset the job/build timestamps, select
+        the first step through `_next_job_values()`, build the docker image and
+        finally call `_run_job()`.
+
+        :param host: host record; `host.name` must match `build.host`
+        :raises UserError: if a build is not `pending` or is assigned to another host
+        """
+        for build in self:
+            if build.local_state != 'pending':
+                raise UserError("Build %s is not pending" % build.id)
+            if build.host != host.name:
+                raise UserError("Build %s does not have correct host" % build.id)
+            # allocate port and schedule first job
+            values = {
+                'port': self._find_port(),
+                'job_start': now(),
+                'build_start': now(),
+                'job_end': False,
+            }
+            values.update(build._next_job_values())
+            build.write(values)
+            # _next_job_values() may not have selected any step: nothing to run then
+            if not build.active_step:
+                build._log('_schedule', 'No job in config, doing nothing')
+                continue
+            try:
+                build._log('_schedule', 'Init build environment with config %s ' % build.config_id.name)
+                # notify pending build - avoid confusing users by saying nothing
+                build._github_status()
+                os.makedirs(build._path('logs'), exist_ok=True)
+                build._log('_schedule', 'Building docker image')
+                docker_build(build._path('logs', 'docker_build.txt'), build._path())
+            except Exception:
+                # any failure while preparing the environment ends the build as 'ko'
+                _logger.exception('Failed initiating build %s', build.dest)
+                build._log('_schedule', 'Failed initiating build')
+                build._kill(result='ko')
+                continue
+            build._run_job()
+
+    def _process_requested_actions(self):
        for build in self:
-            self.env.cr.commit()  # commit between each build to minimise transactionnal errors due to state computations
-            self.invalidate_cache()
            if build.requested_action == 'deathrow':
                result = None
                if build.local_state != 'running' and build.global_result not in ('warn', 'ko'):
@ -617,97 +642,76 @@ class runbot_build(models.Model):
                        build.write({'requested_action': False, 'local_state': 'done'})
                continue

-            if build.local_state == 'pending':
-                # allocate port and schedule first job
-                port = self._find_port()
-                values = {
-                    'host': fqdn(), # or ip? of false? 
-                    'port': port,
-                    'job_start': now(),
-                    'build_start': now(),
-                    'job_end': False,
-                }
-                values.update(build._next_job_values())
-                build.write(values)
-                if not build.active_step:
-                    build._log('_schedule', 'No job in config, doing nothing')
+    def _schedule(self):
+        """Advance testing/running builds to their next step when the current one is over.
+
+        For each build: apply the fail-fast on `triggered_result`, check the
+        state of the docker container of the active step, kill the build on
+        timeout, otherwise compute the results of the finished step, select the
+        next step with `_next_job_values()` and start it through `_run_job()`.
+
+        :raises UserError: if a build is neither `testing` nor `running`
+        """
+        icp = self.env['ir.config_parameter']
+        for build in self:
+            if build.local_state not in ['testing', 'running']:
+                raise UserError("Build %s is not testing/running: %s" % (build.id, build.local_state))
+            if build.local_state == 'testing':
+                # failfast in case of docker error (triggered in database)
+                if build.triggered_result and not build.active_step.ignore_triggered_result:
+                    worst_result = self._get_worst_result([build.triggered_result, build.local_result])
+                    if  worst_result != build.local_result:
+                        build.local_result = build.triggered_result
+                        build._github_status()  # failfast
+            # check if current job is finished
+            _docker_state = docker_state(build._get_docker_name(), build._path())
+            if _docker_state == 'RUNNING':
+                # the timeout only applies to builds that are not in the 'running' state
+                timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000)))
+                if build.local_state != 'running' and build.job_time > timeout:
+                    build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time))
+                    build._kill(result='killed')
+                continue
+            elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step():
+                # give the container up to 60 seconds to show up before giving up on it
+                if build.job_time < 60:
+                    _logger.debug('container "%s" seems too take a while to start', build._get_docker_name())
                    continue
-                try:
-                    build._log('_schedule', 'Init build environment with config %s ' % build.config_id.name)
-                    # notify pending build - avoid confusing users by saying nothing
-                    build._github_status()
-                    os.makedirs(build._path('logs'), exist_ok=True)
-                    build._log('_schedule', 'Building docker image')
-                    docker_build(build._path('logs', 'docker_build.txt'), build._path())
-                except Exception:
-                    _logger.exception('Failed initiating build %s', build.dest)
-                    build._log('_schedule', 'Failed initiating build')
-                    build._kill(result='ko')
-                    continue
-            else:  # testing/running build
-                if build.local_state == 'testing':
-                    # failfast in case of docker error (triggered in database)
-                    if build.triggered_result and not build.active_step.ignore_triggered_result:
-                        worst_result = self._get_worst_result([build.triggered_result, build.local_result])
-                        if  worst_result != build.local_result:
-                            build.local_result = build.triggered_result
-                            build._github_status()  # failfast
-                # check if current job is finished
-                _docker_state = docker_state(build._get_docker_name(), build._path())
-                if _docker_state == 'RUNNING':
-                    timeout = min(build.active_step.cpu_limit, int(icp.get_param('runbot.runbot_timeout', default=10000)))
-                    if build.local_state != 'running' and build.job_time > timeout:
-                        build._log('_schedule', '%s time exceeded (%ss)' % (build.active_step.name if build.active_step else "?", build.job_time))
-                        build._kill(result='killed')
-                    continue
-                elif _docker_state == 'UNKNOWN' and build.active_step._is_docker_step():
-                    if build.job_time < 60:
-                        _logger.debug('container "%s" seems too take a while to start', build._get_docker_name())
-                        continue
-                    else:
-                        build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR')
-                # No job running, make result and select nex job
-                build_values = {
-                    'job_end': now(),
-                }
-                # make result of previous job
-                try:
-                    results = build.active_step._make_results(build)
-                except Exception as e:
-                    if isinstance(e, RunbotException):
-                        message = e.args[0]
-                    else:
-                        message = 'An error occured while computing results of %s:\n %s' % (build.job, str(e).replace('\\n', '\n').replace("\\'", "'"))
-                        _logger.exception(message)
-                    build._log('_make_results', message, level='ERROR')
-                    results = {'local_result': 'ko'}
+                else:
+                    build._log('_schedule', 'Docker not started after 60 seconds, skipping', level='ERROR')
+            # No job running, make result and select next job
+            build_values = {
+                'job_end': now(),
+            }
+            # make result of previous job
+            try:
+                results = build.active_step._make_results(build)
+            except Exception as e:
+                if isinstance(e, RunbotException):
+                    message = e.args[0]
+                else:
+                    message = 'An error occured while computing results of %s:\n %s' % (build.job, str(e).replace('\\n', '\n').replace("\\'", "'"))
+                    _logger.exception(message)
+                build._log('_make_results', message, level='ERROR')
+                results = {'local_result': 'ko'}

-                build_values.update(results)
+            build_values.update(results)

-                build.active_step.log_end(build)
+            build.active_step.log_end(build)

-                build_values.update(build._next_job_values())  # find next active_step or set to done
+            build_values.update(build._next_job_values())  # find next active_step or set to done

-                ending_build = build.local_state not in ('done', 'running') and build_values.get('local_state') in ('done', 'running')
-                if ending_build:
-                    build.update_build_end()
+            # the build "ends" when it leaves the testing phase (next state is done or running)
+            ending_build = build.local_state not in ('done', 'running') and build_values.get('local_state') in ('done', 'running')
+            if ending_build:
+                build.update_build_end()

-                build.write(build_values)
-                if ending_build:
-                    build._github_status()
-                    if not build.local_result:  # Set 'ok' result if no result set (no tests job on build)
-                        build.local_result = 'ok'
-                        build._logger("No result set, setting ok by default")
+            build.write(build_values)
+            if ending_build:
+                build._github_status()
+                if not build.local_result:  # Set 'ok' result if no result set (no tests job on build)
+                    build.local_result = 'ok'
+                    build._logger("No result set, setting ok by default")
+            build._run_job()

-            # run job
-            pid = None
+    def _run_job(self):
+        # run job
+        for build in self:
            if build.local_state != 'done':
                build._logger('running %s', build.active_step.name)
                os.makedirs(build._path('logs'), exist_ok=True)
                os.makedirs(build._path('datadir'), exist_ok=True)
                try:
-                    pid = build.active_step._run(build)  # run should be on build?
-                    build.write({'pid': pid})  # no really usefull anymore with dockers
+                    build.active_step._run(build)  # run should be on build?
                except Exception as e:
                    if isinstance(e, RunbotException):
                        message = e.args[0]
@ -716,10 +720,6 @@ class runbot_build(models.Model):
                    _logger.exception(message)
                    build._log("run", message, level='ERROR')
                    build._kill(result='ko')
-                    continue
-
-        self.env.cr.commit()
-        self.invalidate_cache()

    def _path(self, *l, **kw):
        """Return the repo build path"""
@ -844,16 +844,6 @@ class runbot_build(models.Model):
            'line': '0',
        })

-    def _reap(self):
-        while True:
-            try:
-                pid, status, rusage = os.wait3(os.WNOHANG)
-            except OSError:
-                break
-            if pid == 0:
-                break
-            _logger.debug('reaping: pid: %s status: %s', pid, status)
-
    def _kill(self, result=None):
        host = fqdn()
        for build in self:
--- a/runbot/models/host.py
+++ b/runbot/models/host.py
@ -52,6 +52,10 @@ class RunboHost(models.Model):
        icp = self.env['ir.config_parameter']
        return self.nb_worker or int(icp.sudo().get_param('runbot.runbot_workers', default=6))

+    def get_running_max(self):
+        """Return the `runbot.runbot_running_max` config parameter as an int (default 75)."""
+        icp = self.env['ir.config_parameter']
+        return int(icp.get_param('runbot.runbot_running_max', default=75))
+
    def set_psql_conn_count(self):
        self.ensure_one()
        with local_pgadmin_cursor() as local_cr:
--- a/runbot/models/repo.py
+++ b/runbot/models/repo.py
@ -17,9 +17,11 @@ from odoo.tools.misc import DEFAULT_SERVER_DATETIME_FORMAT
 from odoo import models, fields, api, registry
 from odoo.modules.module import get_module_resource
 from odoo.tools import config
+from odoo.osv import expression
 from ..common import fqdn, dt2time, Commit, dest_reg, os
 from ..container import docker_ps, docker_stop
 from psycopg2.extensions import TransactionRollbackError
+
 _logger = logging.getLogger(__name__)

 class RunbotException(Exception):
@ -456,97 +458,130 @@ class runbot_repo(models.Model):
            except Exception:
                _logger.exception('Fail to update repo %s', repo.name)

-    @api.multi
-    def _scheduler(self, host=None):
-        """Schedule builds for the repository"""
-        ids = self.ids
-        if not ids:
-            return
-        icp = self.env['ir.config_parameter']
-        host = host or self.env['runbot.host']._get_current()
-        workers = host.get_nb_worker()
-        running_max = int(icp.get_param('runbot.runbot_running_max', default=75))
-        assigned_only = host.assigned_only
-
-        Build = self.env['runbot.build']
-        domain = [('repo_id', 'in', ids)]
-        domain_host = domain + [('host', '=', host.name)]
-
-        # schedule jobs (transitions testing -> running, kill jobs, ...)
-        build_ids = Build.search(domain_host + ['|', ('local_state', 'in', ['testing', 'running']), ('requested_action', 'in', ['wake_up', 'deathrow'])])
-        build_ids._schedule()
+    def _commit(self):
+        """Commit the current transaction, then invalidate the cache and reset the environment."""
        self.env.cr.commit()
        self.invalidate_cache()
+        self.env.reset()

-        # launch new tests
+    @api.multi
+    def _scheduler(self, host):
+        """Run one scheduling pass for the builds of these repos on `host`.
+
+        Steps, each followed by a commit: process requested actions, advance
+        testing/running builds, assign pending builds to the host, initialise
+        the pending builds of the host, then garbage-collect running builds.
+        Nginx is reloaded at the end.
+
+        :param host: host record the scheduler is running on
+        """
+        nb_workers = host.get_nb_worker()

-        nb_testing = Build.search_count(domain_host + [('local_state', '=', 'testing')])
-        available_slots = workers - nb_testing
-        reserved_slots = Build.search_count(domain_host + [('local_state', '=', 'pending')])
-        assignable_slots = (available_slots - reserved_slots) if not assigned_only else 0
-        if available_slots > 0:
-            if assignable_slots > 0:  # note: slots have been addapt to be able to force host on pending build. Normally there is no pending with host.
-                # commit transaction to reduce the critical section duration
-                def allocate_builds(where_clause, limit):
-                    self.env.cr.commit()
-                    self.invalidate_cache()
-                    # self-assign to be sure that another runbot instance cannot self assign the same builds
-                    query = """UPDATE
-                                    runbot_build
-                                SET
-                                    host = %%(host)s
-                                WHERE
-                                    runbot_build.id IN (
-                                        SELECT runbot_build.id
-                                        FROM runbot_build
-                                        LEFT JOIN runbot_branch
-                                        ON runbot_branch.id = runbot_build.branch_id
-                                        WHERE
-                                            runbot_build.repo_id IN %%(repo_ids)s
-                                            AND runbot_build.local_state = 'pending'
-                                            AND runbot_build.host IS NULL
-                                            %s
-                                        ORDER BY
-                                            array_position(array['normal','rebuild','indirect','scheduled']::varchar[], runbot_build.build_type) ASC,
-                                            runbot_branch.sticky DESC,
-                                            runbot_branch.priority DESC,
-                                            runbot_build.sequence ASC
-                                        FOR UPDATE OF runbot_build SKIP LOCKED
-                                        LIMIT %%(limit)s
-                                    )
-                                RETURNING id""" % where_clause
+        # commit after each build so that a failure only rolls back the build being processed
+        for build in self._get_builds_with_requested_actions(host):
+            build._process_requested_actions()
+            self._commit()
+        for build in self._get_builds_to_schedule(host):
+            build._schedule()
+            self._commit()
+        # first pass: every build type except 'scheduled' may use all the workers
+        self._assign_pending_builds(host, nb_workers, [('build_type', '!=', 'scheduled')])
+        self._commit()
+        # second pass: any build type, with one worker less unless the host has a single worker
+        self._assign_pending_builds(host, nb_workers-1 or nb_workers)
+        self._commit()
+        for build in self._get_builds_to_init(host):
+            build._init_pendings(host)
+            self._commit()
+        self._gc_running(host)
+        self._commit()
+        self._reload_nginx()

-                    self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': host.name, 'limit': limit})
-                    return self.env.cr.fetchall()
+    def build_domain_host(self, host, domain=None):
+        """Return a build search domain limited to these repos and to `host`.
+
+        :param host: host record, matched on `host.name`
+        :param domain: optional extra domain leaves appended to the result
+        """
+        domain = domain or []
+        return [('repo_id', 'in', self.ids), ('host', '=', host.name)] + domain

-                allocated = allocate_builds("""AND runbot_build.build_type != 'scheduled'""", assignable_slots)
-                if allocated:
-                    _logger.debug('Normal builds %s where allocated to runbot' % allocated)
-                weak_slot = assignable_slots - len(allocated) - 1
-                if weak_slot > 0:
-                    allocated = allocate_builds('', weak_slot)
-                    if allocated:
-                        _logger.debug('Scheduled builds %s where allocated to runbot' % allocated)
+    def _get_builds_with_requested_actions(self, host):
+        """Return the builds on `host` whose `requested_action` is `wake_up` or `deathrow`."""
+        return self.env['runbot.build'].search(self.build_domain_host(host, [('requested_action', 'in', ['wake_up', 'deathrow'])]))

-            pending_build = Build.search(domain_host + [('local_state', '=', 'pending')], limit=available_slots)
-            if pending_build:
-                pending_build._schedule()
+    def _get_builds_to_schedule(self, host):
+        """Return the builds on `host` that are in the `testing` or `running` state."""
+        return self.env['runbot.build'].search(self.build_domain_host(host, [('local_state', 'in', ['testing', 'running'])]))

+    def _assign_pending_builds(self, host, nb_workers, domain=None):
+        """Assign unallocated pending builds to `host` while it has free slots.
+
+        Does nothing for an empty recordset, when `host.assigned_only` is set or
+        when `nb_workers` is not positive.
+
+        :param host: host record receiving the builds
+        :param nb_workers: number of slots to consider for this pass
+        :param domain: optional extra domain restricting the candidate builds
+        """
+        if not self.ids or host.assigned_only or nb_workers <= 0:
+            return
+        domain_host = self.build_domain_host(host)
+        # builds already testing or pending on this host occupy (or have reserved) a slot
+        reserved_slots = self.env['runbot.build'].search_count(domain_host + [('local_state', 'in', ('testing', 'pending'))])
+        assignable_slots = (nb_workers - reserved_slots)
+        if assignable_slots > 0:
+            allocated = self._allocate_builds(host, assignable_slots, domain)
+            if allocated:
+                _logger.debug('Builds %s where allocated to runbot' % allocated)
+
+    def _get_builds_to_init(self, host):
+        """Return the pending builds of `host` that can be started now.
+
+        The result is limited to the free worker slots, i.e.
+        `host.get_nb_worker()` minus the number of builds currently testing on
+        the host; an empty recordset is returned when no slot is free.
+        """
+        domain_host = self.build_domain_host(host)
+        used_slots = self.env['runbot.build'].search_count(domain_host + [('local_state', '=', 'testing')])
+        available_slots = host.get_nb_worker() - used_slots
+        if available_slots <= 0:
+            return self.env['runbot.build']
+        return self.env['runbot.build'].search(domain_host + [('local_state', '=', 'pending')], limit=available_slots)
+
+    def _gc_running(self, host):
+        """Kill the surplus `running` builds of these repos on `host`.
+
+        Builds flagged `keep_running` and the latest builds of sticky branches
+        located on `host` are protected. The remaining running builds are
+        ordered by `job_start desc` and every build past the first
+        `running_max` (see `host.get_running_max()`) is killed.
+
+        :param host: host record whose running builds are collected
+        """
+        running_max = host.get_running_max()
        # terminate and reap doomed build
-        build_ids = Build.search(domain_host + [('local_state', '=', 'running'), ('keep_running', '!=', True)], order='job_start desc').ids
-        # sort builds: the last build of each sticky branch then the rest
-        sticky = {}
-        non_sticky = []
-        for build in Build.browse(build_ids):
-            if build.branch_id.sticky and build.branch_id.id not in sticky:
-                sticky[build.branch_id.id] = build.id
-            else:
-                non_sticky.append(build.id)
-        build_ids = list(sticky.values())
-        build_ids += non_sticky
-        # terminate extra running builds
+        domain_host = self.build_domain_host(host)
+        Build = self.env['runbot.build']
+        # builds explicitly marked as keep running must never be killed here
+        cannot_be_killed_ids = Build.search(domain_host + [('keep_running', '=', True)]).ids
+        # we also want to keep recent builds of each sticky branch, no matter which host
+        sticky_branches_ids = self.env['runbot.branch'].search([('sticky', '=', True)]).ids
+        # search builds on host on sticky branches, order by position in branch history
+        # (the query ranks the non-hidden builds of each sticky branch, keeps the 4 latest, then filters on host)
+        if sticky_branches_ids:
+            self.env.cr.execute("""
+                SELECT
+                    id
+                FROM (
+                    SELECT
+                        bu.id AS id,
+                        bu.host as host,
+                        row_number() OVER (PARTITION BY branch_id order by bu.id desc) AS row
+                    FROM
+                        runbot_branch br INNER JOIN runbot_build bu ON br.id=bu.branch_id
+                    WHERE
+                        br.id in %s AND (bu.hidden = 'f' OR bu.hidden IS NULL)
+                    ) AS br_bu
+                WHERE
+                    row <= 4 AND host = %s
+                ORDER BY row, id desc
+                """, [tuple(sticky_branches_ids), host.name]
+            )
+            # fetchall() returns one-column rows: unpack them so the list only holds plain ids
+            cannot_be_killed_ids += [row[0] for row in self.env.cr.fetchall()]
+        cannot_be_killed_ids = cannot_be_killed_ids[:running_max]  # ensure that we don't try to keep more than we can handle
+
+        # NOTE(review): protected builds are not deducted from running_max in the slice below -- confirm this is intended
+        build_ids = Build.search(domain_host + [('local_state', '=', 'running'), ('id', 'not in', cannot_be_killed_ids)], order='job_start desc').ids
        Build.browse(build_ids)[running_max:]._kill()
-        Build.browse(build_ids)._reap()
+
+    def _allocate_builds(self, host, nb_slots, domain=None):
+        """Assign up to `nb_slots` unallocated pending builds of these repos to `host`.
+
+        :param host: host record; `host.name` is written on the selected builds
+        :param nb_slots: maximum number of builds to allocate
+        :param domain: optional extra domain restricting the candidate builds
+        :return: rows returned by `RETURNING id` (empty list when `nb_slots` <= 0)
+        """
+        if nb_slots <= 0:
+            return []
+        non_allocated_domain = [('repo_id', 'in', self.ids), ('local_state', '=', 'pending'), ('host', '=', False)]
+        if domain:
+            non_allocated_domain = expression.AND([non_allocated_domain, domain])
+        # turn the domain into a SQL where clause; it must only involve the runbot_build table
+        e = expression.expression(non_allocated_domain, self.env['runbot.build'])
+        assert e.get_tables() == ['"runbot_build"']
+        where_clause, where_params = e.to_sql()
+
+        # self-assign to be sure that another runbot instance cannot self assign the same builds
+        # (`%%s` survives the `% where_clause` formatting below as a `%s` query parameter)
+        query = """UPDATE
+                        runbot_build
+                    SET
+                        host = %%s
+                    WHERE
+                        runbot_build.id IN (
+                            SELECT runbot_build.id
+                            FROM runbot_build
+                            LEFT JOIN runbot_branch
+                            ON runbot_branch.id = runbot_build.branch_id
+                            WHERE
+                                %s
+                            ORDER BY
+                                array_position(array['normal','rebuild','indirect','scheduled']::varchar[], runbot_build.build_type) ASC,
+                                runbot_branch.sticky DESC,
+                                runbot_branch.priority DESC,
+                                runbot_build.sequence ASC
+                            FOR UPDATE OF runbot_build SKIP LOCKED
+                            LIMIT %%s
+                        )
+                    RETURNING id""" % where_clause
+        # parameter order follows the placeholders: host, where clause values, limit
+        self.env.cr.execute(query, [host.name] + where_params + [nb_slots])
+        return self.env.cr.fetchall()

    def _domain(self):
        return self.env.get('ir.config_parameter').get_param('runbot.runbot_domain', fqdn())
@ -613,9 +648,7 @@ class runbot_repo(models.Model):
            repos = self.search([('mode', '!=', 'disabled')])
            repos._update(force=False)
            repos._create_pending_builds()
-
-            self.env.cr.commit()
-            self.invalidate_cache()
+            self._commit()
            time.sleep(update_frequency)

    def _cron_fetch_and_build(self, hostname):
@ -629,7 +662,8 @@ class runbot_repo(models.Model):
        host = self.env['runbot.host']._get_current()
        host.set_psql_conn_count()
        host.last_start_loop = fields.Datetime.now()
-        self.env.cr.commit()
+        
+        self._commit()
        start_time = time.time()
        # 1. source cleanup
        # -> Remove sources when no build is using them
@ -638,53 +672,41 @@ class runbot_repo(models.Model):
        # 2. db and log cleanup
        # -> Keep them as long as possible
        self.env['runbot.build']._local_cleanup()
-
        # 3. docker cleanup
-        docker_ps_result = docker_ps()
-        containers = {int(dc.split('-', 1)[0]):dc for dc in docker_ps_result if dest_reg.match(dc)}
-        if containers:
-            candidates = self.env['runbot.build'].search([('id', 'in', list(containers.keys())), ('local_state', '=', 'done')])
-            for c in candidates:
-                _logger.info('container %s found running with build state done', containers[c.id])
-                docker_stop(containers[c.id])
-        ignored = {dc for dc in docker_ps_result if not dest_reg.match(dc)}
-        if ignored:
-            _logger.debug('docker (%s) not deleted because not dest format', " ".join(list(ignored)))
+        self.env['runbot.repo']._docker_cleanup()

        timeout = self._get_cron_period()
        icp = self.env['ir.config_parameter']
        update_frequency = int(icp.get_param('runbot.runbot_update_frequency', default=10))
        while time.time() - start_time < timeout:
-            repos = self.search([('mode', '!=', 'disabled')])
-            try:
-                repos._scheduler(host)
-                host.last_success = fields.Datetime.now()
-                self.env.cr.commit()
-                self.env.reset()
-                self = self.env()[self._name]
-                self._reload_nginx()
-                time.sleep(update_frequency)
-            except TransactionRollbackError: # can lead to psycopg2.InternalError'>: "current transaction is aborted, commands ignored until end of transaction block
-                _logger.exception('Trying to rollback')
-                self.env.cr.rollback()
-                self.env.reset()
-                time.sleep(random.uniform(0, 3))
-            except Exception as e:
-                with registry(self._cr.dbname).cursor() as cr:  # user another cursor since transaction will be rollbacked
-                    message = str(e)
-                    chost = host.with_env(self.env(cr=cr))
-                    if chost.last_exception == message:
-                        chost.exception_count += 1
-                    else:
-                        chost.with_env(self.env(cr=cr)).last_exception = str(e)
-                        chost.exception_count = 1
-                raise
+            time.sleep(self._scheduler_loop_turn(host, update_frequency))

-        if host.last_exception:
-            host.last_exception = ""
-            host.exception_count = 0
        host.last_end_loop = fields.Datetime.now()

+    def _scheduler_loop_turn(self, host, default_sleep=1):
+        """Run the scheduler once on all non-disabled repos and return a sleep duration.
+
+        :param host: host record passed to `_scheduler`; also stores the last exception
+        :param default_sleep: value returned when the turn succeeds
+        :return: `default_sleep` on success, a random value between 0 and 3 after an exception
+        """
+        repos = self.search([('mode', '!=', 'disabled')])
+        try:
+            repos._scheduler(host)
+            host.last_success = fields.Datetime.now()
+            self._commit()
+        except Exception as e:
+            # drop the failed transaction before recording the error on the host
+            self.env.cr.rollback()
+            self.env.reset()
+            _logger.exception(e)
+            message = str(e)
+            # count consecutive occurrences of the same exception message
+            if host.last_exception == message:
+                host.exception_count += 1
+            else:
+                host.last_exception = str(e)
+                host.exception_count = 1
+            self._commit()
+            return random.uniform(0, 3)
+        else:
+            # a successful turn clears the previously recorded exception
+            if host.last_exception:
+                host.last_exception = ""
+                host.exception_count = 0
+            return default_sleep
+
    def _source_cleanup(self):
        try:
            if self.pool._init:
@ -721,23 +743,34 @@ class runbot_repo(models.Model):
                    assert 'static' in source_dir
                    shutil.rmtree(source_dir)
                _logger.info('%s/%s source folder where deleted (%s kept)' % (len(to_delete), len(to_delete+to_keep), len(to_keep)))
-
        except:
            _logger.error('An exception occured while cleaning sources')
            pass

-
-    class RefTime(models.Model):
-        _name = "runbot.repo.reftime"
-        _log_access = False
-
-        time = fields.Float('Time', index=True, required=True)
-        repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
+    def _docker_cleanup(self):
+        """Stop the containers listed by `docker_ps()` whose build is in the `done` state.
+
+        Only container names matching `dest_reg` are considered; the others are
+        reported in a debug log and left untouched.
+        """
+        docker_ps_result = docker_ps()
+        # map build id (numeric prefix before the first '-') to container name
+        containers = {int(dc.split('-', 1)[0]):dc for dc in docker_ps_result if dest_reg.match(dc)}
+        if containers:
+            candidates = self.env['runbot.build'].search([('id', 'in', list(containers.keys())), ('local_state', '=', 'done')])
+            for c in candidates:
+                _logger.info('container %s found running with build state done', containers[c.id])
+                docker_stop(containers[c.id])
+        ignored = {dc for dc in docker_ps_result if not dest_reg.match(dc)}
+        if ignored:
+            _logger.debug('docker (%s) not deleted because not dest format', " ".join(list(ignored)))


-    class HookTime(models.Model):
-        _name = "runbot.repo.hooktime"
-        _log_access = False
+class RefTime(models.Model):
+    """Model `runbot.repo.reftime`: a float `time` attached to a repository.
+
+    NOTE(review): presumably records when the refs of a repo were last fetched -- confirm against the writers.
+    """
+    _name = "runbot.repo.reftime"
+    _log_access = False
+
+    time = fields.Float('Time', index=True, required=True)
+    repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
+
+
+
+
+class HookTime(models.Model):
+    """Model `runbot.repo.hooktime`: a float `time` attached to a repository.
+
+    NOTE(review): presumably records when a hook was last received for a repo -- confirm against the writers.
+    """
+    _name = "runbot.repo.hooktime"
+    _log_access = False
+
+    time = fields.Float('Time')
+    repo_id = fields.Many2one('runbot.repo', 'Repository', required=True, ondelete='cascade')
--- a/runbot/tests/common.py
+++ b/runbot/tests/common.py
@ -36,6 +36,7 @@ class RunbotCase(TransactionCase):
        self.start_patcher('isdir', 'odoo.addons.runbot.common.os.path.isdir', True)
        self.start_patcher('isfile', 'odoo.addons.runbot.common.os.path.isfile', True)
        self.start_patcher('docker_run', 'odoo.addons.runbot.models.build_config.docker_run')
+        self.start_patcher('docker_build', 'odoo.addons.runbot.models.build.docker_build')
        self.start_patcher('docker_ps', 'odoo.addons.runbot.models.repo.docker_ps', [])
        self.start_patcher('docker_stop', 'odoo.addons.runbot.models.repo.docker_stop')

--- a/runbot/tests/test_cron.py
+++ b/runbot/tests/test_cron.py
@ -56,7 +56,6 @@ class Test_Cron(RunbotCase):
        ret = self.Repo._cron_fetch_and_build(hostname)
        self.assertEqual(None, ret)
        mock_scheduler.assert_called()
-        self.assertTrue(mock_reload.called)
        host = self.env['runbot.host'].search([('name', '=', hostname)])
        self.assertEqual(host.name, hostname, 'A new host should have been created')
        self.assertGreater(host.psql_conn_count, 0, 'A least one connection should exist on the current psql instance')
--- a/runbot/tests/test_repo.py
+++ b/runbot/tests/test_repo.py
@ -257,10 +257,10 @@ class Test_Repo_Scheduler(RunbotCase):
            'name': 'refs/head/foo'
        })

-    @patch('odoo.addons.runbot.models.build.runbot_build._reap')
    @patch('odoo.addons.runbot.models.build.runbot_build._kill')
    @patch('odoo.addons.runbot.models.build.runbot_build._schedule')
-    def test_repo_scheduler(self, mock_schedule, mock_kill, mock_reap):
+    @patch('odoo.addons.runbot.models.build.runbot_build._init_pendings')
+    def test_repo_scheduler(self, mock_init_pendings, mock_schedule, mock_kill):
        self.env['ir.config_parameter'].set_param('runbot.runbot_workers', 6)
        builds = []
        # create 6 builds that are testing on the host to verify that
@ -293,8 +293,8 @@ class Test_Repo_Scheduler(RunbotCase):
            'local_state': 'pending',
        })
        builds.append(build)
-
-        self.foo_repo._scheduler()
+        host = self.env['runbot.host']._get_current()
+        self.foo_repo._scheduler(host)

        build.invalidate_cache()
        scheduled_build.invalidate_cache()
@ -304,7 +304,7 @@ class Test_Repo_Scheduler(RunbotCase):
        # give some room for the pending build
        self.Build.search([('name', '=', 'a')]).write({'local_state': 'done'})

-        self.foo_repo._scheduler()
+        self.foo_repo._scheduler(host)
        build.invalidate_cache()
        scheduled_build.invalidate_cache()
        self.assertEqual(build.host, 'host.runbot.com')
--- a/runbot/views/build_views.xml
+++ b/runbot/views/build_views.xml
@ -22,7 +22,6 @@
                        <field name="local_result"/>
                        <field name="global_result"/>
                        <field name="triggered_result" groups="base.group_no_one"/>
-                        <field name="pid"/>
                        <field name="host"/>
                        <field name="job_start" groups="base.group_no_one"/>
                        <field name="job_end" groups="base.group_no_one"/>
@ -58,7 +57,6 @@
                <field name="port"/>
                <field name="job"/>
                <field name="coverage_result"/>
-                <field name="pid"/>
                <field name="host"/>
                <field name="build_time"/>
                <field name="build_age"/>
--- a/runbot_builder/builder.py
+++ b/runbot_builder/builder.py
@ -18,10 +18,8 @@ _logger = logging.getLogger(__name__)

 class RunbotClient():

-    def __init__(self, env, args):
+    def __init__(self, env):
        self.env = env
-        self.args = args
-        self.fqdn = socket.getfqdn()
        self.ask_interrupt = threading.Event()

    def main_loop(self):
@ -31,15 +29,17 @@ class RunbotClient():
        host = self.env['runbot.host']._get_current()
        count = 0
        while True:
+            host.last_start_loop = fields.Datetime.now()
            count = count % 60
            if count == 0:
                logging.info('Host %s running with %s slots on pid %s%s', host.name, host.get_nb_worker(), os.getpid(), ' (assigned only)' if host.assigned_only else '')
                self.env['runbot.repo']._source_cleanup()
                self.env['runbot.build']._local_cleanup()
-                host.last_end_loop = host.last_start_loop = fields.Datetime.now()
+                self.env['runbot.repo']._docker_cleanup()
                host.set_psql_conn_count()
            count += 1
            sleep_time = self.env['runbot.repo']._scheduler_loop_turn(host)
+            host.last_end_loop = fields.Datetime.now()
            self.env.cr.commit()
            self.env.reset()
            self.sleep(sleep_time)
@ -97,7 +97,7 @@ def run():
    with odoo.api.Environment.manage():
        with registry.cursor() as cr:
            env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {})
-            runbot_client = RunbotClient(env, args)
+            runbot_client = RunbotClient(env)
            # run main loop
            runbot_client.main_loop()