[REF] runbot: split cron to speed up builds

The runbot cron is executed on each runbot instance. As the number of
instances grows, the time an instance has to wait to acquire the cron
increases.

With this commit, the original runbot_cron is removed. Instead, a cron
has to be created to run the _cron_fetch_and_schedule method, which
fetches the repos and creates the pending builds. This cron is intended
to run on only one runbot instance; the method takes a host parameter
that specifies which runbot instance is in charge of this task.
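
As an illustration only, such a cron could mirror the structure of the
removed data/runbot_cron.xml; the record id, cron name, interval and
hostname below are placeholders, not something shipped by this commit:

<record model="ir.cron" id="runbot_cron_fetch_and_schedule">
    <field name="name">Runbot: fetch and schedule</field>
    <field name="model_id" ref="model_runbot_repo"/>
    <field name="state">code</field>
    <!-- hypothetical hostname: the fqdn of the instance in charge of orchestration -->
    <field name="code">model._cron_fetch_and_schedule('runbot1.example.com')</field>
    <field name="interval_number">10</field>
    <field name="interval_type">minutes</field>
    <field name="numbercall">-1</field>
    <field name="doall" eval="False"/>
</record>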

On the other hand, a dedicated cron has to be manually created for each
runbot instance that will take on the build task.
Those crons only have to call the _cron_fetch_and_build method with the
runbot hostname as a parameter. The method then self-assigns pending
builds whenever slots are available; all available build slots are
reserved in a single SQL query using FOR UPDATE ... SKIP LOCKED.
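
Likewise, each build instance could get its own record along these
lines (a sketch only; the id, name and hostname are hypothetical, with
one record per build host):

<record model="ir.cron" id="runbot_cron_fetch_and_build_runbot2">
    <field name="name">Runbot: fetch and build (runbot2)</field>
    <field name="model_id" ref="model_runbot_repo"/>
    <field name="state">code</field>
    <!-- hypothetical hostname: the fqdn of this build instance -->
    <field name="code">model._cron_fetch_and_build('runbot2.example.com')</field>
    <field name="interval_number">10</field>
    <field name="interval_type">minutes</field>
    <field name="numbercall">-1</field>
    <field name="doall" eval="False"/>
</record>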

Both methods are intended to run for a long time, just a few minutes
below the cron timeout, to maximize cron productivity.
The period is randomized to avoid deadlocks when the runbot instances
are started at the same time.

The --limit-time-real parameter therefore has to be set to a minimum of
180 seconds (600 or 1200 are probably better targets): with the default
120 s margin plus up to 60 s of random jitter in _get_cron_period, a
180 s limit leaves the loop almost no time to work.
Christophe Monniez 2019-02-06 14:37:36 +01:00
parent ffd27739a4
commit fe018aeefa
8 changed files with 193 additions and 72 deletions


@@ -6,7 +6,7 @@
'author': "Odoo SA",
'website': "http://runbot.odoo.com",
'category': 'Website',
'version': '2.6',
'version': '3.0',
'depends': ['website', 'base'],
'data': [
'security/runbot_security.xml',
@@ -23,6 +23,5 @@
'templates/nginx.xml',
'templates/badge.xml',
'templates/branch.xml',
'data/runbot_cron.xml'
],
}


@@ -1,13 +0,0 @@
<?xml version="1.0" encoding='UTF-8'?>
<odoo>
<record model="ir.cron" id="runbot_repo_cron">
<field name="name">Runbot Cron</field>
<field name="model_id" ref="model_runbot_repo"/>
<field name="state">code</field>
<field name="code">model._cron()</field>
<field name="interval_number">1</field>
<field name="interval_type">minutes</field>
<field name="numbercall">-1</field>
<field name="doall" eval="False"/>
</record>
</odoo>


@@ -103,7 +103,7 @@ class runbot_build(models.Model):
if branch.job_type == 'none' or vals.get('job_type', '') == 'none':
return self.env['runbot.build']
build_id = super(runbot_build, self).create(vals)
extra_info = {'sequence': build_id.id}
extra_info = {'sequence': build_id.id if not build_id.sequence else build_id.sequence}
job_type = vals['job_type'] if 'job_type' in vals else build_id.branch_id.job_type
extra_info.update({'job_type': job_type})
context = self.env.context
@@ -462,8 +462,6 @@
build._log(build.job, "failed running job method, see runbot log")
build._kill(result='ko')
continue
# needed to prevent losing pids if multiple jobs are started and one of them raises an exception
self.env.cr.commit()
if pid == -2:
# no process to wait, directly call next job


@@ -4,6 +4,7 @@ import dateutil
import json
import logging
import os
import random
import re
import requests
import signal
@@ -115,36 +116,14 @@ class runbot_repo(models.Model):
else:
raise
def _update_git(self):
""" Update the git repo on FS """
def _find_new_commits(self, repo):
""" Find new commits in bare repo """
self.ensure_one()
repo = self
_logger.debug('repo %s updating branches', repo.name)
Branch = self.env['runbot.branch']
Build = self.env['runbot.build']
icp = self.env['ir.config_parameter']
max_age = int(icp.get_param('runbot.runbot_max_age', default=30))
Build = self.env['runbot.build']
Branch = self.env['runbot.branch']
if not os.path.isdir(os.path.join(repo.path)):
os.makedirs(repo.path)
if not os.path.isdir(os.path.join(repo.path, 'refs')):
_logger.info("Cloning repository '%s' in '%s'" % (repo.name, repo.path))
subprocess.call(['git', 'clone', '--bare', repo.name, repo.path])
# check for mode == hook
fname_fetch_head = os.path.join(repo.path, 'FETCH_HEAD')
if os.path.isfile(fname_fetch_head):
fetch_time = os.path.getmtime(fname_fetch_head)
if repo.mode == 'hook' and repo.hook_time and dt2time(repo.hook_time) < fetch_time:
t0 = time.time()
_logger.debug('repo %s skip hook fetch fetch_time: %ss ago hook_time: %ss ago',
repo.name, int(t0 - fetch_time), int(t0 - dt2time(repo.hook_time)))
return
repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
fields = ['refname', 'objectname', 'committerdate:iso8601', 'authorname', 'authoremail', 'subject', 'committername', 'committeremail']
fmt = "%00".join(["%(" + field + ")" for field in fields])
git_refs = repo._git(['for-each-ref', '--format', fmt, '--sort=-committerdate', 'refs/heads', 'refs/pull'])
@@ -162,14 +141,6 @@ class runbot_repo(models.Model):
for name, sha, date, author, author_email, subject, committer, committer_email in refs:
# create or get branch
# branch = repo.branch_ids.search([('name', '=', name), ('repo_id', '=', repo.id)])
# if not branch:
# _logger.debug('repo %s found new branch %s', repo.name, name)
# branch = self.branch_ids.create({
# 'repo_id': repo.id,
# 'name': name})
# keep for next version with a branch_ids field
if ref_branches.get(name):
branch_id = ref_branches[name]
else:
@@ -232,6 +203,44 @@ class runbot_repo(models.Model):
builds_to_be_skipped = Build.search(skippable_domain, order='sequence desc', offset=running_max)
builds_to_be_skipped._skip()
def _create_pending_builds(self, repos):
""" Find new commits in physical repos"""
for repo in repos:
try:
repo._find_new_commits(repo)
except Exception:
_logger.exception('Fail to find new commits in repo %s', repo.name)
def _clone(self):
""" Clone the remote repo if needed """
self.ensure_one()
repo = self
if not os.path.isdir(os.path.join(repo.path, 'refs')):
_logger.info("Cloning repository '%s' in '%s'" % (repo.name, repo.path))
subprocess.call(['git', 'clone', '--bare', repo.name, repo.path])
def _update_git(self):
""" Update the git repo on FS """
self.ensure_one()
repo = self
_logger.debug('repo %s updating branches', repo.name)
if not os.path.isdir(os.path.join(repo.path)):
os.makedirs(repo.path)
self._clone()
# check for mode == hook
fname_fetch_head = os.path.join(repo.path, 'FETCH_HEAD')
if os.path.isfile(fname_fetch_head):
fetch_time = os.path.getmtime(fname_fetch_head)
if repo.mode == 'hook' and repo.hook_time and dt2time(repo.hook_time) < fetch_time:
t0 = time.time()
_logger.debug('repo %s skip hook fetch fetch_time: %ss ago hook_time: %ss ago',
repo.name, int(t0 - fetch_time), int(t0 - dt2time(repo.hook_time)))
return
repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
def _update(self, repos):
""" Update the physical git reposotories on FS"""
for repo in repos:
@@ -242,6 +251,8 @@ class runbot_repo(models.Model):
def _scheduler(self, ids=None):
"""Schedule builds for the repository"""
if not ids:
return
icp = self.env['ir.config_parameter']
workers = int(icp.get_param('runbot.runbot_workers', default=6))
running_max = int(icp.get_param('runbot.runbot_running_max', default=75))
@@ -256,22 +267,40 @@ class runbot_repo(models.Model):
build_ids._schedule()
# launch new tests
testing = Build.search_count(domain_host + [('state', '=', 'testing')])
pending = Build.search_count(domain + [('state', '=', 'pending')])
nb_testing = Build.search_count(domain_host + [('state', '=', 'testing')])
available_slots = workers - nb_testing
if available_slots > 0:
# commit transaction to reduce the critical section duration
self.env.cr.commit()
# self-assign to be sure that another runbot instance cannot self assign the same builds
query = """UPDATE
runbot_build
SET
host = %(host)s
WHERE
runbot_build.id IN (
SELECT
runbot_build.id
FROM
runbot_build
LEFT JOIN runbot_branch ON runbot_branch.id = runbot_build.branch_id
WHERE
runbot_build.repo_id IN %(repo_ids)s
AND runbot_build.state = 'pending'
AND runbot_branch.job_type != 'none'
AND runbot_build.host IS NULL
ORDER BY
runbot_branch.sticky DESC,
runbot_branch.priority DESC,
runbot_build.sequence ASC
FOR UPDATE OF runbot_build SKIP LOCKED
LIMIT %(available_slots)s)"""
while testing < workers and pending > 0:
# find sticky / priority pending build if any, otherwise, last pending (by id, not by sequence) will do the job
pending_ids = Build.search(domain + [('state', '=', 'pending'), '|', ('branch_id.sticky', '=', True), ('branch_id.priority', '=', True)], limit=1)
if not pending_ids:
pending_ids = Build.search(domain + [('state', '=', 'pending')], order="sequence", limit=1)
pending_ids._schedule()
# compute the number of testing and pending jobs again
testing = Build.search_count(domain_host + [('state', '=', 'testing')])
pending = Build.search_count(domain + [('state', '=', 'pending')])
self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': fqdn(), 'available_slots': available_slots})
pending_build = Build.search(domain + domain_host + [('state', '=', 'pending')])
if pending_build:
pending_build._schedule()
self.env.cr.commit()
# terminate and reap doomed build
build_ids = Build.search(domain_host + [('state', '=', 'running')]).ids
@@ -321,8 +350,46 @@ class runbot_repo(models.Model):
else:
_logger.debug('failed to start nginx - failed to kill orphan worker - oh well')
def _cron(self):
repos = self.search([('mode', '!=', 'disabled')])
self._update(repos)
self._scheduler(repos.ids)
self._reload_nginx()
def _get_cron_period(self, min_margin=120):
""" Compute a randomized cron period with a 2 min margin below
real cron timeout from config.
"""
cron_limit = config.get('limit_time_real_cron')
req_limit = config.get('limit_time_real')
cron_timeout = cron_limit if cron_limit > -1 else req_limit
return cron_timeout - (min_margin + random.randint(1, 60))
def _cron_fetch_and_schedule(self, hostname):
"""This method have to be called from a dedicated cron on a runbot
in charge of orchestration.
"""
if hostname != fqdn():
return 'Not for me'
start_time = time.time()
timeout = self._get_cron_period()
icp = self.env['ir.config_parameter']
update_frequency = int(icp.get_param('runbot.runbot_update_frequency', default=10))
while time.time() - start_time < timeout:
repos = self.search([('mode', '!=', 'disabled')])
self._update(repos)
self._create_pending_builds(repos)
self.env.cr.commit()
time.sleep(update_frequency)
def _cron_fetch_and_build(self, hostname):
""" This method have to be called from a dedicated cron
created on each runbot instance.
"""
if hostname != fqdn():
return 'Not for me'
start_time = time.time()
timeout = self._get_cron_period()
icp = self.env['ir.config_parameter']
update_frequency = int(icp.get_param('runbot.runbot_update_frequency', default=10))
while time.time() - start_time < timeout:
repos = self.search([('mode', '!=', 'disabled')])
self._update(repos)
self._scheduler(repos.ids)
self.env.cr.commit()
self._reload_nginx()
time.sleep(update_frequency)


@@ -14,6 +14,7 @@ class ResConfigSettings(models.TransientModel):
runbot_domain = fields.Char('Runbot domain')
runbot_max_age = fields.Integer('Max branch age (in days)')
runbot_logdb_uri = fields.Char('Runbot URI for build logs')
runbot_update_frequency = fields.Integer('Update frequency (in seconds)')
@api.model
def get_values(self):
@@ -26,6 +27,7 @@
runbot_domain=get_param('runbot.runbot_domain', default=common.fqdn()),
runbot_max_age=int(get_param('runbot.runbot_max_age', default=30)),
runbot_logdb_uri=get_param('runbot.runbot_logdb_uri', default=False),
runbot_update_frequency=get_param('runbot.runbot_update_frequency', default=10),
)
return res
@@ -40,3 +42,4 @@ class ResConfigSettings(models.TransientModel):
set_param("runbot.runbot_domain", self.runbot_domain)
set_param("runbot.runbot_max_age", self.runbot_max_age)
set_param("runbot.runbot_logdb_uri", self.runbot_logdb_uri)
set_param('runbot.runbot_update_frequency', self.runbot_update_frequency)


@@ -5,3 +5,4 @@ from . import test_jobs
from . import test_frontend
from . import test_job_types
from . import test_schedule
from . import test_cron

runbot/tests/test_cron.py (new file)

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
from unittest.mock import patch
from odoo.tests import common
class Test_Cron(common.TransactionCase):
def setUp(self):
super(Test_Cron, self).setUp()
self.Repo = self.env['runbot.repo']
@patch('odoo.addons.runbot.models.repo.config.get')
def test_cron_period(self, mock_config_get):
""" Test that the random cron period stays below margin
Assuming a configuration of 10 minutes cron limit
"""
mock_config_get.return_value = 600
period = self.Repo._get_cron_period(min_margin=200)
for i in range(200):
self.assertLess(period, 400)
@patch('odoo.addons.runbot.models.repo.fqdn')
def test_crons_returns(self, mock_fqdn):
""" test that cron_fetch_and_schedule and _cron_fetch_and_build
return directly when called on wrong host
"""
mock_fqdn.return_value = 'runboty.foo.com'
ret = self.Repo._cron_fetch_and_schedule('runbotx.foo.com')
self.assertEqual(ret, 'Not for me')
ret = self.Repo._cron_fetch_and_build('runbotx.foo.com')
self.assertEqual(ret, 'Not for me')
@patch('odoo.addons.runbot.models.repo.runbot_repo._get_cron_period')
@patch('odoo.addons.runbot.models.repo.runbot_repo._create_pending_builds')
@patch('odoo.addons.runbot.models.repo.runbot_repo._update')
@patch('odoo.addons.runbot.models.repo.fqdn')
def test_cron_schedule(self, mock_fqdn, mock_update, mock_create, mock_cron_period):
""" test that cron_fetch_and_schedule do its work """
mock_fqdn.return_value = 'runbotx.foo.com'
mock_cron_period.return_value = 2
self.env['ir.config_parameter'].sudo().set_param('runbot.runbot_update_frequency', 1)
ret = self.Repo._cron_fetch_and_schedule('runbotx.foo.com')
self.assertEqual(None, ret)
mock_update.assert_called_with(self.Repo)
mock_create.assert_called_with(self.Repo)
@patch('odoo.addons.runbot.models.repo.runbot_repo._get_cron_period')
@patch('odoo.addons.runbot.models.repo.runbot_repo._reload_nginx')
@patch('odoo.addons.runbot.models.repo.runbot_repo._scheduler')
@patch('odoo.addons.runbot.models.repo.runbot_repo._update')
@patch('odoo.addons.runbot.models.repo.fqdn')
def test_cron_build(self, mock_fqdn, mock_update, mock_scheduler, mock_reload, mock_cron_period):
""" test that cron_fetch_and_build do its work """
mock_fqdn.return_value = 'runbotx.foo.com'
mock_cron_period.return_value = 2
self.env['ir.config_parameter'].sudo().set_param('runbot.runbot_update_frequency', 1)
ret = self.Repo._cron_fetch_and_build('runbotx.foo.com')
self.assertEqual(None, ret)
mock_update.assert_called_with(self.Repo)
mock_scheduler.assert_called_with([])
self.assertTrue(mock_reload.called)


@@ -46,6 +46,10 @@
<label for="runbot_logdb_uri" class="col-xs-3 o_light_label" style="width: 60%;"/>
<field name="runbot_logdb_uri" style="width: 30%;"/>
</div>
<div class="mt-16 row">
<label for="runbot_update_frequency" class="col-xs-3 o_light_label" style="width: 60%;"/>
<field name="runbot_update_frequency" style="width: 30%;"/>
</div>
</div>
</div>
</div>