From 0b30b9c1040bf8725c58839f305e20befaa0005c Mon Sep 17 00:00:00 2001
From: Xavier-Do
Date: Thu, 25 Nov 2021 13:22:19 +0100
Subject: [PATCH] [IMP] runbot: create a separate process for cron

As for the builder, this gives the ability to run the discovery of new
commits and all related logic in a separate process. This will mainly be
useful to restart the frontend without waiting for the cron, or to restart
the "leader" without stopping the frontend.

This will also be useful for optimisation purposes.
---
 runbot/models/batch.py    |   7 ++-
 runbot/models/build.py    |   5 ++
 runbot/models/host.py     |   1 +
 runbot/models/runbot.py   | 109 +++++++++++++++++++--------------
 runbot_builder/builder.py | 125 ++++----------------------------------
 runbot_builder/leader.py  |  18 ++++++
 runbot_builder/tester.py  |  16 +++++
 runbot_builder/tools.py   | 124 +++++++++++++++++++++++++++++++++++++
 8 files changed, 247 insertions(+), 158 deletions(-)
 create mode 100755 runbot_builder/leader.py
 create mode 100755 runbot_builder/tester.py
 create mode 100644 runbot_builder/tools.py

diff --git a/runbot/models/batch.py b/runbot/models/batch.py
index 0b1de10b..5a5a0d3d 100644
--- a/runbot/models/batch.py
+++ b/runbot/models/batch.py
@@ -97,12 +97,17 @@ class Batch(models.Model):
                 batch._log('Cannot kill or skip build %s, build is used in another bundle: %s', build.id, bundles.mapped('name'))
 
     def _process(self):
+        processed = self.browse()
         for batch in self:
             if batch.state == 'preparing' and batch.last_update < fields.Datetime.now() - datetime.timedelta(seconds=60):
                 batch._prepare()
+                processed |= batch
             elif batch.state == 'ready' and all(slot.build_id.global_state in (False, 'running', 'done') for slot in batch.slot_ids):
+                _logger.info('Batch %s is done', self.id)
                 batch._log('Batch done')
                 batch.state = 'done'
+                processed |= batch
+        return processed
 
     def _create_build(self, params):
         """
@@ -130,12 +135,12 @@ class Batch(models.Model):
         return link_type, build
 
     def _prepare(self, auto_rebase=False):
+        _logger.info('Preparing 
batch %s', self.id) for level, message in self.bundle_id.consistency_warning(): if level == "warning": self.warning("Bundle warning: %s" % message) self.state = 'ready' - _logger.info('Preparing batch %s', self.id) bundle = self.bundle_id project = bundle.project_id diff --git a/runbot/models/build.py b/runbot/models/build.py index 8486b744..f929c172 100644 --- a/runbot/models/build.py +++ b/runbot/models/build.py @@ -756,6 +756,11 @@ class BuildResult(models.Model): self._log('_checkout', 'Multiple repo have same export path in build, some source may be missing for %s' % build_export_path, level='ERROR') self._kill(result='ko') exports[build_export_path] = commit.export() + + checkout_time = time.time() - start + if checkout_time > 60: + self._log('checkout', 'Checkout took %s seconds' % int(checkout_time)) + return exports def _get_available_modules(self): diff --git a/runbot/models/host.py b/runbot/models/host.py index bce208ec..ae251dd1 100644 --- a/runbot/models/host.py +++ b/runbot/models/host.py @@ -72,6 +72,7 @@ class Host(models.Model): def _docker_build(self): """ build docker images needed by locally pending builds""" + _logger.info('Building docker image...') self.ensure_one() static_path = self._get_work_path() self.clear_caches() # needed to ensure that content is updated on all hosts diff --git a/runbot/models/runbot.py b/runbot/models/runbot.py index 0f92822b..1dcdc5e3 100644 --- a/runbot/models/runbot.py +++ b/runbot/models/runbot.py @@ -7,6 +7,7 @@ import signal import subprocess import shutil +from contextlib import contextmanager from requests.exceptions import HTTPError from ..common import fqdn, dest_reg, os @@ -203,7 +204,7 @@ class Runbot(models.AbstractModel): This method is the default cron for new commit discovery and build sheduling. 
The cron runs for a long time to avoid spamming logs """ - pull_info_failures = set() + pull_info_failures = {} start_time = time.time() timeout = self._get_cron_period() get_param = self.env['ir.config_parameter'].get_param @@ -222,55 +223,74 @@ class Runbot(models.AbstractModel): self.env['runbot.build']._local_cleanup() self._docker_cleanup() _logger.info('Starting loop') - while time.time() - start_time < timeout: - repos = self.env['runbot.repo'].search([('mode', '!=', 'disabled')]) + if runbot_do_schedule or runbot_do_fetch: + while time.time() - start_time < timeout: + if runbot_do_fetch: + self._fetch_loop_turn(host, pull_info_failures) + if runbot_do_schedule: + sleep_time = self._scheduler_loop_turn(host, update_frequency) + self.sleep(sleep_time) + else: + self.sleep(update_frequency) + self._commit() - processing_batch = self.env['runbot.batch'].search([('state', 'in', ('preparing', 'ready'))], order='id asc') - preparing_batch = processing_batch.filtered(lambda b: b.state == 'preparing') - self._commit() - if runbot_do_fetch: - for repo in repos: - try: - repo._update_batches(bool(preparing_batch), ignore=pull_info_failures) - self._commit() - except HTTPError as e: - # Sometimes a pr pull info can fail. - # - Most of the time it is only temporary and it will be successfull on next try. - # - In some rare case the pr will always fail (github inconsistency) The pr exists in git (for-each-ref) but not on github api. - # For this rare case, we store the pr in memory in order to unstuck other pr/branches update. - # We consider that this error should not remain, in this case github needs to fix this inconsistency. - # Another solution would be to create the pr with fake pull info. This idea is not the best one - # since we want to avoid to have many pr with fake pull_info in case of temporary failure of github services. - # With this solution, the pr will be retried once every cron loop (~10 minutes). 
- # We dont except to have pr with this kind of persistent failure more than every few mounths/years. - self.env.cr.rollback() - self.env.clear() - pull_number = e.response.url.split('/')[-1] - pull_info_failures.add(pull_number) - self.warning('Pr pull info failed for %s', pull_number) - self._commit() - if processing_batch: - _logger.info('starting processing of %s batches', len(processing_batch)) - for batch in processing_batch: - batch._process() - self._commit() - _logger.info('end processing') - self._commit() - if runbot_do_schedule: - sleep_time = self._scheduler_loop_turn(host, update_frequency) - self.sleep(sleep_time) - else: - self.sleep(update_frequency) - self._commit() - - host.last_end_loop = fields.Datetime.now() + host.last_end_loop = fields.Datetime.now() def sleep(self, t): time.sleep(t) + def _fetch_loop_turn(self, host, pull_info_failures, default_sleep=1): + with self.manage_host_exception(host) as manager: + repos = self.env['runbot.repo'].search([('mode', '!=', 'disabled')]) + processing_batch = self.env['runbot.batch'].search([('state', 'in', ('preparing', 'ready'))], order='id asc') + preparing_batch = processing_batch.filtered(lambda b: b.state == 'preparing') + self._commit() + for repo in repos: + try: + repo._update_batches(force=bool(preparing_batch), ignore=pull_info_failures) + self._commit() # commit is mainly here to avoid to lose progression in case of fetch failure or concurrent update + except HTTPError as e: + # Sometimes a pr pull info can fail. + # - Most of the time it is only temporary and it will be successfull on next try. + # - In some rare case the pr will always fail (github inconsistency) The pr exists in git (for-each-ref) but not on github api. + # For this rare case, we store the pr in memory in order to unstuck other pr/branches update. + # We consider that this error should not remain, in this case github needs to fix this inconsistency. + # Another solution would be to create the pr with fake pull info. 
This idea is not the best one
+                    # since we want to avoid to have many pr with fake pull_info in case of temporary failure of github services.
+                    # With this solution, the pr will be retried once every cron loop (~10 minutes).
+                    # We don't expect to have prs with this kind of persistent failure more than every few months/years.
+                    self.env.cr.rollback()
+                    self.env.clear()
+                    pull_number = e.response.url.split('/')[-1]
+                    pull_info_failures[pull_number] = time.time()
+                    self.warning('Pr pull info failed for %s', pull_number)
+                    self._commit()
+
+            if processing_batch:
+                for batch in processing_batch:
+                    if batch._process():
+                        self._commit()
+                self._commit()
+
+            # cleanup old pull_info_failures
+            for pr_number, t in list(pull_info_failures.items()):
+                if t + 15*60 < time.time():
+                    _logger.warning('Removing %s from pull_info_failures', pr_number)
+                    del pull_info_failures[pr_number]
+
+        return manager.get('sleep', default_sleep)
+
     def _scheduler_loop_turn(self, host, default_sleep=1):
-        try:
+        _logger.info('Scheduling...')
+        with self.manage_host_exception(host) as manager:
             self._scheduler(host)
+        return manager.get('sleep', default_sleep)
+
+    @contextmanager
+    def manage_host_exception(self, host):
+        res = {}
+        try:
+            yield res
             host.last_success = fields.Datetime.now()
             self._commit()
         except Exception as e:
@@ -284,12 +304,11 @@ class Runbot(models.AbstractModel):
                 host.last_exception = str(e)
                 host.exception_count = 1
             self._commit()
-            return random.uniform(0, 3)
+            res['sleep'] = random.uniform(0, 3)
         else:
             if host.last_exception:
                 host.last_exception = ""
                 host.exception_count = 0
-            return default_sleep
 
     def _source_cleanup(self):
         try:
diff --git a/runbot_builder/builder.py b/runbot_builder/builder.py
index 8dd13f7b..6ea8a5ff 100755
--- a/runbot_builder/builder.py
+++ b/runbot_builder/builder.py
@@ -1,122 +1,23 @@
 #!/usr/bin/python3
-import argparse
+from tools import RunbotClient, run
 import logging
-import os
-import sys
-import threading
-import signal
-
-from logging.handlers 
import WatchedFileHandler - -LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' -logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) -logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG) -logging.addLevelName(25, "!NFO") _logger = logging.getLogger(__name__) +class BuilderClient(RunbotClient): -class RunbotClient(): + def on_start(self): + self.env['runbot.repo'].search([('mode', '!=', 'disabled')])._update() - def __init__(self, env): - self.env = env - self.ask_interrupt = threading.Event() - - def main_loop(self): - from odoo import fields - signal.signal(signal.SIGINT, self.signal_handler) - signal.signal(signal.SIGTERM, self.signal_handler) - signal.signal(signal.SIGQUIT, self.dump_stack) - host = self.env['runbot.host']._get_current() - host._bootstrap() - count = 0 - while True: - try: - host.last_start_loop = fields.Datetime.now() - count = count % 60 - if count == 0: - logging.info('Host %s running with %s slots on pid %s%s', host.name, host.nb_worker, os.getpid(), ' (assigned only)' if host.assigned_only else '') - self.env['runbot.runbot']._source_cleanup() - self.env['runbot.build']._local_cleanup() - self.env['runbot.runbot']._docker_cleanup() - host.set_psql_conn_count() - _logger.info('Building docker image...') - host._docker_build() - _logger.info('Scheduling...') - count += 1 - sleep_time = self.env['runbot.runbot']._scheduler_loop_turn(host) - host.last_end_loop = fields.Datetime.now() - self.env.cr.commit() - self.env.clear() - self.sleep(sleep_time) - except Exception as e: - _logger.exception('Builder main loop failed with: %s', e) - self.env.cr.rollback() - self.env.clear() - self.sleep(10) - - if self.ask_interrupt.is_set(): - return - - def signal_handler(self, signal, frame): - if self.ask_interrupt.is_set(): - _logger.info("Second Interrupt detected, force exit") - os._exit(1) - - _logger.info("Interrupt detected") - self.ask_interrupt.set() - - def dump_stack(self, signal, frame): - import odoo - 
odoo.tools.misc.dumpstacks() - - def sleep(self, t): - self.ask_interrupt.wait(t) - - -def run(): - # parse args - parser = argparse.ArgumentParser() - parser.add_argument('--odoo-path', help='Odoo sources path') - parser.add_argument('--db_host', default='127.0.0.1') - parser.add_argument('--db_port', default='5432') - parser.add_argument('--db_user') - parser.add_argument('--db_password') - parser.add_argument('-d', '--database', default='runbot', help='name of runbot db') - parser.add_argument('--logfile', default=False) - args = parser.parse_args() - if args.logfile: - dirname = os.path.dirname(args.logfile) - if dirname and not os.path.isdir(dirname): - os.makedirs(dirname) - - handler = WatchedFileHandler(args.logfile) - formatter = logging.Formatter(LOG_FORMAT) - handler.setFormatter(formatter) - logging.getLogger().addHandler(handler) - - # configure odoo - sys.path.append(args.odoo_path) - import odoo - _logger.info("Starting scheduler on database %s", args.database) - odoo.tools.config['db_host'] = args.db_host - odoo.tools.config['db_port'] = args.db_port - odoo.tools.config['db_user'] = args.db_user - odoo.tools.config['db_password'] = args.db_password - addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) - config_addons_path = odoo.tools.config['addons_path'] - odoo.tools.config['addons_path'] = ','.join([config_addons_path, addon_path]) - - # create environment - registry = odoo.registry(args.database) - with odoo.api.Environment.manage(): - with registry.cursor() as cr: - env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {}) - runbot_client = RunbotClient(env) - # run main loop - runbot_client.main_loop() + def loop_turn(self): + if self.count == 1: # cleanup at second iteration + self.env['runbot.runbot']._source_cleanup() + self.env['runbot.build']._local_cleanup() + self.env['runbot.runbot']._docker_cleanup() + self.host.set_psql_conn_count() + self.host._docker_build() + return 
self.env['runbot.runbot']._scheduler_loop_turn(self.host) if __name__ == '__main__': - run() - _logger.info("Stopping gracefully") + run(BuilderClient) diff --git a/runbot_builder/leader.py b/runbot_builder/leader.py new file mode 100755 index 00000000..6a21ced9 --- /dev/null +++ b/runbot_builder/leader.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 +from tools import RunbotClient, run +import logging +import time + +_logger = logging.getLogger(__name__) + +class LeaderClient(RunbotClient): # Conductor, Director, Main, Maestro, Lead + def __init__(self, env): + self.pull_info_failures = {} + super().__init__(env) + + def loop_turn(self): + return self.env['runbot.runbot']._fetch_loop_turn(self.host, self.pull_info_failures) + + +if __name__ == '__main__': + run(LeaderClient) diff --git a/runbot_builder/tester.py b/runbot_builder/tester.py new file mode 100755 index 00000000..3d0629d3 --- /dev/null +++ b/runbot_builder/tester.py @@ -0,0 +1,16 @@ +#!/usr/bin/python3 +from tools import RunbotClient, run +import logging + +_logger = logging.getLogger(__name__) + +class TesterClient(RunbotClient): + + def loop_turn(self): + _logger.info('='*50) + _logger.info('Testing: %s', self.env['runbot.build'].search_count([('local_state', '=', 'testing')])) + _logger.info('Pending: %s', self.env['runbot.build'].search_count([('local_state', '=', 'pending')])) + return 10 + +if __name__ == '__main__': + run(TesterClient) diff --git a/runbot_builder/tools.py b/runbot_builder/tools.py new file mode 100644 index 00000000..fc0a5ae6 --- /dev/null +++ b/runbot_builder/tools.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +import argparse +import logging +import os +import sys +import threading +import signal + +from logging.handlers import WatchedFileHandler + +LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) +logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG) +logging.addLevelName(25, "!NFO") + +_logger = 
logging.getLogger(__name__) + + +class RunbotClient(): + + def __init__(self, env): + self.env = env + self.ask_interrupt = threading.Event() + self.host = None + self.count = 0 + self.max_count = 60 + + def on_start(self): + pass + + def main_loop(self): + from odoo import fields + self.on_start() + signal.signal(signal.SIGINT, self.signal_handler) + signal.signal(signal.SIGTERM, self.signal_handler) + signal.signal(signal.SIGQUIT, self.dump_stack) + self.host = self.env['runbot.host']._get_current() + self.host._bootstrap() + logging.info( + 'Host %s running with %s slots on pid %s%s', + self.host.name, + self.host.nb_worker, + os.getpid(), + ' (assigned only)' if self.host.assigned_only else '' + ) + while True: + try: + self.host.last_start_loop = fields.Datetime.now() + self.count = self.count % self.max_count + sleep_time = self.loop_turn() + self.count += 1 + self.host.last_end_loop = fields.Datetime.now() + self.env.cr.commit() + self.env.clear() + self.sleep(sleep_time) + except Exception as e: + _logger.exception('Builder main loop failed with: %s', e) + self.env.cr.rollback() + self.env.clear() + self.sleep(10) + if self.ask_interrupt.is_set(): + return + + def loop_turn(self): + raise NotImplementedError() + + def signal_handler(self, _signal, _frame): + if self.ask_interrupt.is_set(): + _logger.info("Second Interrupt detected, force exit") + os._exit(1) + + _logger.info("Interrupt detected") + self.ask_interrupt.set() + + def dump_stack(self, _signal, _frame): + import odoo + odoo.tools.misc.dumpstacks() + + def sleep(self, t): + self.ask_interrupt.wait(t) + + +def run(client_class): + # parse args + parser = argparse.ArgumentParser() + parser.add_argument('--odoo-path', help='Odoo sources path') + parser.add_argument('--db_host') + parser.add_argument('--db_port') + parser.add_argument('--db_user') + parser.add_argument('--db_password') + parser.add_argument('-d', '--database', default='runbot', help='name of runbot db') + 
parser.add_argument('--logfile', default=False) + args = parser.parse_args() + if args.logfile: + dirname = os.path.dirname(args.logfile) + if dirname and not os.path.isdir(dirname): + os.makedirs(dirname) + + handler = WatchedFileHandler(args.logfile) + formatter = logging.Formatter(LOG_FORMAT) + handler.setFormatter(formatter) + logging.getLogger().addHandler(handler) + + # configure odoo + sys.path.append(args.odoo_path) + import odoo + _logger.info("Starting scheduler on database %s", args.database) + odoo.tools.config['db_host'] = args.db_host + odoo.tools.config['db_port'] = args.db_port + odoo.tools.config['db_user'] = args.db_user + odoo.tools.config['db_password'] = args.db_password + addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) + config_addons_path = odoo.tools.config['addons_path'] + odoo.tools.config['addons_path'] = ','.join([config_addons_path, addon_path]) + + # create environment + registry = odoo.registry(args.database) + with odoo.api.Environment.manage(): + with registry.cursor() as cr: + env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {}) + client = client_class(env) + # run main loop + client.main_loop() + _logger.info("Stopping gracefully")