From 0b30b9c1040bf8725c58839f305e20befaa0005c Mon Sep 17 00:00:00 2001
From: Xavier-Do
Date: Thu, 25 Nov 2021 13:22:19 +0100
Subject: [PATCH] [IMP] runbot: create a separate process for cron

As for the builder, this gives the ability to run the discovery of new
commits and all related logic in a separate process. This will mainly be
useful to restart the frontend without waiting for the cron, or to restart
the "leader" without stopping the frontend.

This will also be useful for optimisation purposes.
---
 runbot/models/batch.py    |   7 ++-
 runbot/models/build.py    |   5 ++
 runbot/models/host.py     |   1 +
 runbot/models/runbot.py   | 109 +++++++++++++++++++--------------
 runbot_builder/builder.py | 125 ++++----------------------------------
 runbot_builder/leader.py  |  18 ++++++
 runbot_builder/tester.py  |  16 +++++
 runbot_builder/tools.py   | 124 +++++++++++++++++++++++++++++++++++++
 8 files changed, 247 insertions(+), 158 deletions(-)
 create mode 100755 runbot_builder/leader.py
 create mode 100755 runbot_builder/tester.py
 create mode 100644 runbot_builder/tools.py

diff --git a/runbot/models/batch.py b/runbot/models/batch.py
index 0b1de10b..5a5a0d3d 100644
--- a/runbot/models/batch.py
+++ b/runbot/models/batch.py
@@ -97,12 +97,17 @@ class Batch(models.Model):
                 batch._log('Cannot kill or skip build %s, build is used in another bundle: %s', build.id, bundles.mapped('name'))
 
     def _process(self):
+        processed = self.browse()
         for batch in self:
             if batch.state == 'preparing' and batch.last_update < fields.Datetime.now() - datetime.timedelta(seconds=60):
                 batch._prepare()
+                processed |= batch
             elif batch.state == 'ready' and all(slot.build_id.global_state in (False, 'running', 'done') for slot in batch.slot_ids):
+                _logger.info('Batch %s is done', self.id)
                 batch._log('Batch done')
                 batch.state = 'done'
+                processed |= batch
+        return processed
 
     def _create_build(self, params):
         """
@@ -130,12 +135,12 @@ class Batch(models.Model):
         return link_type, build
 
     def _prepare(self, auto_rebase=False):
+        _logger.info('Preparing 
batch %s', self.id) for level, message in self.bundle_id.consistency_warning(): if level == "warning": self.warning("Bundle warning: %s" % message) self.state = 'ready' - _logger.info('Preparing batch %s', self.id) bundle = self.bundle_id project = bundle.project_id diff --git a/runbot/models/build.py b/runbot/models/build.py index 8486b744..f929c172 100644 --- a/runbot/models/build.py +++ b/runbot/models/build.py @@ -756,6 +756,11 @@ class BuildResult(models.Model): self._log('_checkout', 'Multiple repo have same export path in build, some source may be missing for %s' % build_export_path, level='ERROR') self._kill(result='ko') exports[build_export_path] = commit.export() + + checkout_time = time.time() - start + if checkout_time > 60: + self._log('checkout', 'Checkout took %s seconds' % int(checkout_time)) + return exports def _get_available_modules(self): diff --git a/runbot/models/host.py b/runbot/models/host.py index bce208ec..ae251dd1 100644 --- a/runbot/models/host.py +++ b/runbot/models/host.py @@ -72,6 +72,7 @@ class Host(models.Model): def _docker_build(self): """ build docker images needed by locally pending builds""" + _logger.info('Building docker image...') self.ensure_one() static_path = self._get_work_path() self.clear_caches() # needed to ensure that content is updated on all hosts diff --git a/runbot/models/runbot.py b/runbot/models/runbot.py index 0f92822b..1dcdc5e3 100644 --- a/runbot/models/runbot.py +++ b/runbot/models/runbot.py @@ -7,6 +7,7 @@ import signal import subprocess import shutil +from contextlib import contextmanager from requests.exceptions import HTTPError from ..common import fqdn, dest_reg, os @@ -203,7 +204,7 @@ class Runbot(models.AbstractModel): This method is the default cron for new commit discovery and build sheduling. 
The cron runs for a long time to avoid spamming logs """ - pull_info_failures = set() + pull_info_failures = {} start_time = time.time() timeout = self._get_cron_period() get_param = self.env['ir.config_parameter'].get_param @@ -222,55 +223,74 @@ class Runbot(models.AbstractModel): self.env['runbot.build']._local_cleanup() self._docker_cleanup() _logger.info('Starting loop') - while time.time() - start_time < timeout: - repos = self.env['runbot.repo'].search([('mode', '!=', 'disabled')]) + if runbot_do_schedule or runbot_do_fetch: + while time.time() - start_time < timeout: + if runbot_do_fetch: + self._fetch_loop_turn(host, pull_info_failures) + if runbot_do_schedule: + sleep_time = self._scheduler_loop_turn(host, update_frequency) + self.sleep(sleep_time) + else: + self.sleep(update_frequency) + self._commit() - processing_batch = self.env['runbot.batch'].search([('state', 'in', ('preparing', 'ready'))], order='id asc') - preparing_batch = processing_batch.filtered(lambda b: b.state == 'preparing') - self._commit() - if runbot_do_fetch: - for repo in repos: - try: - repo._update_batches(bool(preparing_batch), ignore=pull_info_failures) - self._commit() - except HTTPError as e: - # Sometimes a pr pull info can fail. - # - Most of the time it is only temporary and it will be successfull on next try. - # - In some rare case the pr will always fail (github inconsistency) The pr exists in git (for-each-ref) but not on github api. - # For this rare case, we store the pr in memory in order to unstuck other pr/branches update. - # We consider that this error should not remain, in this case github needs to fix this inconsistency. - # Another solution would be to create the pr with fake pull info. This idea is not the best one - # since we want to avoid to have many pr with fake pull_info in case of temporary failure of github services. - # With this solution, the pr will be retried once every cron loop (~10 minutes). 
- # We dont except to have pr with this kind of persistent failure more than every few mounths/years. - self.env.cr.rollback() - self.env.clear() - pull_number = e.response.url.split('/')[-1] - pull_info_failures.add(pull_number) - self.warning('Pr pull info failed for %s', pull_number) - self._commit() - if processing_batch: - _logger.info('starting processing of %s batches', len(processing_batch)) - for batch in processing_batch: - batch._process() - self._commit() - _logger.info('end processing') - self._commit() - if runbot_do_schedule: - sleep_time = self._scheduler_loop_turn(host, update_frequency) - self.sleep(sleep_time) - else: - self.sleep(update_frequency) - self._commit() - - host.last_end_loop = fields.Datetime.now() + host.last_end_loop = fields.Datetime.now() def sleep(self, t): time.sleep(t) + def _fetch_loop_turn(self, host, pull_info_failures, default_sleep=1): + with self.manage_host_exception(host) as manager: + repos = self.env['runbot.repo'].search([('mode', '!=', 'disabled')]) + processing_batch = self.env['runbot.batch'].search([('state', 'in', ('preparing', 'ready'))], order='id asc') + preparing_batch = processing_batch.filtered(lambda b: b.state == 'preparing') + self._commit() + for repo in repos: + try: + repo._update_batches(force=bool(preparing_batch), ignore=pull_info_failures) + self._commit() # commit is mainly here to avoid to lose progression in case of fetch failure or concurrent update + except HTTPError as e: + # Sometimes a pr pull info can fail. + # - Most of the time it is only temporary and it will be successfull on next try. + # - In some rare case the pr will always fail (github inconsistency) The pr exists in git (for-each-ref) but not on github api. + # For this rare case, we store the pr in memory in order to unstuck other pr/branches update. + # We consider that this error should not remain, in this case github needs to fix this inconsistency. + # Another solution would be to create the pr with fake pull info. 
This idea is not the best one
+                    # since we want to avoid to have many pr with fake pull_info in case of temporary failure of github services.
+                    # With this solution, the pr will be retried once every cron loop (~10 minutes).
+                    # We don't expect to have prs with this kind of persistent failure more than every few months/years.
+                    self.env.cr.rollback()
+                    self.env.clear()
+                    pull_number = e.response.url.split('/')[-1]
+                    pull_info_failures[pull_number] = time.time()
+                    self.warning('Pr pull info failed for %s', pull_number)
+                    self._commit()
+
+            if processing_batch:
+                for batch in processing_batch:
+                    if batch._process():
+                        self._commit()
+                self._commit()
+
+            # cleanup old pull_info_failures
+            for pr_number, t in list(pull_info_failures.items()):
+                if t + 15*60 < time.time():
+                    _logger.warning('Removing %s from pull_info_failures', pr_number)
+                    del pull_info_failures[pr_number]
+
+        return manager.get('sleep', default_sleep)
+
     def _scheduler_loop_turn(self, host, default_sleep=1):
-        try:
+        _logger.info('Scheduling...')
+        with self.manage_host_exception(host) as manager:
             self._scheduler(host)
+        return manager.get('sleep', default_sleep)
+
+    @contextmanager
+    def manage_host_exception(self, host):
+        res = {}
+        try:
+            yield res
             host.last_success = fields.Datetime.now()
             self._commit()
         except Exception as e:
@@ -284,12 +304,11 @@ class Runbot(models.AbstractModel):
                 host.last_exception = str(e)
                 host.exception_count = 1
             self._commit()
-            return random.uniform(0, 3)
+            res['sleep'] = random.uniform(0, 3)
         else:
             if host.last_exception:
                 host.last_exception = ""
                 host.exception_count = 0
-            return default_sleep
 
     def _source_cleanup(self):
         try:
diff --git a/runbot_builder/builder.py b/runbot_builder/builder.py
index 8dd13f7b..6ea8a5ff 100755
--- a/runbot_builder/builder.py
+++ b/runbot_builder/builder.py
@@ -1,122 +1,23 @@
 #!/usr/bin/python3
-import argparse
+from tools import RunbotClient, run
 import logging
-import os
-import sys
-import threading
-import signal
-
-from logging.handlers 
import WatchedFileHandler - -LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' -logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) -logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG) -logging.addLevelName(25, "!NFO") _logger = logging.getLogger(__name__) +class BuilderClient(RunbotClient): -class RunbotClient(): + def on_start(self): + self.env['runbot.repo'].search([('mode', '!=', 'disabled')])._update() - def __init__(self, env): - self.env = env - self.ask_interrupt = threading.Event() - - def main_loop(self): - from odoo import fields - signal.signal(signal.SIGINT, self.signal_handler) - signal.signal(signal.SIGTERM, self.signal_handler) - signal.signal(signal.SIGQUIT, self.dump_stack) - host = self.env['runbot.host']._get_current() - host._bootstrap() - count = 0 - while True: - try: - host.last_start_loop = fields.Datetime.now() - count = count % 60 - if count == 0: - logging.info('Host %s running with %s slots on pid %s%s', host.name, host.nb_worker, os.getpid(), ' (assigned only)' if host.assigned_only else '') - self.env['runbot.runbot']._source_cleanup() - self.env['runbot.build']._local_cleanup() - self.env['runbot.runbot']._docker_cleanup() - host.set_psql_conn_count() - _logger.info('Building docker image...') - host._docker_build() - _logger.info('Scheduling...') - count += 1 - sleep_time = self.env['runbot.runbot']._scheduler_loop_turn(host) - host.last_end_loop = fields.Datetime.now() - self.env.cr.commit() - self.env.clear() - self.sleep(sleep_time) - except Exception as e: - _logger.exception('Builder main loop failed with: %s', e) - self.env.cr.rollback() - self.env.clear() - self.sleep(10) - - if self.ask_interrupt.is_set(): - return - - def signal_handler(self, signal, frame): - if self.ask_interrupt.is_set(): - _logger.info("Second Interrupt detected, force exit") - os._exit(1) - - _logger.info("Interrupt detected") - self.ask_interrupt.set() - - def dump_stack(self, signal, frame): - import odoo - 
odoo.tools.misc.dumpstacks() - - def sleep(self, t): - self.ask_interrupt.wait(t) - - -def run(): - # parse args - parser = argparse.ArgumentParser() - parser.add_argument('--odoo-path', help='Odoo sources path') - parser.add_argument('--db_host', default='127.0.0.1') - parser.add_argument('--db_port', default='5432') - parser.add_argument('--db_user') - parser.add_argument('--db_password') - parser.add_argument('-d', '--database', default='runbot', help='name of runbot db') - parser.add_argument('--logfile', default=False) - args = parser.parse_args() - if args.logfile: - dirname = os.path.dirname(args.logfile) - if dirname and not os.path.isdir(dirname): - os.makedirs(dirname) - - handler = WatchedFileHandler(args.logfile) - formatter = logging.Formatter(LOG_FORMAT) - handler.setFormatter(formatter) - logging.getLogger().addHandler(handler) - - # configure odoo - sys.path.append(args.odoo_path) - import odoo - _logger.info("Starting scheduler on database %s", args.database) - odoo.tools.config['db_host'] = args.db_host - odoo.tools.config['db_port'] = args.db_port - odoo.tools.config['db_user'] = args.db_user - odoo.tools.config['db_password'] = args.db_password - addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) - config_addons_path = odoo.tools.config['addons_path'] - odoo.tools.config['addons_path'] = ','.join([config_addons_path, addon_path]) - - # create environment - registry = odoo.registry(args.database) - with odoo.api.Environment.manage(): - with registry.cursor() as cr: - env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {}) - runbot_client = RunbotClient(env) - # run main loop - runbot_client.main_loop() + def loop_turn(self): + if self.count == 1: # cleanup at second iteration + self.env['runbot.runbot']._source_cleanup() + self.env['runbot.build']._local_cleanup() + self.env['runbot.runbot']._docker_cleanup() + self.host.set_psql_conn_count() + self.host._docker_build() + return 
self.env['runbot.runbot']._scheduler_loop_turn(self.host) if __name__ == '__main__': - run() - _logger.info("Stopping gracefully") + run(BuilderClient) diff --git a/runbot_builder/leader.py b/runbot_builder/leader.py new file mode 100755 index 00000000..6a21ced9 --- /dev/null +++ b/runbot_builder/leader.py @@ -0,0 +1,18 @@ +#!/usr/bin/python3 +from tools import RunbotClient, run +import logging +import time + +_logger = logging.getLogger(__name__) + +class LeaderClient(RunbotClient): # Conductor, Director, Main, Maestro, Lead + def __init__(self, env): + self.pull_info_failures = {} + super().__init__(env) + + def loop_turn(self): + return self.env['runbot.runbot']._fetch_loop_turn(self.host, self.pull_info_failures) + + +if __name__ == '__main__': + run(LeaderClient) diff --git a/runbot_builder/tester.py b/runbot_builder/tester.py new file mode 100755 index 00000000..3d0629d3 --- /dev/null +++ b/runbot_builder/tester.py @@ -0,0 +1,16 @@ +#!/usr/bin/python3 +from tools import RunbotClient, run +import logging + +_logger = logging.getLogger(__name__) + +class TesterClient(RunbotClient): + + def loop_turn(self): + _logger.info('='*50) + _logger.info('Testing: %s', self.env['runbot.build'].search_count([('local_state', '=', 'testing')])) + _logger.info('Pending: %s', self.env['runbot.build'].search_count([('local_state', '=', 'pending')])) + return 10 + +if __name__ == '__main__': + run(TesterClient) diff --git a/runbot_builder/tools.py b/runbot_builder/tools.py new file mode 100644 index 00000000..fc0a5ae6 --- /dev/null +++ b/runbot_builder/tools.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +import argparse +import logging +import os +import sys +import threading +import signal + +from logging.handlers import WatchedFileHandler + +LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=LOG_FORMAT) +logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG) +logging.addLevelName(25, "!NFO") + +_logger = 
logging.getLogger(__name__) + + +class RunbotClient(): + + def __init__(self, env): + self.env = env + self.ask_interrupt = threading.Event() + self.host = None + self.count = 0 + self.max_count = 60 + + def on_start(self): + pass + + def main_loop(self): + from odoo import fields + self.on_start() + signal.signal(signal.SIGINT, self.signal_handler) + signal.signal(signal.SIGTERM, self.signal_handler) + signal.signal(signal.SIGQUIT, self.dump_stack) + self.host = self.env['runbot.host']._get_current() + self.host._bootstrap() + logging.info( + 'Host %s running with %s slots on pid %s%s', + self.host.name, + self.host.nb_worker, + os.getpid(), + ' (assigned only)' if self.host.assigned_only else '' + ) + while True: + try: + self.host.last_start_loop = fields.Datetime.now() + self.count = self.count % self.max_count + sleep_time = self.loop_turn() + self.count += 1 + self.host.last_end_loop = fields.Datetime.now() + self.env.cr.commit() + self.env.clear() + self.sleep(sleep_time) + except Exception as e: + _logger.exception('Builder main loop failed with: %s', e) + self.env.cr.rollback() + self.env.clear() + self.sleep(10) + if self.ask_interrupt.is_set(): + return + + def loop_turn(self): + raise NotImplementedError() + + def signal_handler(self, _signal, _frame): + if self.ask_interrupt.is_set(): + _logger.info("Second Interrupt detected, force exit") + os._exit(1) + + _logger.info("Interrupt detected") + self.ask_interrupt.set() + + def dump_stack(self, _signal, _frame): + import odoo + odoo.tools.misc.dumpstacks() + + def sleep(self, t): + self.ask_interrupt.wait(t) + + +def run(client_class): + # parse args + parser = argparse.ArgumentParser() + parser.add_argument('--odoo-path', help='Odoo sources path') + parser.add_argument('--db_host') + parser.add_argument('--db_port') + parser.add_argument('--db_user') + parser.add_argument('--db_password') + parser.add_argument('-d', '--database', default='runbot', help='name of runbot db') + 
parser.add_argument('--logfile', default=False) + args = parser.parse_args() + if args.logfile: + dirname = os.path.dirname(args.logfile) + if dirname and not os.path.isdir(dirname): + os.makedirs(dirname) + + handler = WatchedFileHandler(args.logfile) + formatter = logging.Formatter(LOG_FORMAT) + handler.setFormatter(formatter) + logging.getLogger().addHandler(handler) + + # configure odoo + sys.path.append(args.odoo_path) + import odoo + _logger.info("Starting scheduler on database %s", args.database) + odoo.tools.config['db_host'] = args.db_host + odoo.tools.config['db_port'] = args.db_port + odoo.tools.config['db_user'] = args.db_user + odoo.tools.config['db_password'] = args.db_password + addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) + config_addons_path = odoo.tools.config['addons_path'] + odoo.tools.config['addons_path'] = ','.join([config_addons_path, addon_path]) + + # create environment + registry = odoo.registry(args.database) + with odoo.api.Environment.manage(): + with registry.cursor() as cr: + env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {}) + client = client_class(env) + # run main loop + client.main_loop() + _logger.info("Stopping gracefully")