[IMP] runbot: try to fetch multiple times before disabling

Sometimes a `git fetch` fails, for example with exit code 128. When this happens, the runbot host is immediately disabled. While investigating such cases, we found that simply retrying the fetch command works. With this commit, the fetch command is tried up to 5 times, with an increasing delay between attempts, before the runbot host is disabled.
2025-03-17 08:25:46 +07:00 · 2020-04-20 11:39:06 +02:00 · 2020-04-20 11:39:06 +02:00 · b517df4ff0
commit b517df4ff0
parent c005561546
2 changed files with 62 additions and 9 deletions
--- a/runbot/models/repo.py
+++ b/runbot/models/repo.py
@ -175,7 +175,7 @@ class runbot_repo(models.Model):
        self.ensure_one()
        _logger.debug("git command: git (dir %s) %s", self.short_name, ' '.join(cmd))
        cmd = ['git', '--git-dir=%s' % self.path] + cmd
-        return subprocess.check_output(cmd).decode('utf-8')
+        return subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()

    def _git_rev_parse(self, branch_name):
        return self._git(['rev-parse', branch_name]).strip()
@ -425,13 +425,23 @@ class runbot_repo(models.Model):
        # Extracted from update_git to be easily overriden in external module
        self.ensure_one()
        repo = self
-        try:
-            repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
-        except subprocess.CalledProcessError as e:
-            message = 'Failed to fetch repo %s with return code %s. Original command was %s' % (repo.name, e.returncode, e.cmd)
-            _logger.exception(message)
-            host = self.env['runbot.host'].search([('name', '=', fqdn())])
-            host.disable()
+        try_count = 0
+        failure = True
+        delay = 0
+
+        while failure and try_count < 5:
+            time.sleep(delay)
+            try:
+                repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
+                failure = False
+            except subprocess.CalledProcessError as e:
+                try_count += 1
+                delay = delay * 1.5 if delay else 0.5
+                if try_count > 4:
+                    message = 'Failed to fetch repo %s: %s' % (repo.name, e.output.decode())
+                    _logger.exception(message)
+                    host = self.env['runbot.host']._get_current()
+                    host.disable()

    def _update(self, force=True):
        """ Update the physical git reposotories on FS"""
--- a/runbot/tests/test_repo.py
+++ b/runbot/tests/test_repo.py
@ -2,7 +2,9 @@
 import datetime
 from unittest import skip
 from unittest.mock import patch, Mock
+from subprocess import CalledProcessError
 from odoo.tests import common, TransactionCase
+from odoo.tools import mute_logger
 import logging
 import odoo
 import time
@ -211,7 +213,6 @@ class Test_Repo(RunbotCase):
        _test_times('runbot.repo.reftime', 'set_ref_time', 'get_ref_time')


-
 class Test_Github(TransactionCase):
    def test_github(self):
        """ Test different github responses or failures"""
@ -239,6 +240,48 @@ class Test_Github(TransactionCase):
            self.assertEqual(2, mock_session.return_value.post.call_count, "_github method should try two times by default")


+class TestFetch(RunbotCase):
+
+    def setUp(self):
+        super(TestFetch, self).setUp()
+        self.mock_root = self.patchers['repo_root_patcher']
+
+    def test_update_fetch_cmd(self):
+        """ Test that git fetch is tried multiple times before disabling host """
+
+        fetch_count = 0
+        force_failure = False
+
+        def git_side_effect(cmd):
+            nonlocal fetch_count
+            fetch_count += 1
+            if fetch_count < 3 or force_failure:
+                raise CalledProcessError(128, cmd, 'Dummy Error'.encode('utf-8'))
+            else:
+                return True
+
+        git_patcher = self.patchers['git_patcher']
+        git_patcher.side_effect = git_side_effect
+
+        repo = self.Repo.create({'name': 'bla@example.com:foo/bar'})
+        host = self.env['runbot.host']._get_current()
+
+        self.assertFalse(host.assigned_only)
+        # Ensure that Host is not disabled if fetch succeeds after 3 tries
+        with mute_logger("odoo.addons.runbot.models.repo"):
+            repo._update_fetch_cmd()
+        self.assertFalse(host.assigned_only, "Host should not be disabled when fetch succeeds")
+        self.assertEqual(fetch_count, 3)
+
+        # Now ensure that host is disabled after 5 unsuccessful tries
+        force_failure = True
+        fetch_count = 0
+        with mute_logger("odoo.addons.runbot.models.repo"):
+            repo._update_fetch_cmd()
+        self.assertTrue(host.assigned_only)
+        self.assertEqual(fetch_count, 5)
+
+
 class Test_Repo_Scheduler(RunbotCase):

    def setUp(self):