From b517df4ff0b4e1c4772875c35bf6991a55c9fc85 Mon Sep 17 00:00:00 2001
From: Christophe Monniez <moc@odoo.com>
Date: Mon, 20 Apr 2020 11:39:06 +0200
Subject: [PATCH] [IMP] runbot: try to fetch multiple times before disabling

Sometimes, it happens that a `git fetch` fails with an error code 128
for example. When this happens, the runbot host is immediately disabled.

During investigations of such cases, we found that simply retrying the
fetch command works.

With this commit, the fetch command is tried 5 times with an increasing
delay before deciding to disable the runbot host.
---
 runbot/models/repo.py     | 26 +++++++++++++++-------
 runbot/tests/test_repo.py | 45 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/runbot/models/repo.py b/runbot/models/repo.py
index ef4bb1b1..983c5605 100644
--- a/runbot/models/repo.py
+++ b/runbot/models/repo.py
@@ -175,7 +175,7 @@ class runbot_repo(models.Model):
         self.ensure_one()
         _logger.debug("git command: git (dir %s) %s", self.short_name, ' '.join(cmd))
         cmd = ['git', '--git-dir=%s' % self.path] + cmd
-        return subprocess.check_output(cmd).decode('utf-8')
+        return subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode()
 
     def _git_rev_parse(self, branch_name):
         return self._git(['rev-parse', branch_name]).strip()
@@ -425,13 +425,23 @@ class runbot_repo(models.Model):
         # Extracted from update_git to be easily overriden in external module
         self.ensure_one()
         repo = self
-        try:
-            repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
-        except subprocess.CalledProcessError as e:
-            message = 'Failed to fetch repo %s with return code %s. Original command was %s' % (repo.name, e.returncode, e.cmd)
-            _logger.exception(message)
-            host = self.env['runbot.host'].search([('name', '=', fqdn())])
-            host.disable()
+        try_count = 0
+        failure = True
+        delay = 0
+
+        while failure and try_count < 5:
+            time.sleep(delay)
+            try:
+                repo._git(['fetch', '-p', 'origin', '+refs/heads/*:refs/heads/*', '+refs/pull/*/head:refs/pull/*'])
+                failure = False
+            except subprocess.CalledProcessError as e:
+                try_count += 1
+                delay = delay * 1.5 if delay else 0.5
+                if try_count > 4:
+                    message = 'Failed to fetch repo %s: %s' % (repo.name, e.output.decode())
+                    _logger.exception(message)
+                    host = self.env['runbot.host']._get_current()
+                    host.disable()
 
     def _update(self, force=True):
         """ Update the physical git reposotories on FS"""
diff --git a/runbot/tests/test_repo.py b/runbot/tests/test_repo.py
index 5111de2f..34b91859 100644
--- a/runbot/tests/test_repo.py
+++ b/runbot/tests/test_repo.py
@@ -2,7 +2,9 @@
 import datetime
 from unittest import skip
 from unittest.mock import patch, Mock
+from subprocess import CalledProcessError
 from odoo.tests import common, TransactionCase
+from odoo.tools import mute_logger
 import logging
 import odoo
 import time
@@ -211,7 +213,6 @@ class Test_Repo(RunbotCase):
         _test_times('runbot.repo.reftime', 'set_ref_time', 'get_ref_time')
 
 
-
 class Test_Github(TransactionCase):
     def test_github(self):
         """ Test different github responses or failures"""
@@ -239,6 +240,48 @@ class Test_Github(TransactionCase):
             self.assertEqual(2, mock_session.return_value.post.call_count, "_github method should try two times by default")
 
 
+class TestFetch(RunbotCase):
+
+    def setUp(self):
+        super(TestFetch, self).setUp()
+        self.mock_root = self.patchers['repo_root_patcher']
+
+    def test_update_fetch_cmd(self):
+        """ Test that git fetch is tried multiple times before disabling host """
+
+        fetch_count = 0
+        force_failure = False
+
+        def git_side_effect(cmd):
+            nonlocal fetch_count
+            fetch_count += 1
+            if fetch_count < 3 or force_failure:
+                raise CalledProcessError(128, cmd, 'Dummy Error'.encode('utf-8'))
+            else:
+                return True
+
+        git_patcher = self.patchers['git_patcher']
+        git_patcher.side_effect = git_side_effect
+
+        repo = self.Repo.create({'name': 'bla@example.com:foo/bar'})
+        host = self.env['runbot.host']._get_current()
+
+        self.assertFalse(host.assigned_only)
+        # Ensure that Host is not disabled if fetch succeeds after 3 tries
+        with mute_logger("odoo.addons.runbot.models.repo"):
+            repo._update_fetch_cmd()
+        self.assertFalse(host.assigned_only, "Host should not be disabled when fetch succeeds")
+        self.assertEqual(fetch_count, 3)
+
+        # Now ensure that host is disabled after 5 unsuccesful tries
+        force_failure = True
+        fetch_count = 0
+        with mute_logger("odoo.addons.runbot.models.repo"):
+            repo._update_fetch_cmd()
+        self.assertTrue(host.assigned_only)
+        self.assertEqual(fetch_count, 5)
+
+
 class Test_Repo_Scheduler(RunbotCase):
 
     def setUp(self):