From 5d3a2de698ddd4578da830b771665b82592b44a1 Mon Sep 17 00:00:00 2001
From: Xavier-Do <xdo@odoo.com>
Date: Mon, 17 May 2021 12:03:04 +0200
Subject: [PATCH] [FIX] runbot: don't fail all branch discovery if one pull
 info fails.

Sometimes a pr pull info can fail.
- Most of the time it is only temporary and it will be successfull on next try.
- In some rare case the pr will always fail (github inconsistency) The pr exist in git but not on github api.
For this rare case, we store the pr in memory in order to unstuck other pr/branches update.
We consider that this error should not remain, in this case github needs to fix the inconsistency.
This is why the runbot model don't handle such a case for now.
Another solution would be to create the pr with fake pull info. This idea is not the best one
since we want to avoid to have many pr with fake pull_info in case of temporary failure of giothub services.
With this solution, the pr will be retried once every cron loop.
We dont except to have pr with this kind of persistent failure more than every few mounths/years.
---
 runbot/models/repo.py          | 11 +++++------
 runbot/models/runbot.py        | 30 +++++++++++++++++++++++++++---
 runbot/views/warning_views.xml |  3 ++-
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/runbot/models/repo.py b/runbot/models/repo.py
index 0fd03c22..cddb39d9 100644
--- a/runbot/models/repo.py
+++ b/runbot/models/repo.py
@@ -345,13 +345,12 @@ class Repo(models.Model):
             return os.path.getmtime(fname_fetch_head)
         return 0
 
-    def _get_refs(self, max_age=30):
+    def _get_refs(self, max_age=30, ignore=None):
         """Find new refs
         :return: list of tuples with following refs informations:
         name, sha, date, author, author_email, subject, committer, committer_email
         """
         self.ensure_one()
-
         get_ref_time = round(self._get_fetch_head_time(), 4)
         if not self.get_ref_time or get_ref_time > self.get_ref_time:
             try:
@@ -367,6 +366,8 @@ class Repo(models.Model):
                     return []
                 refs = [tuple(field for field in line.split('\x00')) for line in git_refs.split('\n')]
                 refs = [r for r in refs if dateutil.parser.parse(r[2][:19]) + datetime.timedelta(days=max_age) > datetime.datetime.now()]
+                if ignore:
+                    refs = [r for r in refs if r[0].split('/')[-1] not in ignore]
                 return refs
             except Exception:
                 _logger.exception('Fail to get refs for repo %s', self.name)
@@ -401,7 +402,6 @@ class Repo(models.Model):
                 _logger.info('new branch %s found in %s', name, self.name)
         if new_branch_values:
             _logger.info('Creating new branches')
-            # TODO ASAP dont fail all if pr status fail, aka github is dans les choux
             new_branches = self.env['runbot.branch'].create(new_branch_values)
             for branch in new_branches:
                 ref_branches[branch.ref()] = branch
@@ -457,13 +457,13 @@ class Repo(models.Model):
                 if bundle.last_batch.state == 'preparing':
                     bundle.last_batch._new_commit(branch)
 
-    def _update_batches(self, force=False):
+    def _update_batches(self, force=False, ignore=None):
         """ Find new commits in physical repos"""
         updated = False
         for repo in self:
             if repo.remote_ids and self._update(poll_delay=30 if force else 60*5):
                 max_age = int(self.env['ir.config_parameter'].get_param('runbot.runbot_max_age', default=30))
-                ref = repo._get_refs(max_age)
+                ref = repo._get_refs(max_age, ignore=ignore)
                 ref_branches = repo._find_or_create_branches(ref)
                 repo._find_new_commits(ref, ref_branches)
                 updated = True
@@ -550,7 +550,6 @@ class Repo(models.Model):
             except Exception:
                 _logger.exception('Fail to update repo %s', repo.name)
 
-
 class RefTime(models.Model):
     _name = 'runbot.repo.reftime'
     _description = "Repo reftime"
diff --git a/runbot/models/runbot.py b/runbot/models/runbot.py
index 8dfc9d43..8b1ab09e 100644
--- a/runbot/models/runbot.py
+++ b/runbot/models/runbot.py
@@ -7,6 +7,8 @@ import signal
 import subprocess
 import shutil
 
+from requests.exceptions import HTTPError
+
 from ..common import fqdn, dest_reg, os
 from ..container import docker_ps, docker_stop
 
@@ -17,7 +19,6 @@ from odoo.modules.module import get_module_resource
 
 _logger = logging.getLogger(__name__)
 
-
 # after this point, not realy a repo buisness
 class Runbot(models.AbstractModel):
     _name = 'runbot.runbot'
@@ -202,6 +203,7 @@ class Runbot(models.AbstractModel):
         This method is the default cron for new commit discovery and build sheduling.
         The cron runs for a long time to avoid spamming logs
         """
+        pull_info_failures = set()
         start_time = time.time()
         timeout = self._get_cron_period()
         get_param = self.env['ir.config_parameter'].get_param
@@ -228,8 +230,25 @@ class Runbot(models.AbstractModel):
             self._commit()
             if runbot_do_fetch:
                 for repo in repos:
-                    repo._update_batches(bool(preparing_batch))
-                    self._commit()
+                    try:
+                        repo._update_batches(bool(preparing_batch), ignore=pull_info_failures)
+                        self._commit()
+                    except HTTPError as e:
+                        # Sometimes a pr pull info can fail.
+                        # - Most of the time it is only temporary and it will be successfull on next try. 
+                        # - In some rare case the pr will always fail (github inconsistency) The pr exists in git (for-each-ref) but not on github api.
+                        # For this rare case, we store the pr in memory in order to unstuck other pr/branches update.
+                        # We consider that this error should not remain, in this case github needs to fix this inconsistency.
+                        # Another solution would be to create the pr with fake pull info. This idea is not the best one
+                        # since we want to avoid to have many pr with fake pull_info in case of temporary failure of github services.
+                        # With this solution, the pr will be retried once every cron loop (~10 minutes).
+                        # We dont except to have pr with this kind of persistent failure more than every few mounths/years.
+                        self.env.cr.rollback()
+                        self.env.clear()
+                        pull_number = e.response.url.split('/')[-1]
+                        pull_info_failures.add(pull_number)
+                        self.warning('Pr pull info failed for %s', pull_number)
+                        self._commit()
             if processing_batch:
                 _logger.info('starting processing of %s batches', len(processing_batch))
                 for batch in processing_batch:
@@ -335,6 +354,9 @@ class Runbot(models.AbstractModel):
     def warning(self, message, *args):
         if args:
             message = message % args
+        existing = self.env['runbot.warning'].search([('message', '=', message)])
+        if existing:
+            existing.count += 1
         return self.env['runbot.warning'].create({'message': message})
 
 
@@ -342,8 +364,10 @@ class RunbotWarning(models.Model):
     """
     Generic Warnings for runbot
     """
+    _order = 'write_date desc, id desc'
 
     _name = 'runbot.warning'
     _description = 'Generic Runbot Warning'
 
     message = fields.Char("Warning", index=True)
+    count = fields.Integer("Count", default=1)
diff --git a/runbot/views/warning_views.xml b/runbot/views/warning_views.xml
index 25aa54ec..2a983d76 100644
--- a/runbot/views/warning_views.xml
+++ b/runbot/views/warning_views.xml
@@ -5,8 +5,9 @@
       <field name="model">runbot.warning</field>
       <field name="arch" type="xml">
         <tree string="Runbot Warnings">
-          <field name="create_date"/>
+          <field name="write_date"/>
           <field name="message"/>
+          <field name="count"/>
         </tree>
       </field>
     </record>