[IMP] runbot_merge: retry FF on failure in case it's transient

Further improvements are possible, but that seems like a good
start (hopefully).

closes #94
Xavier Morel 2019-02-28 16:09:07 +01:00
parent 42046cb21c
commit 79b03a6995


@@ -1226,35 +1226,7 @@ class Stagings(models.Model):
        repo_name = None
        staging_heads = json.loads(self.heads)
        try:
            # reverting updates doesn't work if the branches are
            # protected (because a revert is basically a force
            # push), instead use the tmp branch as a dry-run
            tmp_target = 'tmp.' + self.target.name
            # first force-push the current targets to all tmps
            for repo_name in staging_heads.keys():
                if repo_name.endswith('^'):
                    continue
                g = gh[repo_name]
                g.set_ref(tmp_target, g.head(self.target.name))

            # then attempt to FF the tmp to the staging
            for repo_name, head in staging_heads.items():
                if repo_name.endswith('^'):
                    continue
                gh[repo_name].fast_forward(tmp_target, staging_heads.get(repo_name + '^') or head)

            # there is still a race condition here, but it's way
            # lower than "the entire staging duration"...
            for repo_name, head in staging_heads.items():
                if repo_name.endswith('^'):
                    continue
                # if the staging has a $repo^ head, merge that,
                # otherwise merge the regular (CI'd) head
                gh[repo_name].fast_forward(
                    self.target.name,
                    staging_heads.get(repo_name + '^') or head
                )
            repo_name = self._safety_dance(gh, staging_heads)
        except exceptions.FastForwardError as e:
            logger.warning(
                "Could not fast-forward successful staging on %s:%s",
@@ -1285,6 +1257,67 @@ class Stagings(models.Model):
        elif self.state == 'failure' or project.is_timed_out(self):
            self.try_splitting()

    def _safety_dance(self, gh, staging_heads):
        """ Reverting updates doesn't work if the branches are protected
        (because a revert is basically a force push). So we can update
        REPO_A, then fail to update REPO_B for some reason, and we're hosed.

        To try and make this issue less likely, do the safety dance:

        * First, perform a dry run using the tmp branches (which can be
          force-pushed and sacrificed); that way, if somebody pushed directly
          to REPO_B during the staging we catch it. If we're really unlucky
          they could still push after the dry run, but...
        * Another issue is that the GitHub call sometimes fails for no
          noticeable reason (e.g. network failure or whatnot); if it fails
          on REPO_B when REPO_A has already been updated, things get pretty
          bad. In that case, wait a bit and retry for now. A more complex
          strategy (including disabling the branch entirely until somebody
          has looked at and fixed the issue) might be necessary.

        :returns: the last repo it tried to update (probably the one on which
                  it failed, if it failed)
        """
        # FIXME: would make sense for FFE to be richer, and contain the repo name
        repo_name = None
        tmp_target = 'tmp.' + self.target.name
        # first force-push the current targets to all tmps
        for repo_name in staging_heads.keys():
            if repo_name.endswith('^'):
                continue
            g = gh[repo_name]
            g.set_ref(tmp_target, g.head(self.target.name))

        # then attempt to FF the tmp to the staging
        for repo_name, head in staging_heads.items():
            if repo_name.endswith('^'):
                continue
            gh[repo_name].fast_forward(tmp_target, staging_heads.get(repo_name + '^') or head)

        # there is still a race condition here, but it's way
        # lower than "the entire staging duration"...
        first = True
        for repo_name, head in staging_heads.items():
            if repo_name.endswith('^'):
                continue

            for pause in [0.1, 0.3, 0.5, 0.9, 0]:  # last one must be 0/falsy or we lose the exception
                try:
                    # if the staging has a $repo^ head, merge that,
                    # otherwise merge the regular (CI'd) head
                    gh[repo_name].fast_forward(
                        self.target.name,
                        staging_heads.get(repo_name + '^') or head
                    )
                except exceptions.FastForwardError:
                    # The GH API regularly fails us. If the failure does not
                    # occur on the first repository, retry a few times with a
                    # little pause.
                    if not first and pause:
                        time.sleep(pause)
                        continue
                    raise
            first = False
        return repo_name

class Split(models.Model):
    _name = 'runbot_merge.split'
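
For readers who want the retry pattern in isolation, the sketch below distills the loop added above into a standalone helper. It is a hedged illustration only: the names (ff_with_retry, PAUSES, the local FastForwardError stand-in) are invented for the example and are not part of runbot_merge's API.

import time

# The last pause must be falsy so the final failure propagates instead of
# being swallowed by the retry branch (same constraint as in the commit).
PAUSES = [0.1, 0.3, 0.5, 0.9, 0]


class FastForwardError(Exception):
    """Stand-in for runbot_merge's exceptions.FastForwardError."""


def ff_with_retry(fast_forward, may_retry=True, pauses=PAUSES):
    """Call fast_forward(); on transient failure, sleep briefly and retry.

    may_retry mirrors the `not first` condition in the commit: per the
    docstring above, retrying only matters once an earlier repository has
    already been fast-forwarded, otherwise failing fast is safe.
    """
    for pause in pauses:
        try:
            return fast_forward()
        except FastForwardError:
            if may_retry and pause:
                time.sleep(pause)
                continue
            raise

A caller would wrap the per-repository call in a closure, e.g. ff_with_retry(lambda: gh[repo_name].fast_forward(target, head), may_retry=not first), keeping the sleep-and-retry policy in one place.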