[FIX] runbot_merge: handle missing patch commits

Commits can take some time to propagate through the network (I guess), or human error can lead to the wrong commit being set. Either way, because the entire thing was done using a single fetch in `check=True` the cron job would fail entirely if any of the patch commits was yet unavailable. Update the updater to: - fallback on individual fetches - remove the patch from the set of applicable patch if we (still) can't find its commit I'd have hoped `fetch` could retrieve whatever it found, but apparently the server just crashes out when it doesn't find the commit we ask for, and `fetch` doesn't update anything. No linked issue because I apparently forgot to jot it down (and only remembered about this issue with the #1063 patching issue) but this was reported by mat last week (2025-02-21) when they were wondering why one of their patches was taking a while: - At 0832 patch was created by automated script. - At 0947, an attempt to apply was made, the commit was not found. - At 1126, a second attempt was made but an other patch had been created whose commit was not found, failing both. - At 1255, there was a concurrency error ("cannot lock ref" on the target branch). - Finally at 1427 the patch was applied. All in all it took 6 hours to apply the patch, which is 3-4 staging cycles.
2025-03-15 15:35:46 +07:00 · 2025-02-25 14:00:45 +01:00 · 2025-02-25 14:00:45 +01:00 · e6057afde7
commit e6057afde7
parent 5d08b79550
2 changed files with 60 additions and 8 deletions
--- a/runbot_merge/models/patcher.py
+++ b/runbot_merge/models/patcher.py
@ -6,6 +6,7 @@ overhead, or FBI backdoors oh wait forget about that last one.
 """
 from __future__ import annotations

+import collections
 import contextlib
 import logging
 import pathlib
@ -14,6 +15,7 @@ import subprocess
 import tarfile
 import tempfile
 from dataclasses import dataclass
+from datetime import timedelta
 from email import message_from_string, policy
 from email.utils import parseaddr
 from typing import Union
@ -241,24 +243,42 @@ class Patch(models.Model):

    def _apply_patches(self, target: Branch) -> bool:
        patches = self.search([('target', '=', target.id)], order='id asc')
-        if not patches:
+        selected = len(patches)
+        if not selected:
            return True

-        commits = {}
+        commits = collections.defaultdict(set)
        repos = {}
        for p in patches:
-            repos[p.repository] = git.get_local(p.repository).check(True)
-            commits.setdefault(p.repository, set())
+            if p.repository not in repos:
+                repos[p.repository] = git.get_local(p.repository).check(True)
            if p.commit:
-                commits[p.repository].add(p.commit)
+                commits[p.repository].add(p)
+
+        for repo, ps in commits.items():
+            r = repos[repo].check(False)
+            remote = git.source_url(repo)
+            if r.fetch(
+                remote,
+                f"+refs/heads/{target.name}:refs/heads/{target.name}",
+                *(p.commit for p in ps),
+                no_tags=True
+            ).returncode:
+                r.fetch(remote, f"+refs/heads/{target.name}:refs/heads/{target.name}", no_tags=True)
+                for p in ps:
+                    if r.fetch(remote, p.commit, no_tags=True).returncode:
+                        patches -= p
+                        p.message_post(body=f"Unable to apply patch: commit {p.commit} not found.")
+        # if some of the commits are not available (yet) schedule a new staging
+        # in case this is a low traffic period (so there might not be staging
+        # triggers every other minute
+        if len(patches) < selected:
+            self.env.ref('runbot_merge.staging_cron')._trigger(fields.Datetime.now() + timedelta(minutes=30))

        for patch in patches:
            patch.active = False
            r = repos[patch.repository]
            remote = git.source_url(patch.repository)
-            if (cs := commits.pop(patch.repository, None)) is not None:
-                # first time encountering a repo, fetch the branch and commits
-                r.fetch(remote, f"+refs/heads/{target.name}:refs/heads/{target.name}", *cs, no_tags=True)

            _logger.info(
                "Applying %s to %r (in %s)",
--- a/runbot_merge/tests/test_patching.py
+++ b/runbot_merge/tests/test_patching.py
@ -193,6 +193,38 @@ CONFLICT (content): Merge conflict in b<br></p>\
        (False, '', [('active', 1, 0)]),
    ]

+def test_apply_not_found(env, project, repo, users):
+    """ Github can take some time to propagate commits through the network,
+    resulting in patches getting not found and killing the application.
+
+    Commits which are not found should just be skipped (and trigger a new
+    staging?).
+    """
+    with repo:
+        [c] = repo.make_commits("x", Commit("c", tree={"b": "2"}), ref="heads/abranch")
+        repo.delete_ref('heads/abranch')
+
+    p1 = env['runbot_merge.patch'].create({
+        'target': project.branch_ids.id,
+        'repository': project.repo_ids.id,
+        'commit': c,
+    })
+    # simulate commit which hasn't propagated yet
+    p2 = env['runbot_merge.patch'].create({
+        'target': project.branch_ids.id,
+        'repository': project.repo_ids.id,
+        'commit': "0123456789012345678901234567890123456789",
+    })
+
+    env.run_crons()
+
+    assert not p1.active
+    assert p2.active
+    assert p2.message_ids.mapped('body')[::-1] == [
+        "<p>Unstaged direct-application patch created</p>",
+        "<p>Unable to apply patch: commit 0123456789012345678901234567890123456789 not found.</p>",
+    ]
+
 def test_apply_udiff(env, project, repo, users):
    p = env['runbot_merge.patch'].create({
        'target': project.branch_ids.id,