[FIX] *: unstage on status going from success to failure

And unconditionally unstage when the HEAD of a PR is synchronised. While a rebuild on a PR which was previously staged can be a false positive (e.g. because it hit a non-derministic test the second time around) it can also be legitimate e.g. auto-rebase of an old PR to check it out. In that case the PR should be unstaged. Furthermore, as soon as the PR gets rebuilt it goes back into `approved` (because the status goes to pending and currently there's no great way to suppress that in the rebuild case without also fucking it up for the sync case). This would cause a sync on the PR to be missed (in that it would not unstage the PR), which is broken. Fix that by not checking the state of the PR beforehand, it seems to be an unnecessary remnant of older code, and not really an optimisation (or at least one likely not worth bothering with, especially as we then proceed to perform a full PR validation anyway). Fixes #950
2025-03-15 15:35:46 +07:00 · 2024-09-18 15:19:13 +02:00 · 2024-09-18 15:19:13 +02:00 · c8a06601a7
commit c8a06601a7
parent a046cf2f7c
5 changed files with 121 additions and 9 deletions
--- a/forwardport/tests/test_weird.py
+++ b/forwardport/tests/test_weird.py
@ -3,7 +3,7 @@ from datetime import datetime, timedelta

 import pytest

-from utils import seen, Commit, to_pr, make_basic
+from utils import seen, Commit, to_pr, make_basic, prevent_unstaging


 def test_no_token(env, config, make_repo):
@ -935,7 +935,8 @@ def test_missing_magic_ref(env, config, make_repo):
    pr_id = to_pr(env, pr)
    assert pr_id.staging_id

-    pr_id.head = '0'*40
+    with prevent_unstaging(pr_id.staging_id):
+        pr_id.head = '0'*40
    with prod:
        prod.post_status('staging.a', 'success', 'legal/cla')
        prod.post_status('staging.a', 'success', 'ci/runbot')
--- a/mergebot_test_utils/utils.py
+++ b/mergebot_test_utils/utils.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import contextlib
 import itertools
 import re
 import time
@ -200,3 +201,16 @@ Signed-off-by: {pr_id.reviewed_by.formatted_email}"""
 def ensure_one(records):
    assert len(records) == 1
    return records
+
+
+@contextlib.contextmanager
+def prevent_unstaging(st) -> None:
+    # hack: `Stagings.cancel` only cancels *active* stagings,
+    # so if we disable the staging, do a thing, then re-enable
+    # then things should work out
+    assert st and st.active, "preventing unstaging of not-a-staging is not useful"
+    st.active = False
+    try:
+        yield
+    finally:
+        st.active = True
--- a/runbot_merge/controllers/init.py
+++ b/runbot_merge/controllers/init.py
@ -285,9 +285,6 @@ def handle_pr(env, event):
            _logger.error("PR %s sync %s -> %s => failure (closed)", pr_obj.display_name, pr_obj.head, pr['head']['sha'])
            return "It's my understanding that closed/merged PRs don't get sync'd"

-        if pr_obj.state == 'ready':
-            pr_obj.unstage("updated by %s", event['sender']['login'])
-
        _logger.info(
            "PR %s sync %s -> %s by %s => reset to 'open' and squash=%s",
            pr_obj.display_name,
--- a/runbot_merge/models/pull_requests.py
+++ b/runbot_merge/models/pull_requests.py
@ -1164,7 +1164,7 @@ For your own safety I've ignored *everything in your entire comment*.
        # temporarily on the same head, or on the same head with different
        # targets
        updateable = self.filtered(lambda p: not p.merge_date)
-        updateable.statuses = statuses
+        updateable.statuses = statuses or '{}'
        for pr in updateable:
            if pr.status == "failure":
                statuses = json.loads(pr.statuses_full)
@ -1212,6 +1212,9 @@ For your own safety I've ignored *everything in your entire comment*.
                    break
                if v == 'pending':
                    st = 'pending'
+            if pr.status != 'failure' and st == 'failure':
+                pr.unstage("had CI failure after staging")
+
            pr.status = st

    @api.depends(
@ -1346,7 +1349,7 @@ For your own safety I've ignored *everything in your entire comment*.

        pr = super().create(vals)
        c = self.env['runbot_merge.commit'].search([('sha', '=', pr.head)])
-        pr._validate(c.statuses or '{}')
+        pr._validate(c.statuses)

        if pr.state not in ('closed', 'merged'):
            self.env.ref('runbot_merge.pr.created')._send(
@ -1423,8 +1426,13 @@ For your own safety I've ignored *everything in your entire comment*.

        newhead = vals.get('head')
        if newhead:
+            if pid := self.env.cr.precommit.data.get('change-author'):
+                writer = self.env['res.partner'].browse(pid)
+            else:
+                writer = self.env.user.partner_id
+            self.unstage("updated by %s", writer.github_login or writer.name)
            c = self.env['runbot_merge.commit'].search([('sha', '=', newhead)])
-            self._validate(c.statuses or '{}')
+            self._validate(c.statuses)
        return w

    def _check_linked_prs_statuses(self, commit=False):
--- a/runbot_merge/tests/test_staging.py
+++ b/runbot_merge/tests/test_staging.py
@ -1,4 +1,4 @@
-from utils import Commit, to_pr
+from utils import Commit, to_pr, make_basic, prevent_unstaging


 def test_staging_disabled_branch(env, project, repo, config):
@ -26,3 +26,95 @@ def test_staging_disabled_branch(env, project, repo, config):
        "master is allowed to stage, should be staged"
    assert not to_pr(env, other_pr).staging_id, \
        "other is *not* allowed to stage, should not be staged"
+
+
+def test_staged_failure(env, config, repo, users):
+    """If a PR is staged and gets a new CI failure, it should be unstaged
+
+    This was an issue with odoo/odoo#165931 which got rebuilt and that triggered
+    a failure, which made the PR !ready but kept the staging going. So the PR
+    ended up in an odd state of being both staged and not ready.
+
+    And while the CI failure it got was a false positive, it was in fact the
+    problematic PR breaking that staging.
+
+    More relevant the runbot's "automatic rebase" mode sends CI to the original
+    commits so receiving legitimate failures after staging very much makes
+    sense e.g. an old PR is staged, the staging starts failing, somebody notices
+    the outdated PR and triggers autorebase, which fails (because something
+    incompatible was merged in the meantime), the PR *should* be unstaged.
+    """
+    with repo:
+        repo.make_commits(None, Commit("master", tree={'a': '1'}), ref="heads/master")
+
+        repo.make_commits('master', Commit('c', tree={'a': 'b'}), ref="heads/mybranch")
+        pr = repo.make_pr(target='master', head='mybranch')
+        repo.post_status('mybranch', 'success')
+        pr.post_comment('hansen r+', config['role_reviewer']['token'])
+    env.run_crons()
+
+    pr_id = to_pr(env, pr)
+    staging = pr_id.staging_id
+    assert staging, "pr should be staged"
+
+    with repo:
+        # started rebuild, nothing should happen
+        repo.post_status('mybranch', 'pending')
+    env.run_crons()
+    assert pr_id.staging_id
+    # can't find a clean way to keep this "ready" when transitioning from
+    # success to pending *without updating the head*, at least not without
+    # adding a lot of contextual information around `_compute_statuses`
+    # assert pr_id.state == 'ready'
+
+    with repo:
+        # rebuild failed omg!
+        repo.post_status('mybranch', 'failure')
+    env.run_crons()
+
+    assert pr_id.status == 'failure'
+    assert pr_id.state == 'approved'
+
+    assert not pr_id.staging_id, "pr should be unstaged"
+    assert staging.state == "cancelled"
+    assert staging.reason == f"{pr_id.display_name} had CI failure after staging"
+
+
+def test_update_unready(env, config, repo, users):
+    """Less likely with `test_staged_failure` fixing most of the issue, but
+    clearly the assumption that a staged PR will be `ready` is not strictly
+    enforced.
+
+    As such, updating the PR should try to `unstage` it no matter what state
+    it's in, this will lead to slightly higher loads on sync but loads on the
+    mergebot are generally non-existent outside of the git maintenance cron,
+    and there are doubtless other optimisations missing, or that (and other
+    items) can be done asynchronously.
+    """
+    with repo:
+        repo.make_commits(None, Commit("master", tree={'a': '1'}), ref="heads/master")
+
+        repo.make_commits('master', Commit('c', tree={'a': 'b'}), ref="heads/mybranch")
+        pr = repo.make_pr(target='master', head='mybranch')
+        repo.post_status('mybranch', 'success')
+        pr.post_comment('hansen r+', config['role_reviewer']['token'])
+
+        [c2] = repo.make_commits('master', Commit('c', tree={'a': 'c'}))
+    env.run_crons()
+
+    pr_id = to_pr(env, pr)
+    staging = pr_id.staging_id
+    assert staging, "pr should be staged"
+
+    with prevent_unstaging(pr_id.staging_id):
+        pr_id.overrides = '{"default": {"state": "failure"}}'
+    assert pr_id.state == "approved"
+    assert pr_id.staging_id, "pr should not have been unstaged because we cheated"
+
+    with repo:
+        repo.update_ref("heads/mybranch", c2, force=True)
+    env.run_crons()
+
+    assert not pr_id.staging_id, "pr should be unstaged"
+    assert staging.state == "cancelled"
+    assert staging.reason == f"{pr_id.display_name} updated by {users['user']}"