[FIX] runbot_merge: don't over-bump timeout

By updating the staging timeout every time we run `_compute_state` and
still have a `pending` status, we'd actually bump the timeout on
*every* CI success except the last one. That was never the intention,
and it could add an hour or two to the mergebot-side timeout.

Instead, add an `updated_at` attribute to statuses (the name is taken
from the webhook payload, even though we don't use the payload's
value), read it when we see `pending` required statuses, and update
the staging timeout based on it if necessary.

That way, as long as we don't get *new* pending statuses, the timeout
doesn't get updated.

Fixes #952
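
The rule the fix implements, as a standalone sketch (hypothetical helper, not the actual model code): the limit is derived from the `updated_at` of the most recent *pending* status, so recomputing the state after a success leaves it untouched.

```python
from datetime import datetime, timedelta

def bumped_timeout(statuses: dict, current_limit: datetime, ci_timeout: int) -> datetime:
    """Sketch of the fixed behaviour: only a *newer* pending status moves the limit."""
    # latest reception time among the pending statuses, as an ISO-8601 string
    last_pending = max(
        (s.get('updated_at', '') for s in statuses.values() if s.get('state') == 'pending'),
        default='',
    )
    if not last_pending:
        return current_limit
    candidate = datetime.fromisoformat(last_pending) + timedelta(minutes=ci_timeout)
    # later successes don't change `last_pending`, so recomputation yields
    # the same candidate and the limit is not bumped again
    return max(current_limit, candidate)
```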
Xavier Morel 2024-09-20 12:17:17 +02:00
parent 154e610bbc
commit e309e1a3a2
3 changed files with 46 additions and 27 deletions

View File

@@ -2,6 +2,7 @@ import hashlib
 import hmac
 import logging
 import json
+from datetime import datetime
 import sentry_sdk
 import werkzeug.exceptions
@@ -358,7 +359,8 @@ def handle_status(env, event):
         event['context']: {
             'state': event['state'],
             'target_url': event['target_url'],
-            'description': event['description']
+            'description': event['description'],
+            'updated_at': datetime.now().isoformat(timespec='seconds'),
         }
     })
     # create status, or merge update into commit *unless* the update is already
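
With this change, every status cached for a commit records when the mergebot received it, next to the fields copied from the webhook. For illustration only (the values below are made up), a cached entry now looks like:

```python
# illustrative shape of one commit's cached statuses (made-up values)
{
    'ci/runbot': {
        'state': 'pending',
        'target_url': 'https://example.org/build/1234',
        'description': None,
        'updated_at': '2024-09-20T10:17:17',
    },
}
```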

View File

@@ -2133,6 +2133,7 @@ class Stagings(models.Model):
         if not self.env.user.has_group('runbot_merge.status'):
             raise AccessError("You are not allowed to post a status.")
+        now = datetime.datetime.now().isoformat(timespec='seconds')
         for s in self:
             if not s.target.project_id.staging_rpc:
                 continue
@@ -2145,6 +2146,7 @@
                 'state': status,
                 'target_url': target_url,
                 'description': description,
+                'updated_at': now,
             }
             s.statuses_cache = json.dumps(st)
@@ -2158,39 +2160,45 @@
         "heads.repository_id.status_ids.context",
     )
     def _compute_state(self):
-        for s in self:
-            if s.state != 'pending':
+        for staging in self:
+            if staging.state != 'pending':
                 continue
             # maps commits to the statuses they need
             required_statuses = [
-                (h.commit_id.sha, h.repository_id.status_ids._for_staging(s).mapped('context'))
-                for h in s.heads
+                (h.commit_id.sha, h.repository_id.status_ids._for_staging(staging).mapped('context'))
+                for h in staging.heads
             ]
-            cmap = json.loads(s.statuses_cache)
+            cmap = json.loads(staging.statuses_cache)
-            update_timeout_limit = False
-            st = 'success'
+            last_pending = ""
+            state = 'success'
             for head, reqs in required_statuses:
                 statuses = cmap.get(head) or {}
-                for v in map(lambda n: statuses.get(n, {}).get('state'), reqs):
-                    if st == 'failure' or v in ('error', 'failure'):
-                        st = 'failure'
+                for status in (statuses.get(n, {}) for n in reqs):
+                    v = status.get('state')
+                    if state == 'failure' or v in ('error', 'failure'):
+                        state = 'failure'
                     elif v is None:
-                        st = 'pending'
+                        state = 'pending'
                     elif v == 'pending':
-                        st = 'pending'
-                        update_timeout_limit = True
+                        state = 'pending'
+                        last_pending = max(last_pending, status.get('updated_at', ''))
                     else:
                         assert v == 'success'
-            s.state = st
-            if s.state != 'pending':
+            staging.state = state
+            if staging.state != 'pending':
                 self.env.ref("runbot_merge.merge_cron")._trigger()
-            if update_timeout_limit:
-                s.timeout_limit = datetime.datetime.now() + datetime.timedelta(minutes=s.target.project_id.ci_timeout)
-                self.env.ref("runbot_merge.merge_cron")._trigger(s.timeout_limit)
-                _logger.debug("%s got pending status, bumping timeout to %s (%s)", self, s.timeout_limit, cmap)
+            if last_pending:
+                timeout = datetime.datetime.fromisoformat(last_pending) \
+                    + datetime.timedelta(minutes=staging.target.project_id.ci_timeout)
+                if timeout > staging.timeout_limit:
+                    staging.timeout_limit = timeout
+                    self.env.ref("runbot_merge.merge_cron")._trigger(timeout)
+                    _logger.debug("%s got pending status, bumping timeout to %s", staging, timeout)
     def action_cancel(self):
         w = self.env['runbot_merge.stagings.cancel'].create({
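
Two details of the rewritten loop are worth noting: `updated_at` values are second-resolution ISO-8601 strings, so `max()` compares them lexicographically, which for this fixed format matches chronological order, and `datetime.fromisoformat` turns the winning string back into a `datetime` for the limit computation. A quick standalone check (not project code):

```python
from datetime import datetime

a = datetime(2024, 9, 20, 10, 5).isoformat(timespec='seconds')   # '2024-09-20T10:05:00'
b = datetime(2024, 9, 20, 11, 30).isoformat(timespec='seconds')  # '2024-09-20T11:30:00'

assert max(a, b) == b                                # string order == time order
assert datetime.fromisoformat(b) == datetime(2024, 9, 20, 11, 30)
```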

View File

@@ -605,7 +605,6 @@ def test_staging_ci_timeout(env, repo, config, page, update_op: Callable[[int],
     assert dangerbox
     assert dangerbox[0].text == 'timed out (>60 minutes)'
-@pytest.mark.defaultstatuses
 def test_timeout_bump_on_pending(env, repo, config):
     with repo:
         [m, c] = repo.make_commits(
@@ -615,8 +614,9 @@ def test_timeout_bump_on_pending(env, repo, config):
     )
     repo.make_ref('heads/master', m)
-    prx = repo.make_pr(title='title', body='body', target='master', head=c)
-    repo.post_status(prx.head, 'success')
+    prx = repo.make_pr(target='master', head=c)
+    repo.post_status(prx.head, 'success', 'ci/runbot')
+    repo.post_status(prx.head, 'success', 'legal/cla')
     prx.post_comment('hansen r+', config['role_reviewer']['token'])
     env.run_crons()
@@ -624,9 +624,18 @@
     old_timeout = odoo.fields.Datetime.to_string(datetime.datetime.now() - datetime.timedelta(days=15))
     st.timeout_limit = old_timeout
     with repo:
-        repo.post_status('staging.master', 'pending')
+        repo.post_status('staging.master', 'pending', 'ci/runbot')
     env.run_crons(None)
-    assert st.timeout_limit > old_timeout
+    assert st.timeout_limit > old_timeout, "receiving a pending status should bump the timeout"
+    st.timeout_limit = old_timeout
+    # clear the statuses cache to remove the memoized status times
+    st.statuses_cache = "{}"
+    st.commit_ids.statuses = "{}"
+    with repo:
+        repo.post_status('staging.master', 'success', 'legal/cla')
+    env.run_crons(None)
+    assert st.timeout_limit == old_timeout, "receiving a success status should *not* bump the timeout"
 def test_staging_ci_failure_single(env, repo, users, config, page):
     """ on failure of single-PR staging, mark & notify failure
@@ -3327,8 +3336,8 @@ class TestUnknownPR:
         c = env['runbot_merge.commit'].search([('sha', '=', prx.head)])
         assert json.loads(c.statuses) == {
-            'legal/cla': {'state': 'success', 'target_url': None, 'description': None},
-            'ci/runbot': {'state': 'success', 'target_url': 'http://example.org/wheee', 'description': None}
+            'legal/cla': {'state': 'success', 'target_url': None, 'description': None, 'updated_at': matches("$$")},
+            'ci/runbot': {'state': 'success', 'target_url': 'http://example.org/wheee', 'description': None, 'updated_at': matches("$$")}
         }
         assert prx.comments == [
             seen(env, prx, users),