[IMP] runbot_merge: update staging timeout on 'pending' CI

If the CI is greatly backed up (either insufficient capacity or jobs spike) a timeout which is normally perfectly fine might be insufficient e.g. given a 2h timeout, if a job normally takes 80mn but the staging's job starts 40mn after the staging was actually created we're sunk. And cancelling the staging once the job has finally gotten started is not going to improve load on the CI, it just wastes a CI slot. Therefore assume a `pending` event denotes the actual start of the job on the CI, and reset the timeout to start from that moment so ci_timeout is the timeout of the CI job itself, not of the staging having been created. Closes #202
2025-03-15 23:45:44 +07:00 · 2019-09-23 15:42:18 +02:00 · 2019-09-23 15:42:18 +02:00 · 7659293a2b
commit 7659293a2b
parent 78ad4b4e4b
2 changed files with 41 additions and 5 deletions
--- a/runbot_merge/models/pull_requests.py
+++ b/runbot_merge/models/pull_requests.py
@ -143,7 +143,7 @@ class Project(models.Model):
        self.env['runbot_merge.pull_requests.feedback'].browse(to_remove).unlink()

    def is_timed_out(self, staging):
-        return fields.Datetime.from_string(staging.staged_at) + datetime.timedelta(minutes=self.ci_timeout) < datetime.datetime.now()
+        return fields.Datetime.from_string(staging.timeout_limit) < datetime.datetime.now()

    def _check_fetch(self, commit=False):
        """
@ -1311,6 +1311,7 @@ class Stagings(models.Model):
    active = fields.Boolean(default=True)

    staged_at = fields.Datetime(default=fields.Datetime.now)
+    timeout_limit = fields.Datetime(store=True, compute='_compute_timeout_limit')
    reason = fields.Text("Reason for final state (if any)")

    # seems simpler than adding yet another indirection through a model
@ -1343,6 +1344,17 @@ class Stagings(models.Model):
                for status in [to_status(st)]
            ]

+    # only depend on staged_at as it should not get modified, but we might
+    # update the CI timeout after the staging have been created and we
+    # *do not* want to update the staging timeouts in that case
+    @api.depends('staged_at')
+    def _compute_timeout_limit(self):
+        for st in self:
+            st.timeout_limit = fields.Datetime.to_string(
+                  fields.Datetime.from_string(st.staged_at)
+                + datetime.timedelta(minutes=st.target.project_id.ci_timeout)
+            )
+
    def _validate(self):
        Commits = self.env['runbot_merge.commit']
        for s in self:
@ -1357,6 +1369,7 @@ class Stagings(models.Model):
                ('sha', 'in', heads)
            ])

+            update_timeout_limit = False
            reqs = [r.strip() for r in s.target.project_id.required_statuses.split(',')]
            st = 'success'
            for c in commits:
@ -1364,17 +1377,22 @@ class Stagings(models.Model):
                for v in map(lambda n: state_(statuses, n), reqs):
                    if st == 'failure' or v in ('error', 'failure'):
                        st = 'failure'
-                    elif v in (None, 'pending'):
+                    elif v is None:
                        st = 'pending'
+                    elif v == 'pending':
+                        st = 'pending'
+                        update_timeout_limit = True
                    else:
                        assert v == 'success'
            # mark failure as soon as we find a failed status, but wait until
            # all commits are known & not pending to mark a success
            if st == 'success' and len(commits) < len(heads):
-                s.state = 'pending'
-                continue
+                st = 'pending'

-            s.state = st
+            vals = {'state': st}
+            if update_timeout_limit:
+                vals['timeout_limit'] = fields.Datetime.to_string(datetime.datetime.now() + datetime.timedelta(minutes=s.target.project_id.ci_timeout))
+            s.write(vals)

    @api.multi
    def action_cancel(self):
--- a/runbot_merge/tests/test_basic.py
+++ b/runbot_merge/tests/test_basic.py
@ -440,6 +440,24 @@ def test_staging_ci_timeout(env, repo, users):
    env['runbot_merge.project']._check_progress()
    assert pr1.state == 'error', "timeout should fail the PR"

+def test_timeout_bump_on_pending(env, repo):
+    m = repo.make_commit(None, 'initial', None, tree={'f': '0'})
+    repo.make_ref('heads/master', m)
+
+    c = repo.make_commit(m, 'c', None, tree={'f': '1'})
+    prx = repo.make_pr('title', 'body', target='master', ctid=c, user='user')
+    repo.post_status(prx.head, 'success', 'ci/runbot')
+    repo.post_status(prx.head, 'success', 'legal/cla')
+    prx.post_comment('hansen r+', 'reviewer')
+    run_crons(env)
+
+    st = env['runbot_merge.stagings'].search([])
+    old_timeout = odoo.fields.Datetime.to_string(datetime.datetime.now() - datetime.timedelta(days=15))
+    st.timeout_limit = old_timeout
+    repo.post_status(repo.commit('heads/staging.master').id, 'pending', 'ci/runbot')
+    env['runbot_merge.commit']._notify()
+    assert st.timeout_limit > old_timeout
+
 def test_staging_ci_failure_single(env, repo, users):
    """ on failure of single-PR staging, mark & notify failure
    """