From 7659293a2bd01cfab34c39d8e0aec7226e1fb033 Mon Sep 17 00:00:00 2001
From: Xavier Morel <xmo@odoo.com>
Date: Mon, 23 Sep 2019 15:42:18 +0200
Subject: [PATCH] [IMP] runbot_merge: update staging timeout on 'pending' CI

If the CI is greatly backed up (either insufficient capacity or jobs
spike) a timeout which is normally perfectly fine might be
insufficient e.g. given a 2h timeout, if a job normally takes 80mn but
the staging's job starts 40mn after the staging was actually created
we're sunk. And cancelling the staging once the job has finally gotten
started is not going to improve load on the CI, it just wastes a CI
slot.

Therefore assume a `pending` event denotes the actual start of the job
on the CI, and reset the timeout to start from that moment so
ci_timeout is the timeout of the CI job itself, not of the staging
having been created.

Closes #202
---
 runbot_merge/models/pull_requests.py | 28 +++++++++++++++++++++++-----
 runbot_merge/tests/test_basic.py     | 18 ++++++++++++++++++
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/runbot_merge/models/pull_requests.py b/runbot_merge/models/pull_requests.py
index e659188f..64b54d99 100644
--- a/runbot_merge/models/pull_requests.py
+++ b/runbot_merge/models/pull_requests.py
@@ -143,7 +143,7 @@ class Project(models.Model):
         self.env['runbot_merge.pull_requests.feedback'].browse(to_remove).unlink()
 
     def is_timed_out(self, staging):
-        return fields.Datetime.from_string(staging.staged_at) + datetime.timedelta(minutes=self.ci_timeout) < datetime.datetime.now()
+        return fields.Datetime.from_string(staging.timeout_limit) < datetime.datetime.now()
 
     def _check_fetch(self, commit=False):
         """
@@ -1311,6 +1311,7 @@ class Stagings(models.Model):
     active = fields.Boolean(default=True)
 
     staged_at = fields.Datetime(default=fields.Datetime.now)
+    timeout_limit = fields.Datetime(store=True, compute='_compute_timeout_limit')
     reason = fields.Text("Reason for final state (if any)")
 
     # seems simpler than adding yet another indirection through a model
@@ -1343,6 +1344,17 @@ class Stagings(models.Model):
                 for status in [to_status(st)]
             ]
 
+    # only depend on staged_at as it should not get modified, but we might
+    # update the CI timeout after the staging have been created and we
+    # *do not* want to update the staging timeouts in that case
+    @api.depends('staged_at')
+    def _compute_timeout_limit(self):
+        for st in self:
+            st.timeout_limit = fields.Datetime.to_string(
+                  fields.Datetime.from_string(st.staged_at)
+                + datetime.timedelta(minutes=st.target.project_id.ci_timeout)
+            )
+
     def _validate(self):
         Commits = self.env['runbot_merge.commit']
         for s in self:
@@ -1357,6 +1369,7 @@ class Stagings(models.Model):
                 ('sha', 'in', heads)
             ])
 
+            update_timeout_limit = False
             reqs = [r.strip() for r in s.target.project_id.required_statuses.split(',')]
             st = 'success'
             for c in commits:
@@ -1364,17 +1377,22 @@ class Stagings(models.Model):
                 for v in map(lambda n: state_(statuses, n), reqs):
                     if st == 'failure' or v in ('error', 'failure'):
                         st = 'failure'
-                    elif v in (None, 'pending'):
+                    elif v is None:
                         st = 'pending'
+                    elif v == 'pending':
+                        st = 'pending'
+                        update_timeout_limit = True
                     else:
                         assert v == 'success'
             # mark failure as soon as we find a failed status, but wait until
             # all commits are known & not pending to mark a success
             if st == 'success' and len(commits) < len(heads):
-                s.state = 'pending'
-                continue
+                st = 'pending'
 
-            s.state = st
+            vals = {'state': st}
+            if update_timeout_limit:
+                vals['timeout_limit'] = fields.Datetime.to_string(datetime.datetime.now() + datetime.timedelta(minutes=s.target.project_id.ci_timeout))
+            s.write(vals)
 
     @api.multi
     def action_cancel(self):
diff --git a/runbot_merge/tests/test_basic.py b/runbot_merge/tests/test_basic.py
index 6de13bb3..4c9bf19d 100644
--- a/runbot_merge/tests/test_basic.py
+++ b/runbot_merge/tests/test_basic.py
@@ -440,6 +440,24 @@ def test_staging_ci_timeout(env, repo, users):
     env['runbot_merge.project']._check_progress()
     assert pr1.state == 'error', "timeout should fail the PR"
 
+def test_timeout_bump_on_pending(env, repo):
+    m = repo.make_commit(None, 'initial', None, tree={'f': '0'})
+    repo.make_ref('heads/master', m)
+
+    c = repo.make_commit(m, 'c', None, tree={'f': '1'})
+    prx = repo.make_pr('title', 'body', target='master', ctid=c, user='user')
+    repo.post_status(prx.head, 'success', 'ci/runbot')
+    repo.post_status(prx.head, 'success', 'legal/cla')
+    prx.post_comment('hansen r+', 'reviewer')
+    run_crons(env)
+
+    st = env['runbot_merge.stagings'].search([])
+    old_timeout = odoo.fields.Datetime.to_string(datetime.datetime.now() - datetime.timedelta(days=15))
+    st.timeout_limit = old_timeout
+    repo.post_status(repo.commit('heads/staging.master').id, 'pending', 'ci/runbot')
+    env['runbot_merge.commit']._notify()
+    assert st.timeout_limit > old_timeout
+
 def test_staging_ci_failure_single(env, repo, users):
     """ on failure of single-PR staging, mark & notify failure
     """