From 08b8da08df23c9f795a8ea887631d1427e073d19 Mon Sep 17 00:00:00 2001 From: Xavier Morel Date: Wed, 20 Jan 2021 09:22:11 +0100 Subject: [PATCH] [IMP] runbot_merge: precisely filter stagings before validating them Before this, we would "roughly" select stagings by looking at stagings whose heads matched a specific sha then validating them all. This could perform extra validations on stagings once in a while but this was assumed not to be much an issue, at least originally. However two changes later on have contributed to this likely being the cause of #429 (stagings never timing out): * heads of the staging branches are uniquifier commits stored in the heads map, but the actual heads of the stagings are also stored there, some of which are no-ops (hence the uniquifiers) so assuming repos A and B, if a staging contains PRs touching A then the head of B actual will also be a head of B * when a staging is validated, if it *contains* any pending result the timeout limit gets bumped back The issue here is that if a success / failure status is lost (which would be the most common reason for timeouts) *and* someone has forked and is regularly rebuilding a branch-head used as-is by a staging, each of those rebuilds will trigger a validation of the staging, which will find that one of the statuses is still pending (because we missed the success / failure), which will bump up the timeout limit, continuing until the branch stops getting rebuilt. This is probably one of the reasons why some stagings last for *way* more than 2h, though it is far from explaining all of them: 90% of the stagings lasting more than *3*h end up succeeding. Tho it's always possible that this is because someone notices and resends a success for the missing status it seems somewhat doubtful. Oh well. Also fix the incorrect log call on `update_timeout_limit` triggering. --- runbot_merge/models/pull_requests.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/runbot_merge/models/pull_requests.py b/runbot_merge/models/pull_requests.py index 5ed05ad4..e955be29 100644 --- a/runbot_merge/models/pull_requests.py +++ b/runbot_merge/models/pull_requests.py @@ -1539,9 +1539,14 @@ class Commit(models.Model): pr = PRs.search([('head', '=', c.sha)]) if pr: pr._validate(st) - # heads is a json-encoded mapping of reponame:head, so chances - # are if a sha matches a heads it's matching one of the shas - stagings = Stagings.search([('heads', 'ilike', c.sha)]) + + stagings = Stagings.search([('heads', 'ilike', c.sha)]).filtered( + lambda s, h=c.sha: any( + head == h + for repo, head in json.loads(s.heads).items() + if not repo.endswith('^') + ) + ) if stagings: stagings._validate() except Exception: @@ -1670,7 +1675,7 @@ class Stagings(models.Model): vals = {'state': st} if update_timeout_limit: vals['timeout_limit'] = fields.Datetime.to_string(datetime.datetime.now() + datetime.timedelta(minutes=s.target.project_id.ci_timeout)) - _logger.debug("staging %s: got pending status, bumping timeout to %s (%s)", vals['timeout_limit'], cmap) + _logger.debug("%s got pending status, bumping timeout to %s (%s)", self, vals['timeout_limit'], cmap) s.write(vals) def action_cancel(self):