[ADD] runbot_merge: basic support for false positive detection

Adds a very limited ability to try to detect false positive /
non-deterministic staging errors. It errs on the side of not flagging
true failures as false positives, so it is likely to miss many actual
false positives.

Currently has no automation / reporting, just sets a flag on the
stagings which are strongly believed to have failed due to false
positives.

While at it, add a link between a "root" staging and its splits. The
link is necessary to clear the "false positive" flag, and surfacing it
in the UI could be useful one day.

Fixes #660
Xavier Morel 2024-12-09 15:51:09 +01:00
parent d6e1516f31
commit 2e107111f0
4 changed files with 247 additions and 50 deletions
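
In rough terms, the heuristic works as follows (a minimal plain-Python
sketch of the flag's lifecycle; the class and helper names are
illustrative only, not the actual runbot_merge models shown in the
diffs below):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Staging:
    # illustrative stand-in for runbot_merge.stagings
    parent: Optional["Staging"] = None
    likely_false_positive: bool = False

def on_split(staging: Staging) -> None:
    # A failed or timed-out staging that gets split for bisection is
    # flagged, but only if it is a "root" staging: re-splitting a split
    # never re-flags its source.
    if staging.parent is None:
        staging.likely_false_positive = True

def on_culprit_found(root: Staging) -> None:
    # Anything pointing at a real culprit clears the flag: a descendant
    # staging failing with nothing left to split, a staging being
    # cancelled, a split being deleted, or a PR being pulled out of a
    # split.
    root.likely_false_positive = False

# If none of that happens and every split eventually merges, the flag
# stays set: the original failure was most likely a false positive or a
# non-deterministic error.

The new test file below exercises exactly these transitions.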


@@ -8,7 +8,6 @@ import datetime
import itertools
import json
import logging
import pprint
import re
import time
from enum import IntEnum
@@ -1576,7 +1575,14 @@ For your own safety I've ignored *everything in your entire comment*.
""" If the PR is staged, cancel the staging. If the PR is split and
waiting, remove it from the split (possibly delete the split entirely)
"""
split = self.batch_id.split_id
split: Split = self.batch_id.split_id
if split:
if split.source_id.likely_false_positive:
split.source_id.likely_false_positive = False
split.source_id.message_post(
body=f"Assuming failure is a true positive due to {self.display_name} being removed from split.",
)
if len(split.batch_ids) == 1:
# only the batch of this PR -> delete split
split.unlink()
@@ -2050,9 +2056,14 @@ class Commit(models.Model):
class Stagings(models.Model):
_name = _description = 'runbot_merge.stagings'
_name = 'runbot_merge.stagings'
_description = "A set of batches being tested for integration"
_inherit = ['mail.thread']
target = fields.Many2one('runbot_merge.branch', required=True, index=True)
parent_id = fields.Many2one('runbot_merge.stagings')
child_ids = fields.One2many('runbot_merge.stagings', 'parent_id')
likely_false_positive = fields.Boolean(default=False)
staging_batch_ids = fields.One2many('runbot_merge.staging.batch', 'runbot_merge_stagings_id')
batch_ids = fields.Many2many(
@@ -2264,6 +2275,7 @@ class Stagings(models.Model):
return False
_logger.info("Cancelling staging %s: " + reason, self, *args)
self.parent_id.likely_false_positive = False
self.write({
'active': False,
'state': 'cancelled',
@@ -2283,6 +2295,8 @@ class Stagings(models.Model):
format_args={'pr': pr, 'message': message},
)
if not self.target.split_ids:
self.parent_id.likely_false_positive = False
self.write({
'active': False,
'state': 'failure',
@@ -2296,20 +2310,22 @@ class Stagings(models.Model):
midpoint = batches // 2
h, t = self.batch_ids[:midpoint], self.batch_ids[midpoint:]
# NB: batches remain attached to their original staging
sh = self.env['runbot_merge.split'].create({
sh, st = self.env['runbot_merge.split'].create([{
'target': self.target.id,
'source_id': (self.parent_id or self).id,
'batch_ids': [Command.link(batch.id) for batch in h],
})
st = self.env['runbot_merge.split'].create({
}, {
'target': self.target.id,
'source_id': (self.parent_id or self).id,
'batch_ids': [Command.link(batch.id) for batch in t],
})
}])
_logger.info("Split %s to %s (%s) and %s (%s)",
self, h, sh, t, st)
self.write({
'active': False,
'state': 'failure',
'reason': self.reason if self.state == 'failure' else 'timed out'
'reason': self.reason if self.state == 'failure' else 'timed out',
'likely_false_positive': not self.parent_id,
})
return True
@@ -2513,8 +2529,14 @@ class Split(models.Model):
_name = _description = 'runbot_merge.split'
target = fields.Many2one('runbot_merge.branch', required=True)
source_id = fields.Many2one('runbot_merge.stagings', required=True)
batch_ids = fields.One2many('runbot_merge.batch', 'split_id', context={'active_test': False})
def unlink(self):
if not self.env.context.get('staging_split'):
self.source_id.likely_false_positive = False
return super().unlink()
class FetchJob(models.Model):
_name = _description = 'runbot_merge.fetch_job'
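
The commit message notes there is no automation / reporting on top of
this flag yet. For manual inspection, a hedged sketch using the standard
Odoo ORM (e.g. from an odoo-bin shell where `env` is available; model and
field names taken from the diff above) might look like:

# Hypothetical manual query, not part of this commit: list the stagings
# currently believed to have failed due to a false positive, newest first.
flagged = env['runbot_merge.stagings'].search(
    [('likely_false_positive', '=', True)],
    order='id desc',
)
for staging in flagged:
    print(staging.id, staging.target.name, staging.reason)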


@@ -64,10 +64,12 @@ def try_staging(branch: Branch) -> Optional[Stagings]:
alone, batches = ready_batches(for_branch=branch)
parent_id = False
if alone:
log("staging high-priority PRs %s", batches)
elif branch.project_id.staging_priority == 'default':
if split := branch.split_ids[:1]:
if split := branch.split_ids[:1].with_context(staging_split=True):
parent_id = split.source_id.id
batches = split.batch_ids
split.unlink()
log("staging split PRs %s (prioritising splits)", batches)
@@ -79,7 +81,8 @@ def try_staging(branch: Branch) -> Optional[Stagings]:
if batches:
log("staging ready PRs %s (prioritising ready)", batches)
else:
split = branch.split_ids[:1]
split = branch.split_ids[:1].with_context(staging_split=True)
parent_id = split.source_id.id
batches = split.batch_ids
split.unlink()
log("staging split PRs %s (prioritising ready)", batches)
@@ -176,6 +179,7 @@ For-Commit-Id: {it.head}
'heads': heads,
'commits': commits,
'issues_to_close': issues,
'parent_id': parent_id,
})
for repo, it in staging_state.items():
_logger.info(


@@ -0,0 +1,155 @@
"""This is a bunch of tests for the automated flagging of stagings as likely
failed due to false positives / non-deterministic failures.
"""
import pytest

from utils import Commit, to_pr


def setup_prs(env, repo, config):
    with repo:
        m = repo.make_commits(
            None,
            Commit("root", tree={'a': ''}),
            ref="heads/master",
        )

        repo.make_commits(m, Commit("c1", tree={'1': ''}), ref="heads/feature1")
        pr1 = repo.make_pr(title="whatever", target="master", head="feature1")

        repo.make_commits(m, Commit("c2", tree={'2': ''}), ref="heads/feature2")
        pr2 = repo.make_pr(title="whatever", target="master", head="feature2")
    env.run_crons(None)

    with repo:
        pr1.post_comment("hansen r+", config['role_reviewer']['token'])
        repo.post_status(pr1.head, 'success')

        pr2.post_comment("hansen r+", config['role_reviewer']['token'])
        repo.post_status(pr2.head, 'success')
    env.run_crons(None)

    return pr1, pr2


def test_false_positive(env, repo, users, config):
    """ If we end up merging everything, consider that the original error was a
    false positive
    """
    pr1, pr2 = setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    # all splits succeeded => original failure was likely a false positive or non-deterministic
    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    assert to_pr(env, pr1).state == 'merged'
    assert to_pr(env, pr2).state == 'merged'


def test_success_is_not_flagged(env, repo, users, config):
    setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == False


def test_true_failure_is_not_flagged(env, repo, users, config):
    """ If we end up flagging (at least) one specific PR, assume it's a true
    positive, even though enough false positives can end like that too.
    """
    pr1, pr2 = setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    # PR pinpointed as a true error => no false positive
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == False

    assert to_pr(env, pr1).state == 'merged'
    assert to_pr(env, pr2).state == 'error'


def test_cancel_staging_not_flagged(env, repo, users, config):
    """ If we cancel a staging, assume there was a legit reason to do so and if
    there's a false positive we already found it.
    """
    setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    # `staging` is the source and not active anymore, can't cancel it
    staging.target.active_staging_id.cancel("because")
    assert staging.likely_false_positive == False


def test_removed_split_not_flagged(env, repo, users, config):
    """ If we delete a split, basically the same idea as for cancelling stagings
    """
    setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    staging.target.split_ids.unlink()
    assert staging.likely_false_positive == False


@pytest.mark.parametrize('pr_index', [0, 1])
def test_unstaged_pr_not_flagged(env, repo, users, config, pr_index):
    """ If we cancel a staging by unstaging a PR or remove a PR from a split,
    assume it's because the PR caused a true failure
    """
    prs = setup_prs(env, repo, config)

    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False

    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True

    with repo:
        prs[pr_index].post_comment("hansen r-", config['role_reviewer']['token'])
    assert staging.likely_false_positive == False


@@ -333,24 +333,8 @@
name="staging_duration" widget="integer"/>
</group>
</group>
<group>
<group string="Heads">
<field name="head_ids" colspan="2" nolabel="1" readonly="1">
<tree>
<button type="object" name="get_formview_action" icon="fa-external-link" title="open head"/>
<field name="sha"/>
</tree>
</field>
</group>
<group string="Commits">
<field name="commit_ids" colspan="2" nolabel="1" readonly="1">
<tree>
<button type="object" name="get_formview_action" icon="fa-external-link" title="open commit"/>
<field name="sha"/>
</tree>
</field>
</group>
</group>
<notebook>
<page string="PRs">
<group string="Batches">
<field name="batch_ids" colspan="4" nolabel="1" readonly="1">
<tree>
@@ -370,6 +354,38 @@
</tree>
</field>
</group>
</page>
<page string="Commits">
<group>
<group string="Heads">
<field name="head_ids" colspan="2" nolabel="1" readonly="1">
<tree>
<button type="object" name="get_formview_action" icon="fa-external-link" title="open head"/>
<field name="sha"/>
</tree>
</field>
</group>
<group string="Commits">
<field name="commit_ids" colspan="2" nolabel="1" readonly="1">
<tree>
<button type="object" name="get_formview_action" icon="fa-external-link" title="open commit"/>
<field name="sha"/>
</tree>
</field>
</group>
</group>
</page>
<page string="Splits">
<field name="child_ids" colspan="4" nolabel="1" readonly="1">
<tree>
<button type="object" name="get_formview_action" icon="fa-external-link" title="open pr"/>
<field name="display_name"/>
<field name="state"/>
<field name="reason"/>
</tree>
</field>
</page>
</notebook>
</sheet>
</form>
</field>