[ADD] runbot_merge: basic support for false positive detection

Adds a very limited ability to try and look for false positive / non-determinstic staging errors. It tries to err on the side of limiting false false positives, so it's likely to miss many. Currently has no automation / reporting, just sets a flag on the stagings which are strongly believed to have failed due to false positives. While at it, add link between a "root" staging and its splits. It's necessary to clear the "false positive" flag, and surfacing it in the UI could be useful one day. Fixes #660
2025-03-15 15:35:46 +07:00 · 2024-12-09 15:51:09 +01:00 · 2024-12-09 15:51:09 +01:00 · 2e107111f0
commit 2e107111f0
parent d6e1516f31
4 changed files with 247 additions and 50 deletions
--- a/runbot_merge/models/pull_requests.py
+++ b/runbot_merge/models/pull_requests.py
@ -8,7 +8,6 @@ import datetime
 import itertools
 import json
 import logging
 import pprint
 import re
 import time
 from enum import IntEnum
@ -1576,13 +1575,20 @@ For your own safety I've ignored *everything in your entire comment*.
        """ If the PR is staged, cancel the staging. If the PR is split and
        waiting, remove it from the split (possibly delete the split entirely)
        """
-        split = self.batch_id.split_id
+        split: Split = self.batch_id.split_id
-        if len(split.batch_ids) == 1:
+        if split:
-            # only the batch of this PR -> delete split
+            if split.source_id.likely_false_positive:
-            split.unlink()
+                split.source_id.likely_false_positive = False
-        else:
+                split.source_id.message_post(
-            # else remove this batch from the split
+                    body=f"Assuming failure is a true positive due to {self.display_name} being removed from split.",
-            self.batch_id.split_id = False
+                )
            if len(split.batch_ids) == 1:
                # only the batch of this PR -> delete split
                split.unlink()
            else:
                # else remove this batch from the split
                self.batch_id.split_id = False
        self.staging_id.cancel('%s ' + reason, self.display_name, *args)
@ -2050,9 +2056,14 @@ class Commit(models.Model):
 class Stagings(models.Model):
-    _name = _description = 'runbot_merge.stagings'
+    _name = 'runbot_merge.stagings'
    _description = "A set of batches being tested for integration"
    _inherit = ['mail.thread']
    target = fields.Many2one('runbot_merge.branch', required=True, index=True)
    parent_id = fields.Many2one('runbot_merge.stagings')
    child_ids = fields.One2many('runbot_merge.stagings', 'parent_id')
    likely_false_positive = fields.Boolean(default=False)
    staging_batch_ids = fields.One2many('runbot_merge.staging.batch', 'runbot_merge_stagings_id')
    batch_ids = fields.Many2many(
@ -2264,6 +2275,7 @@ class Stagings(models.Model):
            return False
        _logger.info("Cancelling staging %s: " + reason, self, *args)
        self.parent_id.likely_false_positive = False
        self.write({
            'active': False,
            'state': 'cancelled',
@ -2283,6 +2295,8 @@ class Stagings(models.Model):
               format_args={'pr': pr, 'message': message},
           )
        if not self.target.split_ids:
            self.parent_id.likely_false_positive = False
        self.write({
            'active': False,
            'state': 'failure',
@ -2296,20 +2310,22 @@ class Stagings(models.Model):
            midpoint = batches // 2
            h, t = self.batch_ids[:midpoint], self.batch_ids[midpoint:]
            # NB: batches remain attached to their original staging
-            sh = self.env['runbot_merge.split'].create({
+            sh, st = self.env['runbot_merge.split'].create([{
                'target': self.target.id,
                'source_id': (self.parent_id or self).id,
                'batch_ids': [Command.link(batch.id) for batch in h],
-            })
+            }, {
            st = self.env['runbot_merge.split'].create({
                'target': self.target.id,
                'source_id': (self.parent_id or self).id,
                'batch_ids': [Command.link(batch.id) for batch in t],
-            })
+            }])
            _logger.info("Split %s to %s (%s) and %s (%s)",
                         self, h, sh, t, st)
            self.write({
                'active': False,
                'state': 'failure',
-                'reason': self.reason if self.state == 'failure' else 'timed out'
+                'reason': self.reason if self.state == 'failure' else 'timed out',
                'likely_false_positive': not self.parent_id,
            })
            return True
@ -2513,8 +2529,14 @@ class Split(models.Model):
    _name = _description = 'runbot_merge.split'
    target = fields.Many2one('runbot_merge.branch', required=True)
    source_id = fields.Many2one('runbot_merge.stagings', required=True)
    batch_ids = fields.One2many('runbot_merge.batch', 'split_id', context={'active_test': False})
    def unlink(self):
        if not self.env.context.get('staging_split'):
            self.source_id.likely_false_positive = False
        return super().unlink()
 class FetchJob(models.Model):
    _name = _description = 'runbot_merge.fetch_job'
--- a/runbot_merge/models/stagings_create.py
+++ b/runbot_merge/models/stagings_create.py
@ -64,10 +64,12 @@ def try_staging(branch: Branch) -> Optional[Stagings]:
    alone, batches = ready_batches(for_branch=branch)
    parent_id = False
    if alone:
        log("staging high-priority PRs %s", batches)
    elif branch.project_id.staging_priority == 'default':
-        if split := branch.split_ids[:1]:
+        if split := branch.split_ids[:1].with_context(staging_split=True):
            parent_id = split.source_id.id
            batches = split.batch_ids
            split.unlink()
            log("staging split PRs %s (prioritising splits)", batches)
@ -79,7 +81,8 @@ def try_staging(branch: Branch) -> Optional[Stagings]:
        if batches:
            log("staging ready PRs %s (prioritising ready)", batches)
        else:
-            split = branch.split_ids[:1]
+            split = branch.split_ids[:1].with_context(staging_split=True)
            parent_id = split.source_id.id
            batches = split.batch_ids
            split.unlink()
            log("staging split PRs %s (prioritising ready)", batches)
@ -176,6 +179,7 @@ For-Commit-Id: {it.head}
        'heads': heads,
        'commits': commits,
        'issues_to_close': issues,
        'parent_id': parent_id,
    })
    for repo, it in staging_state.items():
        _logger.info(
--- a/runbot_merge/tests/test_nondeterministic_failures.py
+++ b/runbot_merge/tests/test_nondeterministic_failures.py
@ -0,0 +1,155 @@
 """This is a bunch of tests for the automated flagging of stagings as likely
 failed due to false positives / non-deterministic failures.
 """
 import pytest
 from utils import Commit, to_pr
 def setup_prs(env, repo, config):
    with repo:
        m = repo.make_commits(
            None,
            Commit("root", tree={'a': ''}),
            ref="heads/master",
        )
        repo.make_commits(m, Commit("c1", tree={'1': ''}), ref="heads/feature1")
        pr1 = repo.make_pr(title="whatever", target="master", head="feature1")
        repo.make_commits(m, Commit("c2", tree={'2': ''}), ref="heads/feature2")
        pr2 = repo.make_pr(title="whatever", target="master", head="feature2")
    env.run_crons(None)
    with repo:
        pr1.post_comment("hansen r+", config['role_reviewer']['token'])
        repo.post_status(pr1.head, 'success')
        pr2.post_comment("hansen r+", config['role_reviewer']['token'])
        repo.post_status(pr2.head, 'success')
    env.run_crons(None)
    return pr1, pr2
 def test_false_positive(env, repo, users, config):
    """ If we end up merging everything, consider that the original error was a
    false positive
    """
    pr1, pr2 = setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    # all splits succeeded => original failure was likely a false positive or non-deterministic
    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    assert to_pr(env, pr1).state == 'merged'
    assert to_pr(env, pr2).state == 'merged'
 def test_success_is_not_flagged(env, repo, users, config):
    setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == False
 def test_true_failure_is_not_flagged(env, repo, users, config):
    """ If we end up flagging (at least) one specific PR, assume it's a true
    positive, even though enough false positives can end like that too.
    """
    pr1, pr2 = setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    with repo:
        repo.post_status('staging.master', 'success')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    # PR pinpointed as a true error => no false positive
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == False
    assert to_pr(env, pr1).state == 'merged'
    assert to_pr(env, pr2).state == 'error'
 def test_cancel_staging_not_flagged(env, repo, users, config):
    """ If we cancel a staging, assume there was a legit reason to do so and if
    there's a false positive we already found it.
    """
    setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    # `staging` is the source and not active anymore, can't cancel it
    staging.target.active_staging_id.cancel("because")
    assert staging.likely_false_positive == False
 def test_removed_split_not_flagged(env, repo, users, config):
    """ If we delete a split, basically the same idea as for cancelling stagings
    """
    setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    staging.target.split_ids.unlink()
    assert staging.likely_false_positive == False
@pytest.mark.parametrize('pr_index', [0, 1])
 def test_unstaged_pr_not_flagged(env, repo, users, config, pr_index):
    """ If we cancel a staging by unstaging a PR or remove a PR from a split,
    assume it's because the PR caused a true failure
    """
    prs = setup_prs(env, repo, config)
    staging = env['runbot_merge.stagings'].search([])
    assert staging
    assert staging.likely_false_positive == False
    with repo:
        repo.post_status('staging.master', 'failure')
    env.run_crons(None)
    assert staging.likely_false_positive == True
    with repo:
        prs[pr_index].post_comment("hansen r-", config['role_reviewer']['token'])
    assert staging.likely_false_positive == False
--- a/runbot_merge/views/mergebot.xml
+++ b/runbot_merge/views/mergebot.xml
@ -333,43 +333,59 @@
                                    name="staging_duration" widget="integer"/>
                        </group>
                    </group>
-                    <group>
+                    <notebook>
-                        <group string="Heads">
+                        <page string="PRs">
-                            <field name="head_ids" colspan="2" nolabel="1" readonly="1">
+                            <group string="Batches">
                                <field name="batch_ids" colspan="4" nolabel="1" readonly="1">
                                    <tree>
                                        <button type="object" name="get_formview_action" icon="fa-external-link" title="open batch"/>
                                        <field name="name"/>
                                        <field name="prs" widget="many2many_tags"/>
                                    </tree>
                                </field>
                            </group>
                            <group string="PRs">
                                <field name="pr_ids" colspan="4" nolabel="1" readonly="1">
                                    <tree>
                                        <button type="object" name="get_formview_action" icon="fa-external-link" title="open pr"/>
                                        <field name="display_name"/>
                                        <field name="github_url" widget="url"/>
                                        <field name="url" widget="url"/>
                                    </tree>
                                </field>
                            </group>
                        </page>
                        <page string="Commits">
                            <group>
                                <group string="Heads">
                                    <field name="head_ids" colspan="2" nolabel="1" readonly="1">
                                        <tree>
                                            <button type="object" name="get_formview_action" icon="fa-external-link" title="open head"/>
                                            <field name="sha"/>
                                        </tree>
                                    </field>
                                </group>
                                <group string="Commits">
                                    <field name="commit_ids" colspan="2" nolabel="1" readonly="1">
                                        <tree>
                                            <button type="object" name="get_formview_action" icon="fa-external-link" title="open commit"/>
                                            <field name="sha"/>
                                        </tree>
                                    </field>
                                </group>
                            </group>
                        </page>
                        <page string="Splits">
                            <field name="child_ids" colspan="4" nolabel="1" readonly="1">
                                <tree>
-                                    <button type="object" name="get_formview_action" icon="fa-external-link" title="open head"/>
+                                    <button type="object" name="get_formview_action" icon="fa-external-link" title="open pr"/>
-                                    <field name="sha"/>
+                                    <field name="display_name"/>
                                    <field name="state"/>
                                    <field name="reason"/>
                                </tree>
                            </field>
-                        </group>
+                        </page>
-                        <group string="Commits">
+                    </notebook>
                            <field name="commit_ids" colspan="2" nolabel="1" readonly="1">
                                <tree>
                                    <button type="object" name="get_formview_action" icon="fa-external-link" title="open commit"/>
                                    <field name="sha"/>
                                </tree>
                            </field>
                        </group>
                    </group>
                    <group string="Batches">
                        <field name="batch_ids" colspan="4" nolabel="1" readonly="1">
                            <tree>
                                <button type="object" name="get_formview_action" icon="fa-external-link" title="open batch"/>
                                <field name="name"/>
                                <field name="prs" widget="many2many_tags"/>
                            </tree>
                        </field>
                    </group>
                    <group string="PRs">
                        <field name="pr_ids" colspan="4" nolabel="1" readonly="1">
                            <tree>
                                <button type="object" name="get_formview_action" icon="fa-external-link" title="open pr"/>
                                <field name="display_name"/>
                                <field name="github_url" widget="url"/>
                                <field name="url" widget="url"/>
                            </tree>
                        </field>
                    </group>
                </sheet>
            </form>
        </field>