[FIX] forwardport: storage of old garbage, repo cache sizes

Since the forwardport bot works off of PRs, when it was created
leveraging GitHub's magic refs (refs/pull/*/head) seemed sensible:
when updating the cache repo, the magic refs would be updated
alongside the normal branches, and the forward-porting would have all
the commits available.
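
For context, the cache repo used to be configured with fetch specs
covering both the branches and the pull refs, roughly as in this
simplified sketch of the removed setup (`source_url` and `repo_dir`
are placeholders):

    # Sketch of the previous cache-repo setup (simplified from the removed
    # code in the diff below): a bare clone whose fetch specs mirror both
    # the branches and GitHub's magic pull refs, so every `git fetch`
    # keeps every PR head around forever.
    import subprocess

    def setup_cache_repo(source_url, repo_dir):
        subprocess.run(['git', 'clone', '--bare', source_url, repo_dir], check=True)
        # bare repos have no fetch spec by default, so add explicit ones
        subprocess.run([
            'git', '-C', repo_dir, 'config', '--add',
            'remote.origin.fetch', '+refs/heads/*:refs/heads/*',
        ], check=True)
        subprocess.run([
            'git', '-C', repo_dir, 'config', '--add',
            'remote.origin.fetch', '+refs/pull/*/head:refs/heads/pull/*',
        ], check=True)

Per the diff below, only the `refs/heads/*` spec is kept; the pull-ref
spec is dropped.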

Turns out there are a few issues with this:

- the magic refs have become a bit unreliable, so we need a fallback
  (b1c16fce8768080d30430f4e6d3788b60ce13de7)
- the forwardport bot only needs the commits on a transient basis, but
  the magic refs live forever and diverge from all other content
  (since we rarely `merge` PRs); for a large repo with lots of PRs
  this leads to a significant inflation of the repo's size (both
  on-disk and in object count), e.g. odoo/odoo has about 25% more
  objects with the pull refs included (3486550 to 4395319) and takes
  nearly 50% more space on disk (1.21G to 1.77G); the sketch below
  shows how such figures can be checked
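
The inflation figures above can be checked with stock git tooling; a
minimal sketch (assuming a freshly packed bare cache clone at
`repo_dir`):

    # Minimal sketch for checking the inflation figures:
    # `git count-objects -v` reports the packed object count (`in-pack`)
    # and the on-disk pack size (`size-pack`, in KiB).
    import subprocess

    def repo_stats(repo_dir):
        out = subprocess.run(
            ['git', '-C', repo_dir, 'count-objects', '-v'],
            check=True, capture_output=True, text=True,
        ).stdout
        # output is a series of "key: value" lines
        return dict(line.split(': ', 1) for line in out.splitlines())

Comparing `in-pack` and `size-pack` with and without the pull refs
gives the object-count and disk-size deltas quoted above.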

As a result, we can just stop configuring the pull refs entirely, and
instead fetch the heads (~refs) we need as we need them. This can be a
touch more expensive at times as it requires two `fetch` calls, and
we'll have a few redundant calls since every forward port of a chain
requires a fetch of the root's head. *However*, it avoids retrieving
PR data for PRs which never get forward-ported (e.g. PRs to master),
and the cache repo no longer picks up parasitic updates from PRs being
ignored or eventually closed.
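
In practice the new flow boils down to fetching a single head and then
checking it is actually present, along these lines (a sketch mirroring
the diff below; `working_copy`, `source_url` and `head_sha` are
placeholders):

    # Sketch of the on-demand fetch: pull a single head (by sha) into the
    # working copy, then verify the object exists before using it.
    import subprocess

    def fetch_head(working_copy, source_url, head_sha):
        subprocess.run(
            ['git', '-C', working_copy, 'fetch', source_url, head_sha],
            check=True,
        )
        # `cat-file -e` exits non-zero when the object is missing
        if subprocess.run(
            ['git', '-C', working_copy, 'cat-file', '-e', head_sha],
        ).returncode:
            raise RuntimeError(f"unable to find expected head {head_sha}")

In the diff below this fetch happens in the per-port working copy
rather than in the shared cache repo.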

Eventually this could be optimised further (see the sketch after this
list) by

- performing an update of the cache repo at the start of the cron iff
  there are batches to port
- creating a temp clone for the batch
- fetching the heads of all PRs to forward port in the temp clone in a
  single `fetch`
- performing all the ports by cloning the temp clone (and not
  `fetch`-ing into those)
- then cleaning up the temp clone after having cleaned up individual
  forward port clones
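
A rough, purely illustrative outline of that potential flow (all names
are hypothetical, error handling elided):

    # Purely illustrative outline of the optimisation sketched above: one
    # cache update, one temp clone, one batched fetch of all PR heads to
    # port, then per-port clones of the temp clone (which never fetch).
    import shutil
    import subprocess
    import tempfile

    def port_batch(cache_dir, source_url, heads, port_one):
        # only called when there are batches to port, so the cache update
        # happens at most once per cron iteration
        subprocess.run(['git', '-C', cache_dir, 'fetch'], check=True)
        temp_clone = tempfile.mkdtemp()
        try:
            subprocess.run(['git', 'clone', '--bare', cache_dir, temp_clone], check=True)
            # single fetch for every head to forward-port
            subprocess.run(['git', '-C', temp_clone, 'fetch', source_url, *heads], check=True)
            for head in heads:
                # `port_one` stands in for the actual forward port of one PR;
                # it would clone `temp_clone` rather than fetch into it
                port_one(temp_clone, head)
        finally:
            shutil.rmtree(temp_clone)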

Small followup for #489
Xavier Morel 2022-11-08 12:02:45 +01:00
parent c35b721f0e
commit e20277c6ad

@@ -793,6 +793,14 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
b.prs[0]._schedule_fp_followup()
return b
@property
def _source_url(self):
return 'https://{}:{}@github.com/{}'.format(
self.repository.project_id.fp_github_name or '',
self.repository.project_id.fp_github_token,
self.repository.name,
)
def _create_fp_branch(self, target_branch, fp_branch_name, cleanup):
""" Creates a forward-port for the current PR to ``target_branch`` under
``fp_branch_name``.
@@ -804,7 +812,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
present the conflict information is composed of the hash of the
conflicting commit, the stderr and stdout of the failed
cherrypick and a list of all PR commit hashes
:rtype: (None | (str, str, str, list[str]), Repo)
:rtype: (None | (str, str, str, list[commit]), Repo)
"""
logger = _logger.getChild(str(self.id))
root = self._get_root()
@@ -813,21 +821,8 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
self.display_name, root.display_name, target_branch.name
)
source = self._get_local_directory()
check_commit = partial(source.check(False).cat_file, e=root.head)
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\
.fetch('origin')
logger.info("Updated %s:\n%s", source._directory, r.stdout.decode())
if check_commit().returncode:
# fallback: try to fetch the commit directly if the magic ref doesn't work?
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT) \
.fetch('origin', root.head)
logger.info("Updated %s:\n%s", source._directory, r.stdout.decode())
if check_commit().returncode:
raise ForwardPortError(
f"During forward port of {self.display_name}, unable to find "
f"expected head of {root.display_name} ({root.head})"
)
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT).fetch()
logger.info("Updated cache repo %s:\n%s", source._directory, r.stdout.decode())
logger.info("Create working copy...")
working_copy = source.clone(
@@ -841,6 +836,21 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
)),
branch=target_branch.name
)
r = working_copy.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT) \
.fetch(self._source_url, root.head)
logger.info(
"Fetched head of %s into %s:\n%s",
root.display_name,
working_copy._directory,
r.stdout.decode()
)
if working_copy.check(False).cat_file(e=root.head).returncode:
raise ForwardPortError(
f"During forward port of {self.display_name}, unable to find "
f"expected head of {root.display_name} ({root.head})"
)
project_id = self.repository.project_id
# add target remote
working_copy.remote(
@@ -857,17 +867,17 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
root._cherry_pick(working_copy)
return None, working_copy
except CherrypickError as e:
h, out, err, commits = e.args
# using git diff | git apply -3 to get the entire conflict set
# turns out to not work correctly: in case files have been moved
# / removed (which turns out to be a common source of conflicts
# when forward-porting) it'll just do nothing to the working copy
# so the "conflict commit" will be empty
# switch to a squashed-pr branch
root_branch = 'origin/pull/%d' % root.number
working_copy.checkout('-bsquashed', root_branch)
root_commits = root.commits()
working_copy.check(True).checkout('-bsquashed', root.head)
# commits returns oldest first, so youngest (head) last
head_commit = root_commits[-1]['commit']
head_commit = commits[-1]['commit']
to_tuple = operator.itemgetter('name', 'email')
to_dict = lambda term, vals: {
@@ -876,7 +886,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
'GIT_%s_DATE' % term: vals[2],
}
authors, committers = set(), set()
for c in (c['commit'] for c in root_commits):
for c in (c['commit'] for c in commits):
authors.add(to_tuple(c['author']))
committers.add(to_tuple(c['committer']))
fp_authorship = (project_id.fp_github_name, '', '')
@@ -890,7 +900,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
'GIT_COMMITTER_DATE': '',
})
# squash to a single commit
conf.reset('--soft', root_commits[0]['parents'][0]['sha'])
conf.reset('--soft', commits[0]['parents'][0]['sha'])
conf.commit(a=True, message="temp")
squashed = conf.stdout().rev_parse('HEAD').stdout.strip().decode()
@@ -901,7 +911,6 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
.with_config(check=False)\
.cherry_pick(squashed, no_commit=True)
status = conf.stdout().status(short=True, untracked_files='no').stdout.decode()
h, out, err, hh = e.args
if err.strip():
err = err.rstrip() + '\n----------\nstatus:\n' + status
else:
@@ -909,9 +918,9 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
# if there was a single commit, reuse its message when committing
# the conflict
# TODO: still add conflict information to this?
if len(root_commits) == 1:
msg = root._make_fp_message(root_commits[0])
conf.with_config(input=str(msg).encode())\
if len(commits) == 1:
msg = root._make_fp_message(commits[0])
conf.with_config(input=str(msg).encode()) \
.commit(all=True, allow_empty=True, file='-')
else:
conf.commit(
@@ -923,7 +932,7 @@ stdout:
stderr:
%s
""" % (h, out, err))
return (h, out, err, hh), working_copy
return (h, out, err, [c['sha'] for c in commits]), working_copy
def _cherry_pick(self, working_copy):
""" Cherrypicks ``self`` into the working copy
@@ -982,7 +991,7 @@ stderr:
commit_sha,
r.stdout.decode(),
_clean_rename(r.stderr.decode()),
[commit['sha'] for commit in commits]
commits
)
msg = self._make_fp_message(commit)
@@ -1033,22 +1042,15 @@ stderr:
return git(repo_dir)
else:
_logger.info("Cloning out %s to %s", self.repository.name, repo_dir)
subprocess.run([
'git', 'clone', '--bare',
'https://{}:{}@github.com/{}'.format(
self.repository.project_id.fp_github_name or '',
self.repository.project_id.fp_github_token,
self.repository.name,
),
str(repo_dir)
], check=True)
# add PR branches as local but namespaced (?)
subprocess.run(['git', 'clone', '--bare', self._source_url, str(repo_dir)], check=True)
# bare repos don't have fetch specs by default, and fetching *into*
# them is a pain in the ass, configure fetch specs so `git fetch`
# works properly
repo = git(repo_dir)
# bare repos don't have a fetch spec by default (!) so adding one
# removes the default behaviour and stops fetching the base
# branches unless we add an explicit fetch spec for them
repo.config('--add', 'remote.origin.fetch', '+refs/heads/*:refs/heads/*')
repo.config('--add', 'remote.origin.fetch', '+refs/pull/*/head:refs/heads/pull/*')
# negative refspecs require
# repo.config('--add', 'remote.origin.fetch', '^refs/heads/tmp.*')
# repo.config('--add', 'remote.origin.fetch', '^refs/heads/staging.*')
return repo
def _outstanding(self, cutoff):