[FIX] forwardport: storage of old garbage, repo cache sizes

Since the forwardport bot works off of PRs, when it was created
leveraging the magic refs of github (refs/pull/*/head) seemed
sensible: when updating the cache repo, the magic refs would be
updated alongside and the forward-porting would have all the commits
available.

Turns out there are a few issues with this:

- the magic refs have become a bit unreliable, so we need a fallback
  (b1c16fce8768080d30430f4e6d3788b60ce13de7)
- the forwardport bot only needs the commits on a transient basis, but
  the magic refs live forever and diverge from all other content
  (since we rarely `merge` PRs), for a large repo with lots of PRs
  this leads to a significant inflation in size of repo (both on-disk
  and objects count) e.g. odoo/odoo has about 25% more objects
  with the pull refs included (3486550 to 4395319) and takes nearly
  50% more space on disk (1.21G to 1.77)

As a result, we can just stop configuring the pull refs entirely, and
instead fetch the heads (~refs) we need as we need them. This can be a
touch more expensive at times as it requires two `fetch` calls, and
we'll have a few redundant calls as every forward port of a chain will
require a fetch of the root's head, *however* it will avoid retrieving
PR data for e.g. prs to master, as they don't get forward-ported, they
also won't get parasite updates from PRs being ignored or eventually
closed.

Eventually this could be optimised further by

- performing an update of the cache repo at the start of the cron iff
  there are batches to port
- creating a temp clone for the batch
- fetching the heads of all PRs to forward port in the temp clone in a
  single `fetch`
- performing all the ports by cloning the temp clone (and not
  `fetch`-ing into those)
- then cleaning up the temp clone after having cleaned up individual
  forward port clones

Small followup for #489
This commit is contained in:
Xavier Morel 2022-11-08 12:02:45 +01:00
parent c35b721f0e
commit e20277c6ad

View File

@ -793,6 +793,14 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
b.prs[0]._schedule_fp_followup() b.prs[0]._schedule_fp_followup()
return b return b
@property
def _source_url(self):
return 'https://{}:{}@github.com/{}'.format(
self.repository.project_id.fp_github_name or '',
self.repository.project_id.fp_github_token,
self.repository.name,
)
def _create_fp_branch(self, target_branch, fp_branch_name, cleanup): def _create_fp_branch(self, target_branch, fp_branch_name, cleanup):
""" Creates a forward-port for the current PR to ``target_branch`` under """ Creates a forward-port for the current PR to ``target_branch`` under
``fp_branch_name``. ``fp_branch_name``.
@ -804,7 +812,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
present the conflict information is composed of the hash of the present the conflict information is composed of the hash of the
conflicting commit, the stderr and stdout of the failed conflicting commit, the stderr and stdout of the failed
cherrypick and a list of all PR commit hashes cherrypick and a list of all PR commit hashes
:rtype: (None | (str, str, str, list[str]), Repo) :rtype: (None | (str, str, str, list[commit]), Repo)
""" """
logger = _logger.getChild(str(self.id)) logger = _logger.getChild(str(self.id))
root = self._get_root() root = self._get_root()
@ -813,21 +821,8 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
self.display_name, root.display_name, target_branch.name self.display_name, root.display_name, target_branch.name
) )
source = self._get_local_directory() source = self._get_local_directory()
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT).fetch()
check_commit = partial(source.check(False).cat_file, e=root.head) logger.info("Updated cache repo %s:\n%s", source._directory, r.stdout.decode())
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\
.fetch('origin')
logger.info("Updated %s:\n%s", source._directory, r.stdout.decode())
if check_commit().returncode:
# fallback: try to fetch the commit directly if the magic ref doesn't work?
r = source.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT) \
.fetch('origin', root.head)
logger.info("Updated %s:\n%s", source._directory, r.stdout.decode())
if check_commit().returncode:
raise ForwardPortError(
f"During forward port of {self.display_name}, unable to find "
f"expected head of {root.display_name} ({root.head})"
)
logger.info("Create working copy...") logger.info("Create working copy...")
working_copy = source.clone( working_copy = source.clone(
@ -841,6 +836,21 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
)), )),
branch=target_branch.name branch=target_branch.name
) )
r = working_copy.with_config(stdout=subprocess.PIPE, stderr=subprocess.STDOUT) \
.fetch(self._source_url, root.head)
logger.info(
"Fetched head of %s into %s:\n%s",
root.display_name,
working_copy._directory,
r.stdout.decode()
)
if working_copy.check(False).cat_file(e=root.head).returncode:
raise ForwardPortError(
f"During forward port of {self.display_name}, unable to find "
f"expected head of {root.display_name} ({root.head})"
)
project_id = self.repository.project_id project_id = self.repository.project_id
# add target remote # add target remote
working_copy.remote( working_copy.remote(
@ -857,17 +867,17 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
root._cherry_pick(working_copy) root._cherry_pick(working_copy)
return None, working_copy return None, working_copy
except CherrypickError as e: except CherrypickError as e:
h, out, err, commits = e.args
# using git diff | git apply -3 to get the entire conflict set # using git diff | git apply -3 to get the entire conflict set
# turns out to not work correctly: in case files have been moved # turns out to not work correctly: in case files have been moved
# / removed (which turns out to be a common source of conflicts # / removed (which turns out to be a common source of conflicts
# when forward-porting) it'll just do nothing to the working copy # when forward-porting) it'll just do nothing to the working copy
# so the "conflict commit" will be empty # so the "conflict commit" will be empty
# switch to a squashed-pr branch # switch to a squashed-pr branch
root_branch = 'origin/pull/%d' % root.number working_copy.check(True).checkout('-bsquashed', root.head)
working_copy.checkout('-bsquashed', root_branch)
root_commits = root.commits()
# commits returns oldest first, so youngest (head) last # commits returns oldest first, so youngest (head) last
head_commit = root_commits[-1]['commit'] head_commit = commits[-1]['commit']
to_tuple = operator.itemgetter('name', 'email') to_tuple = operator.itemgetter('name', 'email')
to_dict = lambda term, vals: { to_dict = lambda term, vals: {
@ -876,7 +886,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
'GIT_%s_DATE' % term: vals[2], 'GIT_%s_DATE' % term: vals[2],
} }
authors, committers = set(), set() authors, committers = set(), set()
for c in (c['commit'] for c in root_commits): for c in (c['commit'] for c in commits):
authors.add(to_tuple(c['author'])) authors.add(to_tuple(c['author']))
committers.add(to_tuple(c['committer'])) committers.add(to_tuple(c['committer']))
fp_authorship = (project_id.fp_github_name, '', '') fp_authorship = (project_id.fp_github_name, '', '')
@ -890,7 +900,7 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
'GIT_COMMITTER_DATE': '', 'GIT_COMMITTER_DATE': '',
}) })
# squash to a single commit # squash to a single commit
conf.reset('--soft', root_commits[0]['parents'][0]['sha']) conf.reset('--soft', commits[0]['parents'][0]['sha'])
conf.commit(a=True, message="temp") conf.commit(a=True, message="temp")
squashed = conf.stdout().rev_parse('HEAD').stdout.strip().decode() squashed = conf.stdout().rev_parse('HEAD').stdout.strip().decode()
@ -901,7 +911,6 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
.with_config(check=False)\ .with_config(check=False)\
.cherry_pick(squashed, no_commit=True) .cherry_pick(squashed, no_commit=True)
status = conf.stdout().status(short=True, untracked_files='no').stdout.decode() status = conf.stdout().status(short=True, untracked_files='no').stdout.decode()
h, out, err, hh = e.args
if err.strip(): if err.strip():
err = err.rstrip() + '\n----------\nstatus:\n' + status err = err.rstrip() + '\n----------\nstatus:\n' + status
else: else:
@ -909,9 +918,9 @@ This PR targets %s and is part of the forward-port chain. Further PRs will be cr
# if there was a single commit, reuse its message when committing # if there was a single commit, reuse its message when committing
# the conflict # the conflict
# TODO: still add conflict information to this? # TODO: still add conflict information to this?
if len(root_commits) == 1: if len(commits) == 1:
msg = root._make_fp_message(root_commits[0]) msg = root._make_fp_message(commits[0])
conf.with_config(input=str(msg).encode())\ conf.with_config(input=str(msg).encode()) \
.commit(all=True, allow_empty=True, file='-') .commit(all=True, allow_empty=True, file='-')
else: else:
conf.commit( conf.commit(
@ -923,7 +932,7 @@ stdout:
stderr: stderr:
%s %s
""" % (h, out, err)) """ % (h, out, err))
return (h, out, err, hh), working_copy return (h, out, err, [c['sha'] for c in commits]), working_copy
def _cherry_pick(self, working_copy): def _cherry_pick(self, working_copy):
""" Cherrypicks ``self`` into the working copy """ Cherrypicks ``self`` into the working copy
@ -982,7 +991,7 @@ stderr:
commit_sha, commit_sha,
r.stdout.decode(), r.stdout.decode(),
_clean_rename(r.stderr.decode()), _clean_rename(r.stderr.decode()),
[commit['sha'] for commit in commits] commits
) )
msg = self._make_fp_message(commit) msg = self._make_fp_message(commit)
@ -1033,22 +1042,15 @@ stderr:
return git(repo_dir) return git(repo_dir)
else: else:
_logger.info("Cloning out %s to %s", self.repository.name, repo_dir) _logger.info("Cloning out %s to %s", self.repository.name, repo_dir)
subprocess.run([ subprocess.run(['git', 'clone', '--bare', self._source_url, str(repo_dir)], check=True)
'git', 'clone', '--bare', # bare repos don't have fetch specs by default, and fetching *into*
'https://{}:{}@github.com/{}'.format( # them is a pain in the ass, configure fetch specs so `git fetch`
self.repository.project_id.fp_github_name or '', # works properly
self.repository.project_id.fp_github_token,
self.repository.name,
),
str(repo_dir)
], check=True)
# add PR branches as local but namespaced (?)
repo = git(repo_dir) repo = git(repo_dir)
# bare repos don't have a fetch spec by default (!) so adding one
# removes the default behaviour and stops fetching the base
# branches unless we add an explicit fetch spec for them
repo.config('--add', 'remote.origin.fetch', '+refs/heads/*:refs/heads/*') repo.config('--add', 'remote.origin.fetch', '+refs/heads/*:refs/heads/*')
repo.config('--add', 'remote.origin.fetch', '+refs/pull/*/head:refs/heads/pull/*') # negative refspecs require
# repo.config('--add', 'remote.origin.fetch', '^refs/heads/tmp.*')
# repo.config('--add', 'remote.origin.fetch', '^refs/heads/staging.*')
return repo return repo
def _outstanding(self, cutoff): def _outstanding(self, cutoff):