runbot/runbot_merge/github.py

375 lines
13 KiB
Python
Raw Normal View History

import collections.abc
import itertools
import json
import logging
import logging.handlers
import os
import pathlib
import pprint
[IMP] runbot_merge: ensure at least 1s between mutating GH calls Mostly a temporary safety feature after the events of 07-31: it's still not clear whether that was a one-off issue or a change in policy (I was not able to reproduce locally even doing several set_refs a second) and the gh support is not super talkative, but it probably doesn't hurt to commit the workaround until #247 gets implemented. On 2023-07-31, around 08:30 UTC, `set_ref` started failing, a lot (although oddly enough not continuously), with the unhelpful message that > 422: Reference cannot be updated This basically broke all stagings, until a workaround was implemented by adding a 1s sleep before `set_ref` to ensure no more than 1 `set_ref` per second, which kinda sorta has been the github recommendation forever but had never been an issue before. Contributing to this suspicion is that in late 2022, the documentation of error 422 on `PATCH git/refs/{ref}` was updated to: > Validation failed, or the endpoint has been spammed. Still would be nice if GH was clear about it and sent a 429 instead. Technically the recommendation is: > If you're making a large number of POST, PATCH, PUT, or DELETE > requests for a single user or client ID, wait at least one second > between each request. So... actually implement that. On a per-worker basis as for the most part these are serial processes (e.g. crons), we can still get above the rate limit due to concurrent crons but it should be less likely. Also take `Retry-After` in account, can't hurt, though we're supposed to retry just the request rather than abort the entire thing. Maybe a future update can improve this handling. Would also be nice to take `X-RateLimit` in account, although that's supposed to apply to *all* requests so we'd need a second separate timestamp to track it. Technically that's probably also the case for `Retry-After`. And fixing #247 should cut down drastically on the API calls traffic as staging is a very API-intensive process, especially with the sanity checks we had to add, these days we might be at 4 calls per commit per PR, and up to 80 PRs/staging (5 repositories and 16 batches per staging), with 13 live branches (though realistically only 6-7 have significant traffic, and only 1~2 get close to filling their staging slots).
2023-08-11 16:50:38 +07:00
import time
import unicodedata
from typing import Iterable, List, TypedDict, Literal
import requests
import werkzeug.urls
import odoo.netsvc
from odoo.tools import topological_sort, config
from . import exceptions, utils
class MergeError(Exception): ...
def _is_json(r):
return r and r.headers.get('content-type', '').startswith(('application/json', 'application/javascript'))
_logger = logging.getLogger(__name__)
_gh = logging.getLogger('github_requests')
def _init_gh_logger():
""" Log all GH requests / responses so we have full tracking, but put them
in a separate file if we're logging to a file
"""
if not config['logfile']:
return
original = pathlib.Path(config['logfile'])
new = original.with_name('github_requests')\
.with_suffix(original.suffix)
if os.name == 'posix':
handler = logging.handlers.WatchedFileHandler(str(new))
else:
handler = logging.FileHandler(str(new))
handler.setFormatter(odoo.netsvc.DBFormatter(
'%(asctime)s %(pid)s %(levelname)s %(dbname)s %(name)s: %(message)s'
))
_gh.addHandler(handler)
_gh.propagate = False
if odoo.netsvc._logger_init:
_init_gh_logger()
SimpleUser = TypedDict('SimpleUser', {
'login': str,
'url': str,
'type': Literal['User', 'Organization'],
})
Authorship = TypedDict('Authorship', {
'name': str,
'email': str,
})
Commit = TypedDict('Commit', {
'tree': str,
'url': str,
'message': str,
# optional when creating a commit
'author': Authorship,
'committer': Authorship,
'comments_count': int,
})
CommitLink = TypedDict('CommitLink', {
'html_url': str,
'sha': str,
'url': str,
})
PrCommit = TypedDict('PrCommit', {
'url': str,
'sha': str,
'commit': Commit,
# optional when creating a commit (in which case it uses the current user)
'author': SimpleUser,
'committer': SimpleUser,
'parents': List[CommitLink],
# not actually true but we're smuggling stuff via that key
'new_tree': str,
})
2023-08-10 21:14:33 +07:00
GH_LOG_PATTERN = """=> {method} {path}{qs}{body}
<= {r.status_code} {r.reason}
{headers}
{body2}
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""
class GH(object):
def __init__(self, token, repo):
self._url = 'https://api.github.com'
self._repo = repo
[IMP] runbot_merge: ensure at least 1s between mutating GH calls Mostly a temporary safety feature after the events of 07-31: it's still not clear whether that was a one-off issue or a change in policy (I was not able to reproduce locally even doing several set_refs a second) and the gh support is not super talkative, but it probably doesn't hurt to commit the workaround until #247 gets implemented. On 2023-07-31, around 08:30 UTC, `set_ref` started failing, a lot (although oddly enough not continuously), with the unhelpful message that > 422: Reference cannot be updated This basically broke all stagings, until a workaround was implemented by adding a 1s sleep before `set_ref` to ensure no more than 1 `set_ref` per second, which kinda sorta has been the github recommendation forever but had never been an issue before. Contributing to this suspicion is that in late 2022, the documentation of error 422 on `PATCH git/refs/{ref}` was updated to: > Validation failed, or the endpoint has been spammed. Still would be nice if GH was clear about it and sent a 429 instead. Technically the recommendation is: > If you're making a large number of POST, PATCH, PUT, or DELETE > requests for a single user or client ID, wait at least one second > between each request. So... actually implement that. On a per-worker basis as for the most part these are serial processes (e.g. crons), we can still get above the rate limit due to concurrent crons but it should be less likely. Also take `Retry-After` in account, can't hurt, though we're supposed to retry just the request rather than abort the entire thing. Maybe a future update can improve this handling. Would also be nice to take `X-RateLimit` in account, although that's supposed to apply to *all* requests so we'd need a second separate timestamp to track it. Technically that's probably also the case for `Retry-After`. And fixing #247 should cut down drastically on the API calls traffic as staging is a very API-intensive process, especially with the sanity checks we had to add, these days we might be at 4 calls per commit per PR, and up to 80 PRs/staging (5 repositories and 16 batches per staging), with 13 live branches (though realistically only 6-7 have significant traffic, and only 1~2 get close to filling their staging slots).
2023-08-11 16:50:38 +07:00
self._last_update = 0
session = self._session = requests.Session()
2018-03-26 18:08:49 +07:00
session.headers['Authorization'] = 'token {}'.format(token)
session.headers['Accept'] = 'application/vnd.github.symmetra-preview+json'
def _log_gh(self, logger: logging.Logger, response: requests.Response, level: int = logging.INFO, extra=None):
""" Logs a pair of request / response to github, to the specified
logger, at the specified level.
Tries to format all the information (including request / response
bodies, at least in part) so we have as much information as possible
for post-mortems.
"""
req = response.request
url = werkzeug.urls.url_parse(req.url)
if url.netloc != 'api.github.com':
return
body = '' if not req.body else ('\n' + pprint.pformat(json.loads(req.body.decode()), indent=4))
body2 = ''
if response.content:
if _is_json(response):
body2 = pprint.pformat(response.json(), depth=4)
elif response.encoding is not None:
body2 = response.text
else: # fallback: universal decoding & replace nonprintables
body2 = ''.join(
'\N{REPLACEMENT CHARACTER}' if unicodedata.category(c) == 'Cc' else c
for c in response.content.decode('iso-8859-1')
)
logger.log(level, GH_LOG_PATTERN.format(
# requests data
method=req.method, path=url.path, qs=url.query, body=body,
# response data
r=response,
headers='\n'.join(
'\t%s: %s' % (h, v) for h, v in response.headers.items()
),
body2=utils.shorten(body2.strip(), 400)
), extra=extra)
def __call__(self, method, path, params=None, json=None, check=True):
"""
:type check: bool | dict[int:Exception]
"""
[IMP] runbot_merge: ensure at least 1s between mutating GH calls Mostly a temporary safety feature after the events of 07-31: it's still not clear whether that was a one-off issue or a change in policy (I was not able to reproduce locally even doing several set_refs a second) and the gh support is not super talkative, but it probably doesn't hurt to commit the workaround until #247 gets implemented. On 2023-07-31, around 08:30 UTC, `set_ref` started failing, a lot (although oddly enough not continuously), with the unhelpful message that > 422: Reference cannot be updated This basically broke all stagings, until a workaround was implemented by adding a 1s sleep before `set_ref` to ensure no more than 1 `set_ref` per second, which kinda sorta has been the github recommendation forever but had never been an issue before. Contributing to this suspicion is that in late 2022, the documentation of error 422 on `PATCH git/refs/{ref}` was updated to: > Validation failed, or the endpoint has been spammed. Still would be nice if GH was clear about it and sent a 429 instead. Technically the recommendation is: > If you're making a large number of POST, PATCH, PUT, or DELETE > requests for a single user or client ID, wait at least one second > between each request. So... actually implement that. On a per-worker basis as for the most part these are serial processes (e.g. crons), we can still get above the rate limit due to concurrent crons but it should be less likely. Also take `Retry-After` in account, can't hurt, though we're supposed to retry just the request rather than abort the entire thing. Maybe a future update can improve this handling. Would also be nice to take `X-RateLimit` in account, although that's supposed to apply to *all* requests so we'd need a second separate timestamp to track it. Technically that's probably also the case for `Retry-After`. And fixing #247 should cut down drastically on the API calls traffic as staging is a very API-intensive process, especially with the sanity checks we had to add, these days we might be at 4 calls per commit per PR, and up to 80 PRs/staging (5 repositories and 16 batches per staging), with 13 live branches (though realistically only 6-7 have significant traffic, and only 1~2 get close to filling their staging slots).
2023-08-11 16:50:38 +07:00
if method.casefold() != 'get':
to_sleep = 1. - (time.time() - self._last_update)
if to_sleep > 0:
time.sleep(to_sleep)
path = f'/repos/{self._repo}/{path}'
r = self._session.request(method, self._url + path, params=params, json=json)
[IMP] runbot_merge: ensure at least 1s between mutating GH calls Mostly a temporary safety feature after the events of 07-31: it's still not clear whether that was a one-off issue or a change in policy (I was not able to reproduce locally even doing several set_refs a second) and the gh support is not super talkative, but it probably doesn't hurt to commit the workaround until #247 gets implemented. On 2023-07-31, around 08:30 UTC, `set_ref` started failing, a lot (although oddly enough not continuously), with the unhelpful message that > 422: Reference cannot be updated This basically broke all stagings, until a workaround was implemented by adding a 1s sleep before `set_ref` to ensure no more than 1 `set_ref` per second, which kinda sorta has been the github recommendation forever but had never been an issue before. Contributing to this suspicion is that in late 2022, the documentation of error 422 on `PATCH git/refs/{ref}` was updated to: > Validation failed, or the endpoint has been spammed. Still would be nice if GH was clear about it and sent a 429 instead. Technically the recommendation is: > If you're making a large number of POST, PATCH, PUT, or DELETE > requests for a single user or client ID, wait at least one second > between each request. So... actually implement that. On a per-worker basis as for the most part these are serial processes (e.g. crons), we can still get above the rate limit due to concurrent crons but it should be less likely. Also take `Retry-After` in account, can't hurt, though we're supposed to retry just the request rather than abort the entire thing. Maybe a future update can improve this handling. Would also be nice to take `X-RateLimit` in account, although that's supposed to apply to *all* requests so we'd need a second separate timestamp to track it. Technically that's probably also the case for `Retry-After`. And fixing #247 should cut down drastically on the API calls traffic as staging is a very API-intensive process, especially with the sanity checks we had to add, these days we might be at 4 calls per commit per PR, and up to 80 PRs/staging (5 repositories and 16 batches per staging), with 13 live branches (though realistically only 6-7 have significant traffic, and only 1~2 get close to filling their staging slots).
2023-08-11 16:50:38 +07:00
if method.casefold() != 'get':
self._last_update = time.time() + int(r.headers.get('Retry-After', 0))
self._log_gh(_gh, r)
if check:
try:
if isinstance(check, collections.abc.Mapping):
exc = check.get(r.status_code)
if exc:
raise exc(r.text)
if r.status_code >= 400:
raise requests.HTTPError(r.text, response=r)
except Exception:
self._log_gh(_logger, r, level=logging.ERROR, extra={
'github-request-id': r.headers.get('x-github-request-id'),
})
raise
return r
def user(self, username):
r = self._session.get("{}/users/{}".format(self._url, username))
r.raise_for_status()
return r.json()
def head(self, branch: str) -> str:
[IMP] runbot_merge: sanity check when rebasing Ensure that the commits we're creating are based on the commit we're expecting. This is the second cause (and really the biggest issue) of the "Great Reset" of master on November 6: a previous commit explains the issue with non-linear github operations (update a branch, get the branch head, they don't match). The second issue is why @awa-odoo's PR was merged with a reversion of @tivisse's as part of its first commit. The stage for this issues is based on the incoherence noted above: having updated a branch, getting that branch's head afterward may still return the old head. However either delays allow that update to be visible *or* different operations can have different views of the system. Regardless it's possible that `repos/merges` "sees" a different branch head than a `git/refs/heads` which preceded it by a few milliseconds. This is an issue because github's API does not provide a generic "rebase" operation, and the operation is thus implemented by hand: 1. get the head of the branch we're trying to rebase a PR on 2. for each commit of the PR (oldest to newest), *merge* commit on the base and associate the merge commit with the original 3. reset the branch to the head we stored previously 4. for each commit of the PR, create a new commit with: - the metadata of the original - the tree of the merge commit - the "current head" as parent then update the "current head" to that commit's ref' If the head fetched at (1) and the one the first merge of (2) sees are different, the first commit created during (4) will look like it has not only its own changes but also all the changes between the two heads, as github records not changes but snapshots of working copies (that's what a git tree is, a complete snapshot of the entire state of a working copy). As a result, we end up not only with commits from a previous staging but the first commit of the next PR rollbacks the changes of those commits, making a mess of the entire thing twice over. And because the commits of the previous staging get reverted, even if there was a good reason for them to fail (not the case here it was a false positive) the new staging might just go through. As noted at the start, mitigate that by asserting that the merge commits created at (2) have the "base parent" (left parent / parent from the base branch) we were expecting, and cancel the staging if that's not the case. This can probably be removed if / when odoo/runbot#247 happens.
2019-11-07 18:00:42 +07:00
d = utils.backoff(
lambda: self('get', 'git/refs/heads/{}'.format(branch)).json(),
exc=requests.HTTPError
)
2018-03-26 18:08:49 +07:00
assert d['ref'] == 'refs/heads/{}'.format(branch)
assert d['object']['type'] == 'commit'
_logger.debug("head(%s, %s) -> %s", self._repo, branch, d['object']['sha'])
return d['object']['sha']
def commit(self, sha):
c = self('GET', 'git/commits/{}'.format(sha)).json()
_logger.debug('commit(%s, %s) -> %s', self._repo, sha, shorten(c['message']))
return c
def comment(self, pr, message):
# if the mergebot user has been blocked by the PR author, this will
# fail, but we don't want the closing of the PR to fail, or for the
# feedback cron to get stuck
try:
self('POST', 'issues/{}/comments'.format(pr), json={'body': message})
except requests.HTTPError as r:
if _is_json(r.response):
body = r.response.json()
if any(e.message == 'User is blocked' for e in (body.get('errors') or [])):
_logger.warning("comment(%s#%s) failed: user likely blocked", self._repo, pr)
return
raise
_logger.debug('comment(%s, %s, %s)', self._repo, pr, shorten(message))
def close(self, pr):
2018-03-26 18:08:49 +07:00
self('PATCH', 'pulls/{}'.format(pr), json={'state': 'closed'})
def change_tags(self, pr, remove, add):
labels_endpoint = 'issues/{}/labels'.format(pr)
tags_before = {label['name'] for label in self('GET', labels_endpoint).json()}
tags_after = (tags_before - remove) | add
# replace labels entirely
self('PUT', labels_endpoint, json={'labels': list(tags_after)})
_logger.debug('change_tags(%s, %s, from=%s, to=%s)', self._repo, pr, tags_before, tags_after)
def _check_updated(self, branch, to):
"""
:return: nothing if successful, the incorrect HEAD otherwise
"""
[IMP] runbot_merge: sanity check when rebasing Ensure that the commits we're creating are based on the commit we're expecting. This is the second cause (and really the biggest issue) of the "Great Reset" of master on November 6: a previous commit explains the issue with non-linear github operations (update a branch, get the branch head, they don't match). The second issue is why @awa-odoo's PR was merged with a reversion of @tivisse's as part of its first commit. The stage for this issues is based on the incoherence noted above: having updated a branch, getting that branch's head afterward may still return the old head. However either delays allow that update to be visible *or* different operations can have different views of the system. Regardless it's possible that `repos/merges` "sees" a different branch head than a `git/refs/heads` which preceded it by a few milliseconds. This is an issue because github's API does not provide a generic "rebase" operation, and the operation is thus implemented by hand: 1. get the head of the branch we're trying to rebase a PR on 2. for each commit of the PR (oldest to newest), *merge* commit on the base and associate the merge commit with the original 3. reset the branch to the head we stored previously 4. for each commit of the PR, create a new commit with: - the metadata of the original - the tree of the merge commit - the "current head" as parent then update the "current head" to that commit's ref' If the head fetched at (1) and the one the first merge of (2) sees are different, the first commit created during (4) will look like it has not only its own changes but also all the changes between the two heads, as github records not changes but snapshots of working copies (that's what a git tree is, a complete snapshot of the entire state of a working copy). As a result, we end up not only with commits from a previous staging but the first commit of the next PR rollbacks the changes of those commits, making a mess of the entire thing twice over. And because the commits of the previous staging get reverted, even if there was a good reason for them to fail (not the case here it was a false positive) the new staging might just go through. As noted at the start, mitigate that by asserting that the merge commits created at (2) have the "base parent" (left parent / parent from the base branch) we were expecting, and cancel the staging if that's not the case. This can probably be removed if / when odoo/runbot#247 happens.
2019-11-07 18:00:42 +07:00
r = self('get', 'git/refs/heads/{}'.format(branch), check=False)
if r.status_code == 200:
head = r.json()['object']['sha']
else:
head = '<Response [%s]: %s)>' % (r.status_code, r.text)
[IMP] runbot_merge: sanity check when rebasing Ensure that the commits we're creating are based on the commit we're expecting. This is the second cause (and really the biggest issue) of the "Great Reset" of master on November 6: a previous commit explains the issue with non-linear github operations (update a branch, get the branch head, they don't match). The second issue is why @awa-odoo's PR was merged with a reversion of @tivisse's as part of its first commit. The stage for this issues is based on the incoherence noted above: having updated a branch, getting that branch's head afterward may still return the old head. However either delays allow that update to be visible *or* different operations can have different views of the system. Regardless it's possible that `repos/merges` "sees" a different branch head than a `git/refs/heads` which preceded it by a few milliseconds. This is an issue because github's API does not provide a generic "rebase" operation, and the operation is thus implemented by hand: 1. get the head of the branch we're trying to rebase a PR on 2. for each commit of the PR (oldest to newest), *merge* commit on the base and associate the merge commit with the original 3. reset the branch to the head we stored previously 4. for each commit of the PR, create a new commit with: - the metadata of the original - the tree of the merge commit - the "current head" as parent then update the "current head" to that commit's ref' If the head fetched at (1) and the one the first merge of (2) sees are different, the first commit created during (4) will look like it has not only its own changes but also all the changes between the two heads, as github records not changes but snapshots of working copies (that's what a git tree is, a complete snapshot of the entire state of a working copy). As a result, we end up not only with commits from a previous staging but the first commit of the next PR rollbacks the changes of those commits, making a mess of the entire thing twice over. And because the commits of the previous staging get reverted, even if there was a good reason for them to fail (not the case here it was a false positive) the new staging might just go through. As noted at the start, mitigate that by asserting that the merge commits created at (2) have the "base parent" (left parent / parent from the base branch) we were expecting, and cancel the staging if that's not the case. This can probably be removed if / when odoo/runbot#247 happens.
2019-11-07 18:00:42 +07:00
if head == to:
_logger.debug("Sanity check ref update of %s to %s: ok", branch, to)
return
_logger.warning(
"Sanity check ref update of %s, expected %s got %s (response-id %s)",
branch, to, head,
r.headers.get('x-github-request-id')
)
return head
def fast_forward(self, branch, sha):
try:
2018-03-26 18:08:49 +07:00
self('patch', 'git/refs/heads/{}'.format(branch), json={'sha': sha})
_logger.debug('fast_forward(%s, %s, %s) -> OK', self._repo, branch, sha)
@utils.backoff(exc=exceptions.FastForwardError)
def _wait_for_update():
if not self._check_updated(branch, sha):
return
raise exceptions.FastForwardError(self._repo) \
from Exception("timeout: never saw %s" % sha)
except requests.HTTPError as e:
_logger.debug('fast_forward(%s, %s, %s) -> ERROR', self._repo, branch, sha, exc_info=True)
if e.response.status_code == 422:
try:
r = e.response.json()
except Exception:
pass
else:
if isinstance(r, dict) and 'message' in r:
e = Exception(r['message'].lower())
raise exceptions.FastForwardError(self._repo) from e
def set_ref(self, branch, sha):
# force-update ref
2018-03-26 18:08:49 +07:00
r = self('patch', 'git/refs/heads/{}'.format(branch), json={
'sha': sha,
'force': True,
}, check=False)
status0 = r.status_code
_logger.debug(
2023-08-10 21:14:33 +07:00
'set_ref(%s, %s, %s -> %s (%s)',
self._repo, branch, sha, status0,
'OK' if status0 == 200 else r.text or r.reason
)
if status0 == 200:
@utils.backoff(exc=AssertionError)
def _wait_for_update():
head = self._check_updated(branch, sha)
assert not head, "Sanity check ref update of %s, expected %s got %s" % (
branch, sha, head
)
return
# 422 makes no sense but that's what github returns, leaving 404 just
# in case
if status0 in (404, 422):
# fallback: create ref
status1 = self.create_ref(branch, sha)
if status1 == 201:
return
else:
status1 = None
raise AssertionError("set_ref failed(%s, %s)" % (status0, status1))
def create_ref(self, branch, sha):
r = self('post', 'git/refs', json={
'ref': 'refs/heads/{}'.format(branch),
'sha': sha,
}, check=False)
status = r.status_code
_logger.debug(
'ref_create(%s, %s, %s) -> %s (%s)',
self._repo, branch, sha, status,
'OK' if status == 201 else r.text or r.reason
)
if status == 201:
@utils.backoff(exc=AssertionError)
def _wait_for_update():
head = self._check_updated(branch, sha)
assert not head, \
f"Sanity check ref update of {branch}, expected {sha} got {head}"
return status
# fetch various bits of issues / prs to load them
def pr(self, number):
return (
self('get', 'issues/{}'.format(number)).json(),
self('get', 'pulls/{}'.format(number)).json()
)
def comments(self, number):
for page in itertools.count(1):
r = self('get', 'issues/{}/comments'.format(number), params={'page': page})
yield from r.json()
if not r.links.get('next'):
return
def reviews(self, number):
for page in itertools.count(1):
r = self('get', 'pulls/{}/reviews'.format(number), params={'page': page})
yield from r.json()
if not r.links.get('next'):
return
def commits_lazy(self, pr: int) -> Iterable[PrCommit]:
for page in itertools.count(1):
r = self('get', f'pulls/{pr}/commits', params={'page': page})
yield from r.json()
if not r.links.get('next'):
return
def commits(self, pr: int) -> List[PrCommit]:
""" Returns a PR's commits oldest first (that's what GH does &
is what we want)
"""
commits = list(self.commits_lazy(pr))
# map shas to the position the commit *should* have
idx = {
c: i
for i, c in enumerate(topological_sort({
c['sha']: [p['sha'] for p in c['parents']]
for c in commits
}))
}
return sorted(commits, key=lambda c: idx[c['sha']])
def statuses(self, h):
r = self('get', 'commits/{}/status'.format(h)).json()
return [{
'sha': r['sha'],
**s,
} for s in r['statuses']]
def shorten(s):
if not s:
return s
line1 = s.split('\n', 1)[0]
if len(line1) < 50:
return line1
return line1[:47] + '...'