runbot/runbot_merge/models/utils.py

import logging
from contextvars import ContextVar
from typing import Tuple
from xml.etree.ElementTree import Element, tostring

import markdown.inlinepatterns
import markdown.treeprocessors
from markupsafe import escape, Markup


def enum(model: str, field: str) -> Tuple[str, str]:
    """Build the enum type name for ``field`` of ``model``.

    Dots in the model name are replaced by underscores, then the field name
    and a ``_type`` suffix are appended. The same name is returned twice, as
    a 2-tuple.
    """
    type_name = "{}_{}_type".format(model.replace(".", "_"), field)
    return type_name, type_name


def readonly(_):
    """Unconditionally refuse a write by raising ``TypeError``.

    The single positional argument is ignored.

    NOTE(review): presumably meant to be plugged in as the inverse of
    computed fields which must not be set directly -- confirm with callers.
    """
    message = "Field is readonly"
    raise TypeError(message)


# Full name of the repository a description is being rendered for; read by
# the github link processor while a conversion is running.
DFM_CONTEXT_REPO = ContextVar("dfm_context", default="")


def dfm(repository: str, text: str) -> Markup:
    """Render ``text`` to HTML following the Odoo PR Description Rules.

    Those rules are basically:

    - GFM
    - minus raw HTML (?) -- the input is escaped before being converted
    - + github's autolinking (https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/autolinked-references-and-urls)
    - + bespoke autolinking of OPW and Task links to odoo.com

    :param repository: stored in ``DFM_CONTEXT_REPO`` for the duration of the
                       conversion, and restored afterwards
    :param text: description to render
    :returns: the rendered HTML, wrapped in ``Markup``
    """
    token = DFM_CONTEXT_REPO.set(repository)
    try:
        # the renderer is shared between calls, drop whatever state the
        # previous conversion left behind before running a new one
        dfm_renderer.reset()
        rendered = dfm_renderer.convert(escape(text))
        return Markup(rendered)
    finally:
        # always restore the previous context value, even on failure
        DFM_CONTEXT_REPO.reset(token)


class DfmExtension(markdown.extensions.Extension):
    """Markdown extension wiring up everything ``dfm`` rendering relies on:
    a set of stock extensions, the github and odoo inline link processors,
    and the tree processor stripping unwanted links.
    """
    def extendMarkdown(self, md):
        stock_extensions = ['fenced_code', 'footnotes', 'nl2br', 'sane_lists', 'tables']
        md.registerExtensions(stock_extensions, configs={})

        inline_patterns = md.inlinePatterns
        inline_patterns.register(GithubLinking(md), 'githublinking', 123)
        inline_patterns.register(OdooLinking(md), 'odoolinking', 124)

        # The unlinker would ideally run before the prettifier so that
        # prettification is done correctly, however the prettifier seems
        # unlikely to handle the variable nature of links correctly, and we
        # likely want to run after the unescaper.
        tree_processors = md.treeprocessors
        tree_processors.register(Unlinker(), "unlinker", -10)

class GithubLinking(markdown.inlinepatterns.InlineProcessor):
    """Inline processor converting github-style references to links.

    Github references are both very varied and contextual: how they resolve
    depends on the repository they are written in (technically github also
    uses its backend to check that the people & objects exist, which is not
    an option here).

    That context is not passed to the processor, so it is faked through a
    contextvar: ``DFM_CONTEXT_REPO`` should hold the full name of the
    repository the text is being resolved from.

    When ``DFM_CONTEXT_REPO`` is empty, a warning is logged and the matched
    text is left as-is.
    """
    # Capture groups are positional, per alternative:
    #   1, 2, 3 -- repository, number and fragment of an issue / PR URL
    #   4, 5    -- repository and hash of a commit URL
    #   6, 7    -- repository and number of ``owner/repo#number``
    #   8, 9    -- prefix (``GH-`` or ``#``) and number of a short reference
    #   10, 11  -- owner (or owner/repo) and hash of ``owner@sha``
    #   12      -- standalone hash
    _PATTERN = r"""(?xi)
(?:
    \bhttps://github.com/([\w\.-]+/[\w\.-]+)/(?:issues|pull)/(\d+)(\#[\w-]+)?
|   \bhttps://github.com/([\w\.-]+/[\w\.-]+)/commit/([a-f0-9]+)
|   \b([\w\.-]+/[\w\.-]+)\#(\d+)
|   (\bGH-|(?:^|(?<=\s))\#)(\d+)
|   \b(?:
        # user@sha or user/repo@sha
        ([\w\.-]+(?:/[\w\.-]+)?)
        @
        ([0-9a-f]{7,40})
    )
|   \b(
        # a sha is 7~40 hex digits but that means any million+ number matches
        # which is probably wrong. So ensure there's at least one letter in the
        # set by using a positive lookahead which looks for a sequence of at
        # least 0 numbers followed by a-f
        (?=[0-9]{0,39}?[a-f])
        [0-9a-f]{7,40}
    )
)
\b
"""

    def __init__(self, md=None):
        super().__init__(self._PATTERN, md)

    def handleMatch(self, m, data):
        """Build the replacement for one match of the pattern.

        :returns: ``(node, start, end)`` where ``start`` and ``end`` are the
                  span of the match, and ``node`` is either an ``a`` element
                  or the matched text itself if no context is available
        """
        start, end = m.span()
        context = DFM_CONTEXT_REPO.get()
        if not context:
            log = logging.getLogger(__name__).getChild("github_links")
            log.warning("missing context for rewriting github links, skipping")
            return m[0], start, end

        # only one alternative of the pattern matched, find out which one
        # through the group holding its number / hash
        if m[2]:  # full issue / PR
            link = self._issue_link(context, m[1], m[2], m[3])
        elif m[5]:  # long hash
            link = self._commit_link(context, m[4], m[5])
        elif m[7]:  # short issue with repo
            link = self._issue_link(context, m[6], m[7], m[3])
        elif m[9]:  # short issue without repo
            # "GH" is a marker for the ``GH-`` prefix, not a repository name
            marker = None if m[8] == '#' else "GH"
            link = self._issue_link(context, marker, m[9], m[3])
        elif m[11]:  # medium hash
            link = self._commit_link(context, m[10], m[11])
        else:  # hash only
            link = self._commit_link(context, None, m[12])
        return link, start, end

    @staticmethod
    def _issue_link(context, repo, issue, fragment):
        """Create the link to issue / PR ``issue`` of ``repo``, resolved
        relative to the ``context`` repository.
        """
        if repo == "GH":
            label = f"GH-{issue}"
            repo = context
        elif repo is None or repo == context:
            label = f"#{issue}"
            repo = context
        else:
            label = f"{repo}#{issue}"

        # only fragments pointing to a comment are preserved
        if fragment and fragment.startswith('#issuecomment-'):
            label += ' (comment)'
        else:
            fragment = ''

        link = Element("a")
        link.text = label
        link.set('href', f"https://github.com/{repo}/issues/{issue}{fragment}")
        return link

    @staticmethod
    def _commit_link(context, repo, commit):
        """Create the link to ``commit`` in ``repo``, resolved relative to the
        ``context`` repository.
        """
        context_name = context.split('/')[-1]
        if repo is None or repo == context:
            prefix = ""
            repo = context
        elif '/' not in repo:  # owner-only
            prefix = repo
            # NOTE: I assume in reality we're supposed to find the actual fork if unambiguous...
            repo = repo + '/' + context_name
        elif repo.split('/')[-1] == context_name:
            # NOTE: here we assume if it's the same repo in a different owner it's a fork
            prefix = repo.split('/')[0]
        else:
            prefix = repo

        link = Element("a")
        link.text = f"{prefix}@{commit}" if prefix else commit
        link.set("href", f"https://github.com/{repo}/commit/{commit}")
        return link


class OdooLinking(markdown.inlinepatterns.InlineProcessor):
    """Inline processor linking OPW and task references to odoo.com."""
    def __init__(self, md=None):
        # Weirder variations exist and are deliberately not supported, this
        # matches "opw", "task", "task-id" or "taskid" (case-insensitive),
        # optionally followed by a - or :, followed by digits.
        super().__init__(r"(?i)\b(task(?:-?id)?|opw)\s*[-:]?\s*(\d+)\b", md)

    def handleMatch(self, m, data):
        """Build the link for one match of the pattern.

        :returns: ``(link, start, end)`` where ``start`` and ``end`` are the
                  span of the match
        """
        kind, number = m[1], m[2]
        start, end = m.span()

        link = Element("a", href='https://www.odoo.com/web#model=project.task&id=' + number)
        # every non-opw spelling is normalised to "task-<id>"
        link.text = f"opw-{number}" if kind.lower() == 'opw' else f"task-{number}"
        return link, start, end


class Unlinker(markdown.treeprocessors.Treeprocessor):
    """Tree processor removing every link whose ``href`` does not start with
    ``http:`` or ``https:``, while keeping the content of the link in place.
    """
    def run(self, root):
        # ElementTree has no parent links, so links can't really be replaced
        # in place: look for the elements which contain a link and rebuild
        # their list of children instead
        for container in root.iterfind('.//*[a]'):
            previous = container[:]
            # can't use clear because that clears the attributes and tail/text
            del container[:]
            for node in previous:
                weird_link = node.tag == 'a' \
                    and not node.get('href', '').startswith(('https:', 'http:'))
                if weird_link:
                    self._unwrap(container, node)
                else:
                    container.append(node)

        return None

    @staticmethod
    def _attach_text(container, text):
        """Append ``text`` after whatever ``container`` currently ends with:
        the tail of its last child if it has one, its own text otherwise.
        """
        if not text:
            return
        if len(container):  # prev is not parent
            last = container[-1]
            last.tail = (last.tail or '') + text
        else:
            container.text = (container.text or '') + text

    @classmethod
    def _unwrap(cls, container, link):
        """Add the content of ``link`` (text, then children, then tail) to
        ``container`` without adding ``link`` itself.
        """
        cls._attach_text(container, link.text)
        if len(link):
            container.extend(link[:])
        # the children of the link were just added, so the tail follows them
        cls._attach_text(container, link.tail)


# Renderer shared by every ``dfm`` call.
#
# cmarkgfm might be an alternative: the maintainer of py-gfm (implemented
# over python-markdown) ultimately gave up, apparently mostly because of
# pymarkdown's tendency to break its API all the time.
dfm_renderer = markdown.Markdown(
    output_format='html5',
    extensions=[DfmExtension()],
)
[ADD] runbot_merge: rendering of PR descriptions Previously PR descriptions were displayed as raw text in the PR dashboard. While not wrong per se, this was pretty ugly and not always convenient as e.g. links had to be copied by hand. Push descriptions through pymarkdown for rendering them, with a few customisations: - Enabled footnotes & tables & fenced code blocks because GFM has that, this doesn't quite put pymarkdown's base behaviour on par with gfm (and py-gfm ultimately gave up on that effort moving to just wrap github's own markdown renderer instead). - Don't allow raw html because too much of a hassle to do it correctly, and very few people ever do it (mostly me I think). - Added a bespoke handler / renderer for github-style references. Note: uses positional captures because it started that way and named captures are not removed from that sequence so mixing and matching is not very useful, plus python does not support identically named groups (even exclusive) so all 4 repo captures and all 3 issue number captures would need different names... - And added a second bespoke handler for our own opw/issue references leading to odoo.com, that's something we can't do via github[^1] so it's a genuine value-add. Fixes #889 [^1]: github can do it (though possibly not with the arbitrary unspecified nonsense I got when I tried to list some of the reference styles, some folks need therapy), but it's not available on our plan 2024-07-11 16:53:01 +07:00			`import logging`
			`from contextvars import ContextVar`
[CHG] runbot_merge: move priority field from PR to batch Simplifies the `ready_prs` query a bit and allows it to be converted to an ORM search, by moving the priority check outside. This also allows the caller to not need to post-process the records list anywhere near the previous state of affairs. `ready_prs` now returns either the "alone" batches, or the non-alone batches, rather than mixing both into a single sequence. This requires correctly applying the search filters to not retrieve priority of batches in error or targeting other branches. 2024-02-07 21:05:33 +07:00			`from typing import Tuple`
[ADD] runbot_merge: rendering of PR descriptions Previously PR descriptions were displayed as raw text in the PR dashboard. While not wrong per se, this was pretty ugly and not always convenient as e.g. links had to be copied by hand. Push descriptions through pymarkdown for rendering them, with a few customisations: - Enabled footnotes & tables & fenced code blocks because GFM has that, this doesn't quite put pymarkdown's base behaviour on par with gfm (and py-gfm ultimately gave up on that effort moving to just wrap github's own markdown renderer instead). - Don't allow raw html because too much of a hassle to do it correctly, and very few people ever do it (mostly me I think). - Added a bespoke handler / renderer for github-style references. Note: uses positional captures because it started that way and named captures are not removed from that sequence so mixing and matching is not very useful, plus python does not support identically named groups (even exclusive) so all 4 repo captures and all 3 issue number captures would need different names... - And added a second bespoke handler for our own opw/issue references leading to odoo.com, that's something we can't do via github[^1] so it's a genuine value-add. Fixes #889 [^1]: github can do it (though possibly not with the arbitrary unspecified nonsense I got when I tried to list some of the reference styles, some folks need therapy), but it's not available on our plan 2024-07-11 16:53:01 +07:00			`from xml.etree.ElementTree import Element, tostring`

			`import markdown.inlinepatterns`
			`import markdown.treeprocessors`
			`from markupsafe import escape, Markup`
[CHG] runbot_merge: move priority field from PR to batch Simplifies the `ready_prs` query a bit and allows it to be converted to an ORM search, by moving the priority check outside. This also allows the caller to not need to post-process the records list anywhere near the previous state of affairs. `ready_prs` now returns either the "alone" batches, or the non-alone batches, rather than mixing both into a single sequence. This requires correctly applying the search filters to not retrieve priority of batches in error or targeting other branches. 2024-02-07 21:05:33 +07:00

			`def enum(model: str, field: str) -> Tuple[str, str]:`
			`n = f'{model.replace(".", "_")}_{field}_type'`
			`return n, n`
[FIX] runbot_merge: leftover direct setting of PR state Setting the PR state directly really doesn't work as it doesn't correctly save (and can get overwritten by any dependency of which there are many). This caused setting odoo/odoo#165777 in error to fail, leading to it being re-staged (and failing) repeatedly, and the PR being spammed with comments. - create a more formal helper for preventing directly setting computed functions (without an actual inverse) - replace direct state setting by setting the corresponding dependency e.g. `error` for error and `skipchecks` to force a PR to ready - add a `skipchecks` inverse to the PR so it can also set itself as reviewed, and is convenient, might be worth also adding stuff to `Batch.write` 2024-06-11 20:31:35 +07:00

			`def readonly(_):`
			`raise TypeError("Field is readonly")`
[ADD] runbot_merge: rendering of PR descriptions Previously PR descriptions were displayed as raw text in the PR dashboard. While not wrong per se, this was pretty ugly and not always convenient as e.g. links had to be copied by hand. Push descriptions through pymarkdown for rendering them, with a few customisations: - Enabled footnotes & tables & fenced code blocks because GFM has that, this doesn't quite put pymarkdown's base behaviour on par with gfm (and py-gfm ultimately gave up on that effort moving to just wrap github's own markdown renderer instead). - Don't allow raw html because too much of a hassle to do it correctly, and very few people ever do it (mostly me I think). - Added a bespoke handler / renderer for github-style references. Note: uses positional captures because it started that way and named captures are not removed from that sequence so mixing and matching is not very useful, plus python does not support identically named groups (even exclusive) so all 4 repo captures and all 3 issue number captures would need different names... - And added a second bespoke handler for our own opw/issue references leading to odoo.com, that's something we can't do via github[^1] so it's a genuine value-add. Fixes #889 [^1]: github can do it (though possibly not with the arbitrary unspecified nonsense I got when I tried to list some of the reference styles, some folks need therapy), but it's not available on our plan 2024-07-11 16:53:01 +07:00

			`DFM_CONTEXT_REPO = ContextVar("dfm_context", default="")`
			`def dfm(repository: str, text: str) -> Markup:`
			`""" Converts the input text from markup to HTML using the Odoo PR`
			`Description Rules, which are basically:`

			`- GFM`
			`- minus raw HTML (?)`
			`- + github's autolinking (https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/autolinked-references-and-urls)`
			`- + bespoke autolinking of OPW and Task links to odoo.com`
			`"""`
			`t = DFM_CONTEXT_REPO.set(repository)`
			`try:`
[FIX] runbot_merge: reset markdown renderer pymarkdown's footnotes plugin saves footnotes across invocations by default. Even if I understand the documented use case it seems wild that it's not opt-in... Anyway, disable that by resetting all internal state. Thanks rfr for the initial report that things were looking odd. 2024-10-29 19:13:59 +07:00			`dfm_renderer.reset()`
[ADD] runbot_merge: rendering of PR descriptions Previously PR descriptions were displayed as raw text in the PR dashboard. While not wrong per se, this was pretty ugly and not always convenient as e.g. links had to be copied by hand. Push descriptions through pymarkdown for rendering them, with a few customisations: - Enabled footnotes & tables & fenced code blocks because GFM has that, this doesn't quite put pymarkdown's base behaviour on par with gfm (and py-gfm ultimately gave up on that effort moving to just wrap github's own markdown renderer instead). - Don't allow raw html because too much of a hassle to do it correctly, and very few people ever do it (mostly me I think). - Added a bespoke handler / renderer for github-style references. Note: uses positional captures because it started that way and named captures are not removed from that sequence so mixing and matching is not very useful, plus python does not support identically named groups (even exclusive) so all 4 repo captures and all 3 issue number captures would need different names... - And added a second bespoke handler for our own opw/issue references leading to odoo.com, that's something we can't do via github[^1] so it's a genuine value-add. Fixes #889 [^1]: github can do it (though possibly not with the arbitrary unspecified nonsense I got when I tried to list some of the reference styles, some folks need therapy), but it's not available on our plan 2024-07-11 16:53:01 +07:00			`return Markup(dfm_renderer.convert(escape(text)))`
			`finally:`
			`DFM_CONTEXT_REPO.reset(t)`


			`class DfmExtension(markdown.extensions.Extension):`
			`def extendMarkdown(self, md):`
			`md.registerExtensions(['fenced_code', 'footnotes', 'nl2br', 'sane_lists', 'tables'], configs={})`
			`md.inlinePatterns.register(GithubLinking(md), 'githublinking', 123)`
			`md.inlinePatterns.register(OdooLinking(md), 'odoolinking', 124)`
			`# ideally the unlinker should run before the prettifier so the`
			`# prettification is done correctly, but it seems unlikely the prettifier`
			`# handles the variable nature of links correctly, and we likely want to`
			`# run after the unescaper`
			`md.treeprocessors.register(Unlinker(), "unlinker", -10)`

			`class GithubLinking(markdown.inlinepatterns.InlineProcessor):`
			`"""Aside from being very varied github links are contextual. That is,`
			`their resolution depends on the repository they're being called from`
			`(technically they also need all the information from the github backend to`
			`know the people & objects exist but we don't have that option).`

			`Context is not available to us, but we can fake it through the application`
			of contextvars: ``DFM_CONTEXT_REPO`` should contain the full name of the
			`repository this is being resolved from.`

			If ``DFM_CONTEXT_REPO`` is empty and needed, this processor emits a warning.
			`"""`
			`def __init__(self, md=None):`
			`super().__init__(r"""(?xi)`
			`(?:`
			`\bhttps://github.com/([\w\.-]+/[\w\.-]+)/(?:issues\|pull)/(\d+)(\#[\w-]+)?`
			`\| \bhttps://github.com/([\w\.-]+/[\w\.-]+)/commit/([a-f0-9]+)`
			`\| \b([\w\.-]+/[\w\.-]+)\#(\d+)`
			`\| (\bGH-\|(?:^\|(?<=\s))\#)(\d+)`
			`\| \b(?:`
			`# user@sha or user/repo@sha`
			`([\w\.-]+(?:/[\w\.-]+)?)`
			`@`
			`([0-9a-f]{7,40})`
			`)`
			`\| \b(`
			`# a sha is 7~40 hex digits but that means any million+ number matches`
			`# which is probably wrong. So ensure there's at least one letter in the`
			`# set by using a positive lookahead which looks for a sequence of at`
			`# least 0 numbers followed by a-f`
			`(?=[0-9]{0,39}?[a-f])`
			`[0-9a-f]{7,40}`
			`)`
			`)`
			`\b`
			`""", md)`

			`def handleMatch(self, m, data):`
			`ctx = DFM_CONTEXT_REPO.get()`
			`if not ctx:`
			`logging.getLogger(__name__)\`
			`.getChild("github_links")\`
			`.warning("missing context for rewriting github links, skipping")`
			`return m[0], *m.span()`

			`repo = issue = commit = None`
			`if m[2]: # full issue / PR`
			`repo = m[1]`
			`issue = m[2]`
			`elif m[5]: # long hash`
			`repo = m[4]`
			`commit = m[5]`
			`elif m[7]: # short issue with repo`
			`repo = m[6]`
			`issue = m[7]`
			`elif m[9]: # short issue without repo`
			`repo = None if m[8] == '#' else "GH"`
			`issue = m[9]`
			`elif m[11]: # medium hash`
			`repo = m[10]`
			`commit = m[11]`
			`else: # hash only`
			`commit = m[12]`

			`el = Element("a")`
			`if issue is not None:`
			`if repo == "GH":`
			`el.text = f"GH-{issue}"`
			`repo = ctx`
			`elif repo in (None, ctx):`
			`repo = ctx`
			`el.text = f"#{issue}"`
			`else:`
			`el.text = f"{repo}#{issue}"`

			`if (fragment := m[3]) and fragment.startswith('#issuecomment-'):`
			`el.text += ' (comment)'`
			`else:`
			`fragment = ''`
			`el.set('href', f"https://github.com/{repo}/issues/{issue}{fragment}")`
			`else:`
			`if repo in (None, ctx):`
			`label_repo = ""`
			`repo = ctx`
			`elif '/' not in repo: # owner-only`
			`label_repo = repo`
			`# NOTE: I assume in reality we're supposed to find the actual fork if unambiguous...`
			`repo = repo + '/' + ctx.split('/')[-1]`
			`elif repo.split('/')[-1] == ctx.split('/')[-1]:`
			`# NOTE: here we assume if it's the same repo in a different owner it's a fork`
			`label_repo = repo.split('/')[0]`
			`else:`
			`label_repo = repo`
			`el.text = f"{label_repo}@{commit}" if label_repo else commit`
			`el.set("href", f"https://github.com/{repo}/commit/{commit}")`
			`return el, *m.span()`


			`class OdooLinking(markdown.inlinepatterns.InlineProcessor):`
			`def __init__(self, md=None):`
			`# there are other weirder variations but fuck em, this matches`
			`# "opw", "task", "task-id" or "taskid" followed by an optional - or :`
			`# followed by digits`
			`super().__init__(r"(?i)\b(task(?:-?id)?|opw)\s*[-:]?\s*(\d+)\b", md)`

			`def handleMatch(self, m, data):`
			`el = Element("a", href='https://www.odoo.com/web#model=project.task&id=' + m[2])`
			`if m[1].lower() == 'opw':`
			`el.text = f"opw-{m[2]}"`
			`else:`
			`el.text = f"task-{m[2]}"`
			`return el, *m.span()`


			`class Unlinker(markdown.treeprocessors.Treeprocessor):`
			`def run(self, root):`
			`# find all elements which contain a link, as ElementTree does not have`
			`# parent links we can't really replace links in place`
			`for parent in root.iterfind('.//*[a]'):`
			`children = parent[:]`
			`# can't use clear because that clears the attributes and tail/text`
			`del parent[:]`
			`for el in children:`
			`if el.tag != 'a' or el.get('href', '').startswith(('https:', 'http:')):`
			`parent.append(el)`
			`continue`

			`# this is a weird link, remove it`

			`if el.text: # first attach its text to the previous element`
			`if len(parent): # prev is not parent`
			`parent[-1].tail = (parent[-1].tail or '') + el.text`
			`else:`
			`parent.text = (parent.text or '') + el.text`

			`if len(el): # then unpack all its children`
			`parent.extend(el[:])`

			`if el.tail: # then attach tail to previous element`
			`if len(parent): # prev is not parent`
			`parent[-1].tail = (parent[-1].tail or '') + el.tail`
			`else:`
			`parent.text = (parent.text or '') + el.tail`

			`return None`


			`# alternatively, use cmarkgfm? The maintainer of py-gfm (impl'd over`
			`# python-markdown) ultimately gave up, if apparently mostly due to pymarkdown's`
			`# tendency to break its API all the time`
			`dfm_renderer = markdown.Markdown(`
			`extensions=[DfmExtension()],`
			`output_format='html5',`
			`)`