65 lines
2.5 KiB
Python
65 lines
2.5 KiB
Python
# Part of Odoo. See LICENSE file for full copyright and licensing details.
|
|
|
|
import re
|
|
from typing import Iterable
|
|
|
|
import lxml
|
|
|
|
MAX_LABEL_LENGTH = 40  # arbitrary cap on the number of characters kept in a link label
|
|
|
|
|
|
def find_links_with_urls_and_labels(root_node, base_url, skip_regex=None, skip_prefix=None, skip_list=None):
    """Return lxml link nodes and respective matching urls (made absolute) and labels found in `root_node`.

    Every ``<a>`` element with a non-empty ``href`` is considered. Hrefs starting
    with ``/``, ``?`` or ``#`` are prefixed with `base_url`; any other href is
    kept as-is. Links whose resulting url matches one of the skip rules are
    left out of both returned lists.

    The label is the stripped leading text of the link or, when there is none,
    whatever `_get_label_from_elements` extracts from the link's children. It is
    truncated to `MAX_LABEL_LENGTH` characters and may be empty.

    :param lxml.etree._Element root_node: The root node to process
    :param str base_url: base url to prefix relative hrefs
    :param str skip_regex: URL pattern to skip (searched anywhere in the url)
    :param str skip_prefix: str prefix to skip
    :param Iterable[str] skip_list: URLS to skip (matched as substrings of the url)

    :rtype: (list[lxml.etree._Element], list[dict])
    :return: the kept link nodes and, at the same index, a dict with the keys
        ``url`` and ``label``
    """
    # Loop invariants: compile the pattern once, and materialize `skip_list` so
    # that a one-shot iterable is not exhausted after the first link.
    skip_pattern = re.compile(skip_regex) if skip_regex else None
    skip_fragments = tuple(skip_list) if skip_list else ()

    link_nodes, urls_and_labels = [], []

    for link_node in root_node.iter(tag="a"):
        original_url = link_node.get("href")
        if not original_url:
            continue

        # NOTE(review): scheme-relative hrefs ("//host/path") also start with
        # "/" and therefore get `base_url` prepended - confirm this is intended.
        if original_url.startswith(('/', '?', '#')):
            absolute_url = base_url + original_url
        else:
            absolute_url = original_url

        if (
            (skip_pattern is not None and skip_pattern.search(absolute_url))
            or (skip_prefix and absolute_url.startswith(skip_prefix))
            or any(fragment in absolute_url for fragment in skip_fragments)
        ):
            continue

        if link_node.text and (stripped_text := link_node.text.strip()):
            label = stripped_text
        else:
            # `getchildren()` is deprecated in lxml; `list(element)` is the
            # documented replacement and returns the same children.
            label = _get_label_from_elements(list(link_node))

        link_nodes.append(link_node)
        urls_and_labels.append({'url': absolute_url, 'label': label[:MAX_LABEL_LENGTH]})

    return link_nodes, urls_and_labels
|
|
|
|
|
|
def _get_label_from_elements(elements: Iterable["lxml.etree._Element"], image_prefix: str = "[media] ") -> str:
    """Return the first label that can be extracted from a collection of elements.

    Elements are examined in order:

    * the first ``<img>`` encountered decides the result: `image_prefix`
      followed by its ``alt`` text or, failing that, by the last ``/``-separated
      segment of its ``src``; an image with neither yields an empty label;
    * ``<p class="o_outlook_hack">`` wrappers are searched recursively, and the
      search moves on to the next element when they contain no label;
    * anything else, comments included, is skipped.

    :param elements: nodes to inspect, in document order
    :param image_prefix: text prepended to a label derived from an image
    :return: the label, or an empty string if none could be extracted
    """
    for element in elements:
        # Comments (a known "hack") and processing instructions expose a
        # factory function as `tag` instead of a string; they never carry a
        # label. Checking the tag type avoids relying on `lxml.html` having
        # been imported elsewhere.
        if not isinstance(element.tag, str):
            continue

        if element.tag == "img":
            if img_alt := element.get("alt"):
                return f"{image_prefix}{img_alt}"
            if img_src := element.get("src"):
                img_src_tail = img_src.split("/")[-1]
                return f"{image_prefix}{img_src_tail}"
            # An image without usable attributes still ends the search.
            return ""

        if element.tag == "p" and element.get("class") == "o_outlook_hack":
            # Iterating an element yields its children (`getchildren()` is
            # deprecated). Forward `image_prefix` so that a custom prefix also
            # applies to images nested inside the wrapper.
            if label := _get_label_from_elements(element, image_prefix):
                return label

    return ""
|