# -*- coding: utf-8 -*-
# Part of Odoo. See LICENSE file for full copyright and licensing details.
import base64
import collections
import itertools
import logging
import random
import re
import socket
import time
import email.utils
from email.utils import getaddresses as orig_getaddresses
from urllib.parse import urlparse
import html as htmllib
import idna
import markupsafe
from lxml import etree, html
from lxml.html import clean, defs
from werkzeug import urls
from odoo.tools import misc
# Public API of this module; keep in sync with the helpers defined below.
__all__ = [
    "email_domain_extract",
    "email_domain_normalize",
    "email_normalize",
    "email_normalize_all",
    "email_split",
    "encapsulate_email",
    "formataddr",
    "html2plaintext",
    "html_normalize",
    "html_sanitize",
    "is_html_empty",
    "parse_contact_from_email",
    "plaintext2html",
    "single_email_re",
]

_logger = logging.getLogger(__name__)

# disable strict mode when present: we rely on original non-strict
# parsing, and we know that it isn't reliable, that's ok.
# cfr python/cpython@4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19
if hasattr(email.utils, 'supports_strict_parsing'):
    # Python versions exposing `supports_strict_parsing` default to strict
    # RFC parsing; wrap to keep the historical lenient behavior.
    def getaddresses(fieldvalues):
        return orig_getaddresses(fieldvalues, strict=False)
else:
    getaddresses = orig_getaddresses
#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------

# Attributes preserved by the sanitizer when `sanitize_attributes` is enabled:
# lxml's default safe set extended with 'style' and Odoo-specific data-* markers.
safe_attrs = defs.safe_attrs | frozenset(
    ['style',
     'data-o-mail-quote', 'data-o-mail-quote-node',  # quote detection
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translation-source-sha', 'data-oe-nodeid',
     'data-last-history-steps', 'data-oe-protected', 'data-embedded', 'data-embedded-editable', 'data-embedded-props', 'data-oe-version',
     'data-oe-transient-content', 'data-behavior-props', 'data-prop-name',  # legacy editor
     'data-publish', 'data-id', 'data-res_id', 'data-interval', 'data-member_id', 'data-scroll-background-ratio', 'data-view-id',
     'data-class', 'data-mimetype', 'data-original-src', 'data-original-id', 'data-gl-filter', 'data-quality', 'data-resize-width',
     'data-shape', 'data-shape-colors', 'data-file-name', 'data-original-mimetype',
     'data-mimetype-before-conversion',
     ])

# Default tag policy passed to the cleaner when `sanitize_tags` is enabled:
# - allow_tags: tags kept as-is (lxml defaults + HTML5 semantic tags + comments)
# - kill_tags: tags removed together with their content
# - remove_tags: tags removed but whose content is kept
SANITIZE_TAGS = {
    # allow new semantic HTML5 tags
    'allow_tags': defs.tags | frozenset('article bdi section header footer hgroup nav aside figure main'.split() + [etree.Comment]),
    'kill_tags': ['base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
                  'noscript', 'object', 'script', 'style', 'title'],
    'remove_tags': ['html', 'body'],
}
class _Cleaner(clean.Cleaner):
    """lxml ``Cleaner`` subclass adding two optional post-cleaning passes:
    stripping ``class`` attributes while keeping other attributes, and
    whitelisting CSS properties inside kept ``style`` attributes.
    """

    # matches "property: value" pairs inside a style attribute; quoted values
    # may contain ';' without terminating the declaration
    _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')

    # CSS properties allowed to survive `parse_style`
    # (the original list carried a duplicate 'text-decoration' entry; removed,
    # membership semantics are unchanged)
    _style_whitelist = [
        'font-size', 'font-family', 'font-weight', 'font-style', 'background-color', 'color', 'text-align',
        'line-height', 'letter-spacing', 'text-transform', 'text-decoration', 'opacity',
        'float', 'vertical-align', 'display',
        'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
        'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
        'white-space',
        # box model
        'border', 'border-color', 'border-radius', 'border-style', 'border-width', 'border-top', 'border-bottom',
        'height', 'width', 'max-width', 'min-width', 'min-height',
        # tables
        'border-collapse', 'border-spacing', 'caption-side', 'empty-cells', 'table-layout']

    _style_whitelist.extend(
        ['border-%s-%s' % (position, attribute)
         for position in ['top', 'bottom', 'left', 'right']
         for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')]
    )

    strip_classes = False        # if True, drop the 'class' attribute everywhere
    sanitize_style = False       # if True, keep only whitelisted CSS properties
    conditional_comments = True  # if True, remove MSO conditional comments

    def __call__(self, doc):
        """Run the standard lxml cleaning pass, then our extra passes."""
        super().__call__(doc)

        # if we keep attributes but still remove classes
        if not getattr(self, 'safe_attrs_only', False) and self.strip_classes:
            for el in doc.iter(tag=etree.Element):
                self.strip_class(el)

        # if we keep style attribute, sanitize them
        if not self.style and self.sanitize_style:
            for el in doc.iter(tag=etree.Element):
                self.parse_style(el)

    def strip_class(self, el):
        """Remove the ``class`` attribute of *el*, if present."""
        if el.attrib.get('class'):
            del el.attrib['class']

    def parse_style(self, el):
        """Rewrite the ``style`` attribute of *el*, keeping only whitelisted
        CSS properties; drop the attribute entirely when nothing remains."""
        attributes = el.attrib
        styling = attributes.get('style')
        if styling:
            valid_styles = collections.OrderedDict()
            styles = self._style_re.findall(styling)
            for style in styles:
                if style[0].lower() in self._style_whitelist:
                    valid_styles[style[0].lower()] = style[1]
            if valid_styles:
                el.attrib['style'] = '; '.join('%s:%s' % (key, val) for (key, val) in valid_styles.items())
            else:
                del el.attrib['style']

    def kill_conditional_comments(self, doc):
        """Override the default behavior of lxml.
        https://github.com/lxml/lxml/blob/e82c9153c4a7d505480b94c60b9a84d79d948efb/src/lxml/html/clean.py#L501-L510

        In some use cases, e.g. templates used for mass mailing,
        we send emails containing conditional comments targeting Microsoft Outlook,
        to give special styling instructions.
        https://github.com/odoo/odoo/pull/119325/files#r1301064789

        Within these conditional comments, unsanitized HTML can lie.
        However, in modern browser, these comments are considered as simple comments,
        their content is not executed.
        https://caniuse.com/sr_ie-features
        """
        if self.conditional_comments:
            super().kill_conditional_comments(doc)
def tag_quote(el):
    """Mark *el* (and, where relevant, its siblings/parent) with
    ``data-o-mail-quote`` / ``data-o-mail-quote-container`` attributes when it
    looks like quoted content or a signature produced by a known mail client
    (Gmail, Yahoo, Outlook, MS Office, Odoo), so the editor can collapse it.
    """
    def _create_new_node(tag, text, tail=None, attrs=None):
        # build a detached element carrying the given text/tail/attributes
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        if attrs:
            for key, val in attrs.items():
                new_node.set(key, val)
        return new_node

    def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
        # wrap each match of `regex` found in node.text into a child <tag>,
        # splitting the surrounding text into text/tail as needed
        text = node.text or ''
        if not re.search(regex, text):
            return
        child_node = None
        idx, node_idx = 0, 0
        for item in re.finditer(regex, text):
            new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
            if child_node is None:
                node.text = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            else:
                child_node.tail = text[idx:item.start()]
                new_node.tail = text[item.end():]
                node.insert(node_idx, new_node)
            child_node = new_node
            idx = item.end()
            node_idx = node_idx + 1

    el_class = el.get('class', '') or ''
    el_id = el.get('id', '') or ''

    # gmail or yahoo // # outlook, html // # msoffice
    # BUGFIX: the second SkyDrivePlaceholder check duplicated the el_class
    # test; it must look at the element id, matching upstream behavior.
    if 'gmail_extra' in el_class or \
            'SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_id:
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
            'yahoo_quoted' in el_class:
        # Quote all elements after this one
        el.set('data-o-mail-quote', '1')
        for sibling in el.itersiblings(preceding=False):
            sibling.set('data-o-mail-quote', '1')

    # odoo, gmail and outlook automatic signature wrapper
    is_signature_wrapper = 'odoo_signature_wrapper' in el_class or 'gmail_signature' in el_class or el_id == "Signature"
    is_outlook_auto_message = 'appendonsend' in el_id
    # gmail and outlook reply quote
    is_outlook_reply_quote = 'divRplyFwdMsg' in el_id
    is_gmail_quote = 'gmail_quote' in el_class
    is_quote_wrapper = is_signature_wrapper or is_gmail_quote or is_outlook_reply_quote
    if is_quote_wrapper:
        el.set('data-o-mail-quote-container', '1')
        el.set('data-o-mail-quote', '1')
    # outlook reply wrapper is preceded with an <hr> and a div containing recipient info
    if is_outlook_reply_quote:
        hr = el.getprevious()
        reply_quote = el.getnext()
        if hr is not None and hr.tag == 'hr':
            hr.set('data-o-mail-quote', '1')
        if reply_quote is not None:
            reply_quote.set('data-o-mail-quote-container', '1')
            reply_quote.set('data-o-mail-quote', '1')
    if is_outlook_auto_message:
        if not el.text or not el.text.strip():
            el.set('data-o-mail-quote-container', '1')
            el.set('data-o-mail-quote', '1')

    # html signature (a '--' line followed by a <br> and the signature body)
    signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
    if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    # text-based quotes (>, >>) and signatures (-- Signature)
    text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
    if not el.get('data-o-mail-quote'):
        _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})

    if el.tag == 'blockquote':
        # remove single node
        el.set('data-o-mail-quote-node', '1')
        el.set('data-o-mail-quote', '1')
    if el.getparent() is not None and not el.getparent().get('data-o-mail-quote-node'):
        if el.getparent().get('data-o-mail-quote'):
            el.set('data-o-mail-quote', '1')
        # only quoting the elements following the first quote in the container
        # avoids issues with repeated calls to html_normalize
        elif el.getparent().get('data-o-mail-quote-container'):
            if (first_sibling_quote := el.getparent().find("*[@data-o-mail-quote]")) is not None:
                siblings = el.getparent().getchildren()
                quote_index = siblings.index(first_sibling_quote)
                element_index = siblings.index(el)
                if quote_index < element_index:
                    el.set('data-o-mail-quote', '1')
            # NOTE(review): indentation was lost in SOURCE; this branch is
            # nested under the quote-container case per upstream — confirm.
            if el.getprevious() is not None and el.getprevious().get('data-o-mail-quote') and not el.text_content().strip():
                el.set('data-o-mail-quote', '1')
def html_normalize(src, filter_callback=None, output_method="html"):
    """ Normalize `src` for storage as an html field value.

    The string is parsed as an html tag soup, made valid, then decorated for
    "email quote" detection, and prepared for an optional filtering.
    The filtering step (e.g. sanitization) should be performed by the
    `filter_callback` function (to avoid multiple parsing operations, and
    normalize the result).

    :param src: the html string to normalize
    :param filter_callback: optional callable taking a single `etree._Element`
        document parameter, to be called during normalization in order to
        filter the output document
    :param output_method: defines the output method to pass to `html.tostring`.
        It defaults to 'html', but can also be 'xml' for xhtml output.
    """
    if not src:
        return src

    # html: remove encoding attribute inside tags
    src = re.sub(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', "", src, flags=re.IGNORECASE | re.DOTALL)

    # normalize degenerate comment endings/forms to harmless comments
    # (literals below restored; they had been stripped by a bad merge)
    src = src.replace('--!>', '-->')
    src = re.sub(r'(<!-->|<!--->)', '<!-- -->', src)

    # On the specific case of Outlook desktop it adds unnecessary '<o:p></o:p>' tags which are parsed
    # in '<p></p>' which may alter the appearance (eg. spacing) of the mail body
    src = re.sub(r'</?o:.*?>', '', src)

    try:
        doc = html.fromstring(src)
    except etree.ParserError as e:
        # HTML comment only string, whitespace only..
        if 'empty' in str(e):
            return ""
        raise

    # perform quote detection before cleaning and class removal
    if doc is not None:
        for el in doc.iter(tag=etree.Element):
            tag_quote(el)

    if filter_callback:
        doc = filter_callback(doc)

    src = html.tostring(doc, encoding='unicode', method=output_method)

    # this is ugly, but lxml/etree tostring want to put everything in a
    # 'div' that breaks the editor -> remove that
    # (the 5/-6 slice matches the lengths of '<div>' and '</div>')
    if src.startswith('<div>') and src.endswith('</div>'):
        src = src[5:-6]

    # html considerations so real html content match database value
    src = src.replace('\xa0', ' ')

    return src
def html_sanitize(src, silent=True, sanitize_tags=True, sanitize_attributes=False, sanitize_style=False, sanitize_form=True, sanitize_conditional_comments=True, strip_style=False, strip_classes=False, output_method="html"):
    """Sanitize an untrusted html string: normalize it, then clean it with
    ``_Cleaner`` according to the given flags.

    :param src: html string to sanitize
    :param silent: when True (default), parsing/cleaning errors are logged and
        a placeholder paragraph is returned instead of raising
    :param sanitize_tags: apply the SANITIZE_TAGS allow/kill/remove policy
    :param sanitize_attributes: keep only `safe_attrs` attributes
    :param sanitize_style: whitelist CSS properties inside style attributes
    :param sanitize_form: remove form tags
    :param sanitize_conditional_comments: remove MSO conditional comments
    :param strip_style: remove style tags/attributes entirely
    :param strip_classes: remove class attributes
    :param output_method: serialization method ('html' or 'xml')
    :rtype: markupsafe.Markup
    """
    if not src:
        return src

    logger = logging.getLogger(__name__ + '.html_sanitize')

    def sanitize_handler(doc):
        # configure and run the cleaner on the normalized document
        kwargs = {
            'page_structure': True,
            'style': strip_style,              # True = remove style tags/attrs
            'sanitize_style': sanitize_style,  # True = sanitize styling
            'forms': sanitize_form,            # True = remove form tags
            'remove_unknown_tags': False,
            'comments': False,
            'conditional_comments': sanitize_conditional_comments,  # True = remove conditional comments
            'processing_instructions': False
        }
        if sanitize_tags:
            kwargs.update(SANITIZE_TAGS)

        if sanitize_attributes:  # We keep all attributes in order to keep "style"
            if strip_classes:
                current_safe_attrs = safe_attrs - frozenset(['class'])
            else:
                current_safe_attrs = safe_attrs
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': current_safe_attrs,
            })
        else:
            kwargs.update({
                'safe_attrs_only': False,       # keep oe-data attributes + style
                'strip_classes': strip_classes, # remove classes, even when keeping other attributes
            })

        cleaner = _Cleaner(**kwargs)
        cleaner(doc)
        return doc

    try:
        sanitized = html_normalize(src, filter_callback=sanitize_handler, output_method=output_method)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        # placeholder markup restored; the <p> tags had been stripped by a bad merge
        sanitized = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        sanitized = '<p>Unknown error when sanitizing</p>'

    return markupsafe.Markup(sanitized)
# ----------------------------------------------------------
# HTML/Text management
# ----------------------------------------------------------

# protocols whose hrefs must be left alone (no http:// prefixing / rewriting)
URL_SKIP_PROTOCOL_REGEX = r'mailto:|tel:|sms:'
# href="..." attributes pointing to a fetchable url (group 2 captures the url)
URL_REGEX = rf'''(\bhref=['"](?!{URL_SKIP_PROTOCOL_REGEX})([^'"]+)['"])'''
# bare http/https urls appearing in plain text
TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?'
# retrieve inner content of the link
HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?'
# any markup tag
HTML_TAGS_REGEX = re.compile('<.*?>')
# block-level tags and raw newlines, treated as whitespace breaks
HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n')
def validate_url(url):
    """Return *url* unchanged when it carries a fetchable scheme
    (http, https, ftp, ftps); otherwise prefix it with ``http://``.

    :param str url: url to validate
    :rtype: str
    """
    # use stdlib urlparse (already imported at module level) instead of the
    # deprecated werkzeug.urls.url_parse; scheme extraction is identical
    if urlparse(url).scheme not in ('http', 'https', 'ftp', 'ftps'):
        return 'http://' + url
    return url
def is_html_empty(html_content):
    """Check if a html content is empty. If there are only formatting tags with style
    attributes or a void content return True. Famous use case is a
    '<p><br></p>' added by some web editor.

    :param str html_content: html content, coming for example from an HTML field
    :returns: bool, True if no content found or if containing only void formatting tags
    """
    if not html_content:
        return True
    # matches <i>/<span> elements carrying a FontAwesome/Open-Iconic class,
    # which count as visible content even without text
    icon_re = r'<\s*(i|span)\b(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"])?)*\s*\bclass\s*=\s*["\'][^"\']*\b(fa|fab|fad|far|oi)\b'
    # matches opening/closing formatting tags that carry no content by themselves
    tag_re = r'<\s*\/?(?:p|div|section|span|br|b|i|font)\b(?:(\s+[A-Za-z_-][A-Za-z0-9-_]*(\s*=\s*[\'"][^"\']*[\'"]))*)(?:\s*>|\s*\/\s*>)'
    return not bool(re.sub(tag_re, '', html_content).strip()) and not re.search(icon_re, html_content)
def html_keep_url(text):
    """ Transform the url into clickable link with <a/> tag """
    idx = 0
    final = ''
    # NOTE(review): this pattern had been reduced to '(?)' by a bad merge;
    # restored from upstream: match bare ftp/http/https urls not already
    # quoted or wrapped in an existing anchor — confirm against history.
    link_tags = re.compile(
        r"""(?<!["'])((ftp|http|https):\/\/(\w+:{0,1}\w*@)?([^\s<"']+)(:[0-9]+)?(\/|\/([^\s<"']))?)(?![^\s<"']*["']|[^\s<"']*</a>)"""
    )
    for item in re.finditer(link_tags, text):
        final += text[idx:item.start()]
        final += create_link(item.group(0), item.group(0))
        idx = item.end()
    final += text[idx:]
    return final
def html_to_inner_content(html):
    """Returns unformatted text after removing html tags and excessive whitespace from a
    string/Markup. Passed strings will first be sanitized.
    """
    if is_html_empty(html):
        return ''
    # plain strings are sanitized first; Markup values are trusted as-is
    markup = html if isinstance(html, markupsafe.Markup) else html_sanitize(html)
    # block-level breaks become single spaces, remaining tags are dropped
    text = HTML_NEWLINES_REGEX.sub(' ', markup)
    text = HTML_TAGS_REGEX.sub('', text)
    # collapse runs of spaces/tabs, then decode entities and trim
    text = re.sub(r' {2,}|\t', ' ', text)
    return htmllib.unescape(text).strip()
def create_link(url, label):
    """Return an html anchor for *url* showing *label*, opening in a new tab
    with safe rel attributes (no referrer leak, no window.opener access).

    NOTE(review): the anchor markup had been stripped by a bad merge, leaving
    only the label; restored the standard tag — confirm attributes upstream.
    """
    return f'<a href="{url}" target="_blank" rel="noreferrer noopener">{label}</a>'
def html2plaintext(html, body_id=None, encoding='utf-8', include_references=True):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param include_references: If False, numbered references and
        URLs for links and images will not be included.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ##
    ## download here: http://www.peterbe.com/plog/html2plaintext
    if not (html and html.strip()):
        return ''

    if isinstance(html, bytes):
        html = html.decode(encoding)
    else:
        assert isinstance(html, str), f"expected str got {html.__class__.__name__}"

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    # collect link/image targets and replace them with numbered references
    url_index = []
    linkrefs = itertools.count(1)
    if include_references:
        for link in tree.findall('.//a'):
            if url := link.get('href'):
                link.tag = 'span'
                link.text = f'{link.text} [{next(linkrefs)}]'
                url_index.append(url)
        for img in tree.findall('.//img'):
            if src := img.get('src'):
                img.tag = 'span'
                img_name = re.search(r'[^/]+(?=\.[a-zA-Z]+(?:\?|$))', src)
                img.text = '%s [%s]' % (img_name[0] if img_name else 'Image', next(linkrefs))
                url_index.append(src)

    html = etree.tostring(tree, encoding="unicode")
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    # map emphasis/heading tags to lightweight text markers, then drop every
    # remaining tag and decode the basic entities
    # (tag literals below restored; they had been stripped by a bad merge)
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    html = html.replace('&nbsp;', '\N{NO-BREAK SPACE}')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    if url_index:
        html += '\n\n'
        for i, url in enumerate(url_index, start=1):
            html += f'[{i}] {url}\n'

    return html.strip()
def plaintext2html(text, container_tag=None):
    r"""Convert plaintext into html. Content of the text is escaped to manage
    html entities, using :func:`~odoo.tools.misc.html_escape`.

    - all ``\n``, ``\r`` are replaced by ``<br/>``
    - enclose content into ``<p>``
    - convert url into clickable link
    - 2 or more consecutive ``<br/>`` are considered as paragraph breaks

    :param str text: plaintext to convert
    :param str container_tag: container of the html; by default the content is
        wrapped in paragraph tags only
    :rtype: markupsafe.Markup
    """
    assert isinstance(text, str)
    text = misc.html_escape(text)

    # 1. replace \n and \r
    # (the '<br/>' literal had been stripped by a bad merge; restored)
    text = re.sub(r'(\r\n|\r|\n)', '<br/>', text)

    # 2. clickable links
    text = html_keep_url(text)

    # 3-4: form paragraphs; runs of 2+ <br> tags split paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 5. container
    if container_tag:  # FIXME: validate that container_tag is just a simple tag?
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return markupsafe.Markup(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None):
""" Append extra content at the end of an HTML snippet, trying
to locate the end of the HTML document (,