import re from bisect import bisect_left from copy import deepcopy from itertools import chain from typing import Literal, Optional from lxml import etree ## SOME XML UTILS INDENT_RE = re.compile(r"(^\n[\s\t]*)|(\n[\s\t]*$)") def append_text(node, text): if len(node) > 0: node[-1].tail = text else: node.text = text def dedent_tree(elem): for el in elem.iter(etree.Element): if el.text is not None: el.text = INDENT_RE.sub("", el.text) or None if el.tail is not None: el.tail = INDENT_RE.sub("", el.tail) or None def indent_tree(elem, level=1, spaces=2): """ The lxml library doesn't pretty_print xml tails, this method aims to solve this. Returns the elem with properly indented text and tail """ # See: http://lxml.de/FAQ.html#why-doesn-t-the-pretty-print-option-reformat-my-xml-output # Below code is inspired by http://effbot.org/zone/element-lib.htm#prettyprint indent_texts = elem.tag == "xpath" len_elem = len(elem) if len_elem: i = "\n" + level * spaces * " " prev_i = "\n" + (level - 1) * spaces * " " if indent_texts or (not elem.text or not elem.text.strip()): text = elem.text elem.text = (text.strip() + i) if text else i index = 0 while (index < len_elem): subelem = elem[index] tail = (subelem.tail or "").strip() if indent_texts or not tail: if index == len_elem - 1: subelem.tail = (i + tail + prev_i) if tail else prev_i else: subelem.tail = (i + tail + i) if tail else i indent_tree(subelem, level + 1, spaces) index += 1 return elem def visit(node, do_children: lambda n: True): yield node if do_children(node) is False: return for child in node.iterchildren(etree.Element): yield from visit(child, do_children) ## TOOLS TO MAKE THE DIFF def diff_dicts(old: etree._Attrib, new: etree._Attrib, ignored_keys: Optional[set] = frozenset()) -> dict: return { k: new.get(k) for k in sorted(set().union(old, new)) if k not in ignored_keys and old.get(k) != new.get(k) } def longest_increasing_subsequence(arr): """Returns the longest increasing subsequence of a list of unordered values Largely inspired from https://en.wikipedia.org/wiki/Longest_increasing_subsequence and by https://cp-algorithms.com/sequences/longest_increasing_subsequence.html It compares items on their value: in the case items are string: "11" < "2" == True. So, transforming an item to its value if necessary should be done before hand As it returns the biggest list of stable elements in the list, it is useful to compute the least amount of moving items into that list eg: [3,1,2] : 1 and 2 did not move, 3 is just placed before 1 longest_increasing_subsequence = [1,2] """ if not arr: return [] previous = {} first, *list_arr = arr smallest_endings = [first] for el in list_arr: if el < smallest_endings[-1]: target_index = bisect_left(smallest_endings, el) previous[el] = smallest_endings[target_index - 1] smallest_endings[target_index] = el else: previous[el] = smallest_endings[-1] smallest_endings.append(el) sequence = [] el = smallest_endings[-1] for _el in smallest_endings: sequence.append(el) el = previous.get(el) return sequence def _get_node_boundary_text(node: etree._Element, position: Literal["before", "after", "inside"]) -> str | None: """Given a node and a position, returns the text that will be impacted according to Odoo's xpath semantics see template_inheritance.py:add_stripped_items_before """ if position == "before": prev = node.getprevious() if prev is not None: return prev.tail else: return node.getparent().text if position == "after": return node.tail if position == "inside": if len(node) > 0: return node[-1].tail else: return node.text def _group_leafs(leafs: list, l_i_s: set): """Group the leafs according to their reference node and position. Similar to itertools.groupy, except it needed to be "read ahead" as the target of a leaf can be before any stable node (the ones in the l_i_s set) :param: set l_i_s: the longest increasing subsequence indicating stable node's ids """ last_stable = None no_targets = [] current_group = [] current_key = None for leaf in leafs: if leaf.get("id") in l_i_s: last_stable = leaf["id"] if no_targets: yield (last_stable, "before"), no_targets no_targets = [] elif last_stable: new_key = (last_stable, "after") if new_key == current_key: current_group.append(leaf) else: if current_key and current_group: yield current_key, current_group current_group = [leaf] current_key = new_key else: no_targets.append(leaf) if current_group: yield current_key, current_group if no_targets: yield (last_stable, "inside"), no_targets DIFF_ATTRIBUTE = "o-diff-key" class KeyedXmlDiffer: """A class that allows to compute the difference between two trees, of which we know one is a modification of the other. Namely, both trees have nodes that have a unique ID, each node in the new tree is compared to its counterpart in the old one. Hence the recommended flow: - assign ids on the original tree - ids must be convertible to int, and increasing with the tree's order (depth first) - do some operation on that modified tree - compare the original tree with the modified one It supports moving, removing nodes, altering texts, altering the order of a node's children, modifying attributes of a node It doesn't support changing the tag name of a node It doesn't support anything else than Elements, in particular, comments and their tail will be ignored The `diff` method returns an abstraction describing what happened for a node with a given id The `diff_xpath` method computes the Odoo's xpath notation to be used as an inherited view The expected complexity is O(n log n), because of the use of bisect. It could be higher when we compute the xpath for each touched nodes. We still have to browse several times the trees. """ @classmethod def assign_node_ids_for_diff(self, tree): for index, desc in enumerate(tree.iter(etree.Element)): desc.set(DIFF_ATTRIBUTE, str(index)) def __init__(self, ignore_attributes=None, on_new_node=lambda n: True, is_subtree=lambda n: False, xpath_with_meta=False, ): # User-defined parameters self.ignore_attributes = set([] if ignore_attributes is None else ignore_attributes) self.__on_new_node = on_new_node self.attributes_identifiers = { "id": True, "name": True, "t-name": True, "t-call": True, "t-field": True, "t-set": True, } self.is_subtree = is_subtree self.xpath_with_meta = xpath_with_meta # Internal State self.changes = {} self.map_id_to_node_old = {} def _build_tree_from_input(self, diff_input): if isinstance(diff_input, (etree._ElementTree, etree._Element)): return deepcopy(diff_input) else: parser = etree.XMLParser(remove_blank_text=True, resolve_entities=False) return etree.fromstring(diff_input, parser=parser) # Basic diffing abstraction def diff(self, old, new): old_tree = self._build_tree_from_input(old) new_tree = self._build_tree_from_input(new) self.map_id_to_node_old = {node.get(DIFF_ATTRIBUTE): node for node in old_tree.iter(etree.Element)} self._diff_nodes(self.map_id_to_node_old[new_tree.get(DIFF_ATTRIBUTE)], new_tree) return self.changes def _diff_nodes(self, old, new): main_node_id = old.get(DIFF_ATTRIBUTE) local_map_id_to_node = {} old_repr = [(None, old.text)] # is used to check if there have been any changes (without attributes) for index, child in enumerate(old.iterchildren(etree.Element)): nid = child.get(DIFF_ATTRIBUTE) old_repr.append((nid, child.tail)) local_map_id_to_node[nid] = (index, child) leafs = [] # represents children and texts. They will be grouped according to their nearest unmoving sibling new_repr = [(None, new.text)] # is used to check if there have been any changes (without attributes) kept_nodes = [] # nodes' id that are still present children_to_diff = {} # a map id to new node to continue the iteration of the tree acquired_nodes = set() # nodes' ids that are coming from elsewhere in the tree removed_nodes = set(local_map_id_to_node) # children of the main node that will be removed leafs.append({"type": "text", "element": INDENT_RE.sub("", new.text) if new.text else new.text}) for child in new.iterchildren(etree.Element): nid = child.get(DIFF_ATTRIBUTE) is_owned = nid in local_map_id_to_node new_repr.append((nid or False, child.tail)) if nid: children_to_diff[nid] = child if is_owned: kept_nodes.append(int(nid)) removed_nodes.remove(nid) else: acquired_nodes.add(nid) else: for new_child in self._visit_new_node(child): _nid = new_child.get(DIFF_ATTRIBUTE) if _nid: acquired_nodes.add(_nid) children_to_diff[_nid] = new_child leafs.extend([ {"type": "node", "owned": is_owned, "element": child, "id": nid}, {"type": "text", "element": INDENT_RE.sub("", child.tail) if child.tail else child.tail} ]) attributes_changes = diff_dicts(old.attrib, new.attrib, self.ignore_attributes) has_body_changes = old_repr != new_repr if not has_body_changes and not attributes_changes: for nid, new_child in children_to_diff.items(): self._diff_nodes(self.map_id_to_node_old[nid], new_child) return command = {"attributes": attributes_changes, "new_node": new, "leafs": leafs} if has_body_changes: l_i_s = {str(k) for k in longest_increasing_subsequence(kept_nodes)} children_changes = [] replace_text = False for (target_id, position), grouped in _group_leafs(leafs, l_i_s): target_id = target_id or main_node_id target_node = self.map_id_to_node_old[target_id] old_text = _get_node_boundary_text(target_node, position) old_text = old_text and INDENT_RE.sub("", old_text) if position == "after": text_leaf = grouped[-1] elif position in ("before", "inside"): text_leaf = grouped[0] are_texts_compatible = False if text_leaf["element"] == old_text: are_texts_compatible = True text_leaf["ignore"] = True children_changes.append({ "target_id": target_id, "position": position, "replace_text": not are_texts_compatible, "leafs": grouped, }) if not are_texts_compatible: replace_text = True command.update( removed_nodes=removed_nodes, replace_text=replace_text, children_changes=children_changes, acquired_nodes=acquired_nodes ) self.changes[new.get(DIFF_ATTRIBUTE)] = command for nid, new_child in children_to_diff.items(): self._diff_nodes(self.map_id_to_node_old[nid], new_child) # Methods that concern the building of the Odoo's xpath semantic tree def diff_xpath(self, old: etree._Element | str, new: etree._Element | str) -> str: changes = self.diff(old, new) all_removed = set() all_moved = set() # gather nodes that are moving around for change in changes.values(): if "removed_nodes" in change: for rm_id in change["removed_nodes"]: if rm_id not in all_moved: all_removed.add(rm_id) if "acquired_nodes" in change: for moved_id in change["acquired_nodes"]: all_removed.discard(moved_id) all_moved.add(moved_id) delayed_removed = set() traversed = set() for nid in all_moved: node = self.map_id_to_node_old[nid] for ancestor in node.iterancestors(etree.Element): ancestor_id = ancestor.get(DIFF_ATTRIBUTE) if ancestor_id in traversed: break else: traversed.add(ancestor_id) if ancestor_id in all_removed: delayed_removed.add(ancestor_id) all_removed.discard(ancestor_id) break diff_as_arch = etree.Element("data") for main_id, change in changes.items(): if main_id in all_removed: continue # process changes: for changes that touch the children nodes: # apply the changes onto the old tree directly to have correct dynamic xpaths main_node = self.map_id_to_node_old[main_id] main_expr = self._get_xpath(main_node) xpath_changes = etree.Element("data") if change.get("replace_text"): self._handle_node_full_replace(main_id, main_expr, change, xpath_changes) diff_as_arch.append(xpath_changes) continue ## 1. Remove nodes first, that way browsing that subtree will be easier if change.get("removed_nodes"): for cnid in change["removed_nodes"]: if cnid not in all_removed: continue rm_node = self.map_id_to_node_old[cnid] expr = main_expr + self._get_node_xpath(rm_node) xpath_rm = self._make_xpath_node(rm_node, position="replace", expr=expr) xpath_changes.append(xpath_rm) main_node.remove(rm_node) ## 2. Apply changes that touch the children: their order, the inner moves, ## the acquisition of nodes from outside (outer moves) holes = [] if change.get("children_changes"): for children_change in change["children_changes"]: target_node = self.map_id_to_node_old[children_change["target_id"]] position = children_change["position"] if position == "inside": xpath_expr = main_expr else: xpath_expr = main_expr + self._get_node_xpath(target_node) xpath_node = self._make_xpath_node(target_node, position=position, expr=xpath_expr) if children_change["replace_text"]: xpath_node.set("replace_text", "true") has_changes = False for leaf in children_change["leafs"]: if leaf.get("ignore"): continue has_changes = True if leaf["type"] == "text": new_text = leaf["element"] append_text(xpath_node, new_text) else: recompute_main_expr = False if leaf.get("id"): # If a leaf has an id its is one of two cases: # 1. it is "owned": the leaf represents a node that was originally in the same parent # In this case, we don't need to compute the whole xpath for the node, we just need # to concatenate the parent's xpath with the current position of the node. # 2. it is not "owned": the node comes from elsewhere in the tree, in this case we need # to compute the full node's xpath to_apply_on_old = moved_node = self.map_id_to_node_old[leaf["id"]] if leaf["owned"]: move_expr = main_expr + self._get_node_xpath(moved_node) else: recompute_main_expr = int(leaf["id"]) < int(main_id) move_expr = self._get_xpath(moved_node) to_push_in_xpath = self._make_xpath_node(moved_node, position="move", expr=move_expr) else: new_element = deepcopy(leaf["element"]) for new_node in self._visit_new_node_collect_holes(new_element): if new_node.tag == "o-diff-hole": holes.append(new_node) else: self._on_new_node(new_node) to_apply_on_old = new_element to_push_in_xpath = deepcopy(new_element) to_push_in_xpath.tail = None xpath_node.append(to_push_in_xpath) if position == "inside": target_node.append(to_apply_on_old) elif position == "after": target_node.addnext(to_apply_on_old) elif position == "before": target_node.addprevious(to_apply_on_old) if recompute_main_expr: main_expr = self._get_xpath(main_node) if has_changes: xpath_changes.append(xpath_node) ## Replace the holes that were made to move nodes into new elements self._handle_holes_replace(main_expr, holes, xpath_changes) ## 3. Make the changes onto the attributes. It is last because the node's identifiers ## and consequently the node's xpath may be affected if change["attributes"]: xpath_attrs = self._make_xpath_node(main_node, position="attributes", expr=main_expr) xpath_changes.append(xpath_attrs) for key, value in change["attributes"].items(): attr_node = etree.Element("attribute", name=key) attr_node.text = value xpath_attrs.append(attr_node) ## Only commit the changes into the result if there is something if len(xpath_changes) > 0: diff_as_arch.append(xpath_changes) if delayed_removed: delayed_data = etree.SubElement(diff_as_arch, "data") for rm_id in delayed_removed: node = self.map_id_to_node_old[rm_id] xpath_element = self._make_xpath_node(node, position="replace", expr=self._get_xpath(node)) delayed_data.append(xpath_element) node.getparent().remove(node) indent_tree(diff_as_arch) return etree.tostring(diff_as_arch) def _get_identifiers(self, node): node_attrib = node.attrib identifiers = {} for attr in self.attributes_identifiers: if not attr in node_attrib: continue identifiers[attr] = node.get(attr) return identifiers def _get_descendants_axis_xpath(self, node: etree._Element, subtree: etree._Element | None = None) -> str: """Computes the xpath for `node` in terms of the descendants axis eg: [subtree]//[node's identification] If more than one node is found, the function returns an empty string subtree is a reference node for which we can compute the xpath separately """ xpath_template = "//%s[@%s='%s']" if subtree is None: subtree = node.getroottree() is_subtree_element = False else: is_subtree_element = True tag = node.tag identifiers = self._get_identifiers(node) for name, value in identifiers.items(): xpath_from_subtree = xpath_template % (tag, name, value) found = subtree.xpath("." + xpath_from_subtree) if found is not None and len(found) == 1: if is_subtree_element: return self._get_xpath(subtree) + xpath_from_subtree return "." + xpath_from_subtree return "" def _get_children_axis_xpath(self, node: etree._Element, ancestors: list[etree._Element], subtree: etree._Element | None = None) -> str: """Computes the xpath of `node` in terms of direct children hierarchy eg: /form/div/notebook ancestors is in opposite order (bottom-up) subtree is a reference node for which we can compute the xpath separately """ if subtree is not None: xpath = self._get_xpath(subtree) else: xpath = "" for node in chain(reversed(ancestors), [node]): xpath += self._get_node_xpath(node) return xpath def _get_subtree_and_ancestors(self, node: etree._Element) -> tuple[etree._Element | None, list[etree._Element]]: """For a node, returns its subtree parent (a relevant parent node that indicates a tree that could be separate) eg: