"""
|
|
Database Population via Duplication
|
|
|
|
This tool provides utilities to duplicate records across models in Odoo, while maintaining referential integrity,
|
|
handling field variations, and optimizing insertion performance. The duplication is controlled by a `factors` argument
|
|
that specifies how many times each record should be duplicated. The duplication process takes into account fields
|
|
that require unique constraints, distributed values (e.g., date fields), and relational fields (e.g., Many2one, Many2many).
|
|
|
|
Key Features:
|
|
-------------
|
|
1. **Field Variations**: Handles variations for certain fields to ensure uniqueness or to distribute values, such as:
|
|
- Char/Text fields: Appends a postfix or variation to existing data.
|
|
- Date/Datetime fields: Distributes dates within a specified range.
|
|
|
|
2. **Efficient Duplication**: Optimizes the duplication process by:
|
|
- Dropping and restoring indexes to speed up bulk inserts.
|
|
- Disabling foreign key constraint checks during duplication to avoid integrity errors.
|
|
- Dynamically adjusting sequences to maintain consistency in auto-increment fields like `id`.
|
|
|
|
3. **Field-Specific Logic**:
|
|
- Unique fields are identified and modified to avoid constraint violations.
|
|
- Many2one fields are remapped to newly duplicated records.
|
|
- One2many and Many2many relationships are handled by duplicating both sides of the relationship.
|
|
|
|
4. **Foreign Key and Index Management**:
|
|
- Indexes are temporarily dropped during record creation and restored afterward to improve performance.
|
|
- Foreign key checks are disabled temporarily to prevent constraint violations during record insertion.
|
|
|
|
5. **Dependency Management**: Ensures proper population order of models with dependencies (e.g., `_inherits` fields)
|
|
by resolving dependencies before duplicating records.
|
|
|
|
6. **Dynamic SQL Generation**: Uses SQL queries to manipulate and duplicate data directly at the database level,
|
|
ensuring performance and flexibility in handling large datasets.
|
|
"""

import logging
from collections import defaultdict
from contextlib import contextmanager
from datetime import datetime

from dateutil.relativedelta import relativedelta
from psycopg2.errors import InsufficientPrivilege

from odoo.api import Environment
from odoo.fields import Field, Many2one
from odoo.models import Model
from odoo.tools.sql import SQL


_logger = logging.getLogger(__name__)

# Min/Max value for a date/datetime field
MIN_DATETIME = datetime((datetime.now() - relativedelta(years=4)).year, 1, 1)
MAX_DATETIME = datetime.now()


def get_field_variation_date(model: Model, field: Field, factor: int, series_alias: str) -> SQL:
    """
    Distribute the duplication series evenly within [field - total_days, field].
    We use a hard limit of (MAX_DATETIME - MIN_DATETIME) days in the past to avoid
    setting duplicated records too far back in the past.
    """
    total_days = min((MAX_DATETIME - MIN_DATETIME).days, factor)
    cast_type = SQL(field._column_type[1])

    def redistribute(value):
        return SQL(
            "(%(value)s - (%(factor)s - %(series_alias)s) * (%(total_days)s::float/%(factor)s) * interval '1 days')::%(cast_type)s",
            value=value,
            factor=factor,
            series_alias=SQL.identifier(series_alias),
            total_days=total_days,
            cast_type=cast_type,
        )

    if not field.company_dependent:
        return redistribute(SQL.identifier(field.name))
    # company_dependent -> jsonb
    return SQL(
        '(SELECT jsonb_object_agg(key, %(expr)s) FROM jsonb_each_text(%(field)s))',
        expr=redistribute(SQL('value::%s', cast_type)),
        field=SQL.identifier(field.name),
    )

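
# Illustrative sketch (added commentary, not part of the original tool): for a
# non-company-dependent 'date' field with factor=4 and series alias 's',
# redistribute() emits SQL equivalent to
#     (my_date - (4 - s) * (4::float / 4) * interval '1 days')::date
# so the copies s = 1..4 land 3, 2, 1 and 0 days before the source value,
# spreading duplicates over at most (MAX_DATETIME - MIN_DATETIME) days.

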
def get_field_variation_char(field: Field, postfix: str | SQL | None = None) -> SQL:
    """
    Append the `postfix` string to a char|text field.
    If no postfix is provided, return the field without variation.
    """
    if postfix is None:
        return SQL.identifier(field.name)
    if not isinstance(postfix, SQL):
        postfix = SQL.identifier(postfix)
    # if the field is translatable, it's a JSONB column, so we vary the values of all its keys
    if field.translate:
        return SQL("""(
            SELECT jsonb_object_agg(key, value || %(postfix)s)
            FROM jsonb_each_text(%(field)s)
        )""", field=SQL.identifier(field.name), postfix=postfix)
    else:
        # no postfix for fields that are '' (no point to it)
        # or '/' (the default/draft name of many models' records)
        return SQL("""
            CASE
                WHEN %(field)s IS NULL OR %(field)s IN ('/', '')
                THEN %(field)s
                ELSE %(field)s || %(postfix)s
            END
        """, field=SQL.identifier(field.name), postfix=postfix)

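
# Illustrative sketch (added commentary): for a plain 'name' column varied with
# the series alias 's', the CASE expression above concatenates the series
# number onto every copy, e.g. 'Chair' -> 'Chair1', 'Chair2', ... while NULL,
# '' and '/' names are copied unchanged.

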
class PopulateContext:
    def __init__(self):
        self.has_session_replication_role = True

    @contextmanager
    def ignore_indexes(self, model: Model):
        """
        Temporarily drop indexes on the model's table to speed up insertion.
        Primary key and unique indexes are kept to enforce constraints.
        """
        indexes = model.env.execute_query_dict(SQL("""
            SELECT indexname AS name, indexdef AS definition
            FROM pg_indexes
            WHERE tablename = %s
            AND indexname NOT LIKE %s
            AND indexdef NOT LIKE %s
        """, model._table, '%pkey', '%UNIQUE%'))
        if indexes:
            _logger.info('Dropping indexes on table %s...', model._table)
            for index in indexes:
                model.env.cr.execute(SQL('DROP INDEX %s CASCADE', SQL.identifier(index['name'])))
            yield
            _logger.info('Adding indexes back on table %s...', model._table)
            for index in indexes:
                model.env.cr.execute(index['definition'])
        else:
            yield

    @contextmanager
    def ignore_fkey_constraints(self, model: Model):
        """
        Disable foreign key constraint checks by setting the session replication role to replica.
        """
        if self.has_session_replication_role:
            try:
                model.env.cr.execute('SET session_replication_role TO replica')
                yield
                model.env.cr.execute('RESET session_replication_role')
            except InsufficientPrivilege:
                _logger.warning("Cannot ignore Fkey constraints during insertion due to insufficient privileges for the current pg_role. "
                                "Resetting the transaction and retrying to populate without dropping the check on Fkey constraints. "
                                "The bulk insertion will be vastly slower than anticipated.")
                model.env.cr.rollback()
                self.has_session_replication_role = False
                yield
        else:
            yield

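
# Illustrative usage sketch (added commentary, assuming a `model` bound to a
# live cursor): both context managers are meant to wrap the bulk INSERT, e.g.
#
#     ctx = PopulateContext()
#     with ctx.ignore_fkey_constraints(model), ctx.ignore_indexes(model):
#         ...  # bulk INSERT ... SELECT here
#
# Replica mode and the dropped indexes only last for the duration of the
# `with` block; the indexes are rebuilt once, after all rows are inserted.

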
def field_needs_variation(model: Model, field: Field) -> bool:
    """
    Return whether the field needs to be varied.
    Variation might be necessary in the case of:
    - unique constraints
    - varying dates for better distribution
    - the field being part of _rec_names_search, therefore variety is needed for effective searches
    - the field having a trigram index on it
    """
    def is_unique(model_, field_):
        """
        A unique constraint is enforced by Postgres as a unique index,
        whether it's defined as a constraint on the table or as a manual unique index.
        Both types of constraint are present in the index catalog.
        """
        query = SQL("""
            SELECT EXISTS(SELECT 1
                          FROM pg_index idx
                          JOIN pg_class t ON t.oid = idx.indrelid
                          JOIN pg_class i ON i.oid = idx.indexrelid
                          JOIN pg_attribute a ON a.attnum = ANY (idx.indkey) AND a.attrelid = t.oid
                          WHERE t.relname = %s -- tablename
                          AND a.attname = %s -- column
                          AND idx.indisunique = TRUE) AS is_unique;
        """, model_._table, field_.name)
        return model_.env.execute_query(query)[0][0]

    # Many2one fields are not considered, as a name_search would resolve them to the _rec_names_search of the related model
    in_names_search = model._rec_names_search and field.name in model._rec_names_search
    in_name = model._rec_name and field.name == model._rec_name
    if (in_name or in_names_search) and field.type != 'many2one':
        return True
    if field.type in ('date', 'datetime'):
        return True
    if field.index == 'trigram':
        return True
    return is_unique(model, field)

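
# Illustrative sketch (added commentary): on a hypothetical model with
# _rec_name = 'name', a unique 'code' column and a 'date_order' field, the
# checks above flag 'name' (searchable), 'code' (unique index) and
# 'date_order' (date distribution) for variation, while a plain 'note' text
# field is copied verbatim.

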
def get_field_variation(model: Model, field: Field, factor: int, series_alias: str) -> SQL:
    """
    Return a variation of the source field,
    to avoid unique constraint violations, or to better distribute data.

    :return: a SQL(identifier|expression|subquery)
    """
    match field.type:
        case 'char' | 'text':
            return get_field_variation_char(field, postfix=series_alias)
        case 'date' | 'datetime':
            return get_field_variation_date(model, field, factor, series_alias)
        case 'html':
            # For the sake of simplicity we don't vary html fields
            return SQL.identifier(field.name)
        case _:
            _logger.warning("The field %s of type %s was marked to be varied, "
                            "but no variation branch was found! Defaulting to a raw copy.", field, field.type)
            # fall back on a raw copy
            return SQL.identifier(field.name)

def fetch_last_id(model: Model) -> int:
    query = SQL('SELECT id FROM %s ORDER BY id DESC LIMIT 1', SQL.identifier(model._table))
    return model.env.execute_query(query)[0][0]

def populate_field(model: Model, field: Field, populated: dict[Model, int], factors: dict[Model, int],
                   table_alias: str = 't', series_alias: str = 's') -> SQL | None:
    """
    Return the source expression for copying the field (SQL(identifier|expression|subquery) | None).
    `table_alias` and `series_alias` are the identifiers used to reference
    the table currently being populated and its series, respectively.
    """
    def copy_noop():
        return None

    def copy_raw(field_):
        return SQL.identifier(field_.name)

    def copy(field_):
        if field_needs_variation(model, field_):
            return get_field_variation(model, field_, factors[model], series_alias)
        else:
            return copy_raw(field_)

    def copy_id():
        last_id = fetch_last_id(model)
        populated[model] = last_id  # this adds the model to the populated dict
        return SQL('id + %(last_id)s * %(series_alias)s', last_id=last_id, series_alias=SQL.identifier(series_alias))

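    # Illustrative sketch (added commentary): with 10 existing rows
    # (last_id=10) and factor=3, copy_id() yields ids 11..20 for series s=1,
    # 21..30 for s=2 and 31..40 for s=3, i.e. new_id = id + 10 * s, which
    # keeps every copy collision-free and the old-id -> new-id mapping
    # deterministic.
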
    def copy_many2one(field_):
        # if the comodel was previously populated, remap the many2one to the new copies
        if (comodel := model.env[field_.comodel_name]) in populated:
            comodel_max_id = populated[comodel]
            # we use MOD() instead of %, because % cannot be correctly escaped; it's a limitation of the SQL wrapper
            return SQL("%(table_alias)s.%(field_name)s + %(comodel_max_id)s * (MOD(%(series_alias)s - 1, %(factor)s) + 1)",
                       table_alias=SQL.identifier(table_alias),
                       field_name=SQL.identifier(field_.name),
                       comodel_max_id=comodel_max_id,
                       series_alias=SQL.identifier(series_alias),
                       factor=factors[comodel])
        return copy(field_)

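    # Illustrative sketch (added commentary): if the comodel had max id 10 and
    # was populated with factor=2, a many2one value of 7 is remapped to
    # 7 + 10 * (MOD(s - 1, 2) + 1), i.e. 17 for odd series and 27 for even
    # series, cycling the copies over the comodel's duplicated batches.
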
    if field.name == 'id':
        return copy_id()
    match field.type:
        case 'one2many':
            # there is nothing to copy, as its value is implicitly read from the inverse Many2one
            return copy_noop()
        case 'many2many':
            # there is nothing to do, the copying of the m2m will be handled when copying the relation table
            return copy_noop()
        case 'many2one':
            return copy_many2one(field)
        case 'many2one_reference':
            # TODO: in the case of a reference field, there is no comodel;
            # it's instead specified as the value of the field named by model_field.
            # Not really sure how to handle this, as it involves reading the content pointed to by model_field
            # to check on-the-fly, Python-side, whether it's populated or not, so for now we raw-copy it.
            # If we need to read on-the-fly, the populated structure needs to be in DB (via a new Model?)
            return copy(field)
        case 'binary':
            # copy only binary fields that are inlined in the table
            return copy(field) if not field.attachment else copy_noop()
        case _:
            return copy(field)


def populate_model(model: Model, populated: dict[Model, int], factors: dict[Model, int], separator_code: int) -> None:

    def update_sequence(model_):
        model_.env.execute_query(SQL("SELECT SETVAL(%(sequence)s, %(last_id)s, TRUE)",
                                     sequence=f"{model_._table}_id_seq", last_id=fetch_last_id(model_)))

    def has_column(field_):
        return field_.store and field_.column_type

    assert model not in populated, f"We do not populate a model ({model}) that has already been populated."
    _logger.info('Populating model %s %s times...', model._name, factors[model])
    dest_fields = []
    src_fields = []
    update_fields = []
    table_alias = 't'
    series_alias = 's'
    # process all stored fields (that have a corresponding column); if the model has an 'id', it's processed first
    for _, field in sorted(model._fields.items(), key=lambda pair: pair[0] != 'id'):
        if has_column(field):
            if field_needs_variation(model, field) and field.type in ('char', 'text'):
                update_fields.append(field)
            if src := populate_field(model, field, populated, factors, table_alias, series_alias):
                dest_fields.append(SQL.identifier(field.name))
                src_fields.append(src)
    # Update char/text fields of the existing rows first, to make the operation re-entrant
    if update_fields:
        query = SQL('UPDATE %(table)s SET (%(src_columns)s) = ROW(%(dest_columns)s)',
                    table=SQL.identifier(model._table),
                    src_columns=SQL(', ').join(SQL.identifier(field.name) for field in update_fields),
                    dest_columns=SQL(', ').join(
                        get_field_variation_char(field, postfix=SQL('CHR(%s)', separator_code))
                        for field in update_fields))
        model.env.cr.execute(query)
    query = SQL("""
        INSERT INTO %(table)s (%(dest_columns)s)
        SELECT %(src_columns)s FROM %(table)s %(table_alias)s,
        GENERATE_SERIES(1, %(factor)s) %(series_alias)s
    """, table=SQL.identifier(model._table), factor=factors[model],
         dest_columns=SQL(', ').join(dest_fields), src_columns=SQL(', ').join(src_fields),
         table_alias=SQL.identifier(table_alias), series_alias=SQL.identifier(series_alias))
    model.env.cr.execute(query)
    # normally, copying the 'id' will set the model's entry in the populated dict,
    # but for a table with no 'id' (e.g. a Many2many relation table), we add it manually,
    # by reading the key and letting the defaultdict do the insertion with a default value of 0
    if populated[model]:
        # in case we populated a model with an 'id', we update the sequence
        update_sequence(model)

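
# Illustrative sketch (added commentary): for a hypothetical table "product"
# with columns (id, name) and factor=2, the statement built above is roughly
#
#     INSERT INTO "product" ("id", "name")
#     SELECT id + <last_id> * s,
#            CASE WHEN name IS NULL OR name IN ('/', '') THEN name
#                 ELSE name || s END
#     FROM "product" t, GENERATE_SERIES(1, 2) s
#
# i.e. one self-join against a generated series, so the whole duplication is a
# single set-based INSERT rather than row-by-row copies.

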
class Many2oneFieldWrapper(Many2one):
    def __init__(self, model, field_name, comodel_name):
        super().__init__(comodel_name)
        self._setup_attrs(model, field_name)  # set up most of the default attrs


class Many2manyModelWrapper:
    def __init__(self, env, field):
        self._name = field.relation  # an m2m relation table doesn't have a _name, so we use the table name
        self._table = field.relation
        self._inherits = {}
        self.env = env
        self._rec_name = None
        self._rec_names_search = []
        # if the field is inherited, the column attributes are defined on the base_field
        column1 = field.column1 or field.base_field.column1
        column2 = field.column2 or field.base_field.column2
        # column1 refers to the model, while column2 refers to the comodel
        self._fields = {
            column1: Many2oneFieldWrapper(self, column1, field.model_name),
            column2: Many2oneFieldWrapper(self, column2, field.comodel_name),
        }

    def __repr__(self):
        return f"<Many2manyModelWrapper({self._name!r})>"

    def __eq__(self, other):
        return self._name == other._name

    def __hash__(self):
        return hash(self._name)


def infer_many2many_model(env: Environment, field: Field) -> Model | Many2manyModelWrapper:
    """
    Return the relation model used for the m2m:
    - If it's a custom model, return the model
    - If it's an implicit table generated by the ORM,
      return a wrapper that behaves like a duck-typed model for the population algorithm
    """
    # check whether the relation is an existing model
    for model_name, model_class in env.registry.items():
        if model_class._table == field.relation:
            return env[model_name]
    # the relation is a plain relational table, return a wrapped version
    return Many2manyModelWrapper(env, field)

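
# Illustrative sketch (added commentary): for a hypothetical implicit relation
# table 'product_tag_rel' with columns (product_id, tag_id), the wrapper above
# duck-types it as a two-field model whose _fields are two Many2one wrappers
# pointing at the product and tag models, which is all populate_model() needs:
# a _table, _fields, _inherits and an env.

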
def populate_models(model_factors: dict[Model, int], separator_code: int) -> None:
    """
    Duplicate each model's existing records `factor` times, using them as templates.

    If a dependency is found for a specific model but isn't specified by the user,
    it inherits the factor of the dependent model.
    """

    def has_records(model_):
        query = SQL('SELECT EXISTS (SELECT 1 FROM %s)', SQL.identifier(model_._table))
        return model_.env.execute_query(query)[0][0]

    populated: dict[Model, int] = defaultdict(int)
    ctx: PopulateContext = PopulateContext()

    def process(model_):
        if model_ in populated:
            return
        if not has_records(model_):  # if there are no records, there is nothing to populate
            populated[model_] = 0
            return

        # if the model has _inherits, the delegated models need to have been populated before the current one
        for model_name in model_._inherits:
            process(model_.env[model_name])

        with ctx.ignore_fkey_constraints(model_), ctx.ignore_indexes(model_):
            populate_model(model_, populated, model_factors, separator_code)

        # models on the other end of an X2many relation should also be populated
        # (e.g. to avoid sale orders without order lines)
        for field in model_._fields.values():
            if field.store and field.copy:
                match field.type:
                    case 'one2many':
                        comodel = model_.env[field.comodel_name]
                        if comodel != model_:
                            model_factors[comodel] = model_factors[model_]
                            process(comodel)
                    case 'many2many':
                        m2m_model = infer_many2many_model(model_.env, field)
                        model_factors[m2m_model] = model_factors[model_]
                        process(m2m_model)

    for model in list(model_factors):
        process(model)
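

# Illustrative usage sketch (added commentary, assuming a live Odoo
# environment `env`): duplicate every partner and every sale order 10 times,
# using '~' (ASCII 126) as the separator appended to varied char fields of
# existing rows to keep the run re-entrant.
#
#     populate_models(
#         {env['res.partner']: 10, env['sale.order']: 10},
#         separator_code=126,
#     )
#
# Dependencies (_inherits parents, one2many/many2many comodels) that are not
# listed inherit the factor of the model that pulled them in.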