runbot/runbot_builder/tools.py


#!/usr/bin/python3
import argparse
import docker
import logging
import os
import psutil
import re
import sys
import threading
import time
import signal
import subprocess
from contextlib import nullcontext
from datetime import datetime, timedelta, timezone
from pathlib import Path
from logging.handlers import WatchedFileHandler
LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG)
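# custom level 25 ("!NFO") sits between logging.INFO (20) and logging.WARNING (30)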
logging.addLevelName(25, "!NFO")
CPU_COUNT = os.cpu_count()
_logger = logging.getLogger(__name__)


class RunbotClient():
    """Base class for runbot's long-running host processes.

    Subclasses implement loop_turn() and are started through run().
    """

    def __init__(self, env):
        self.env = env
        self.ask_interrupt = threading.Event()
        self.host = None
        self.count = 0
        self.max_count = 60

    def on_start(self):
        pass

    def main_loop(self):
        from odoo import fields
        from odoo.tools.profiler import Profiler
        self.on_start()
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        signal.signal(signal.SIGQUIT, self.dump_stack)
        self.host = self.env['runbot.host']._get_current()
        self.update_next_git_gc_date()
        self.host._bootstrap()
        logging.info(
            'Host %s running with %s slots on pid %s%s',
            self.host.name,
            self.host.nb_worker,
            os.getpid(),
            ' (assigned only)' if self.host.assigned_only else ''
        )
        while True:
            context_manager = Profiler(db=self.env.cr.dbname) if self.host.profile else nullcontext()
            with context_manager:
                try:
                    self.host.last_start_loop = fields.Datetime.now()
                    self.env.cr.commit()
                    if self.env.registry != self.env.registry.check_signaling():
                        self.env.reset()
                        self.env = self.env()
                    self.count = self.count % self.max_count
                    if self.host.paused:
                        sleep_time = 5
                    else:
                        sleep_time = self.loop_turn()
                    self.count += 1
                    self.host.last_end_loop = fields.Datetime.now()
                    self.env.cr.commit()
                    self.env.clear()
                    self.sleep(sleep_time)
                except Exception as e:
                    _logger.exception('Builder main loop failed with: %s', e)
                    self.env.cr.rollback()
                    self.env.clear()
                    self.sleep(10)
            if self.ask_interrupt.is_set():
                return

    def loop_turn(self):
        """Perform one iteration of work and return the number of seconds to
        sleep before the next one. Must be implemented by subclasses."""
        raise NotImplementedError()

    def signal_handler(self, _signal, _frame):
        if self.ask_interrupt.is_set():
            _logger.info("Second Interrupt detected, force exit")
            os._exit(1)
        _logger.info("Interrupt detected")
        self.ask_interrupt.set()

    def dump_stack(self, _signal, _frame):
        import odoo
        odoo.tools.misc.dumpstacks()

    def sleep(self, t):
        self.ask_interrupt.wait(t)

    def update_next_git_gc_date(self):
        now = datetime.now()
        gc_hour = int(self.env['ir.config_parameter'].sudo().get_param('runbot.git_gc_hour', '23'))
        gc_minutes = self.host.id % 60  # deterministic minutes
        self.next_git_gc_date = datetime(now.year, now.month, now.day, gc_hour, gc_minutes)
        while self.next_git_gc_date <= now:
            self.next_git_gc_date += timedelta(days=1)
        _logger.info('Next git gc scheduled on %s', self.next_git_gc_date)
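
    # Example: with the default runbot.git_gc_hour of 23 and a host id of 125, the
    # next gc is scheduled at 23:05 (125 % 60 = 5), spreading gc start times across
    # hosts instead of starting them all at the same minute.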

    def git_gc(self):
        """ git gc once a day """
        if self.next_git_gc_date < datetime.now():
            _logger.info('Starting git gc on repositories')
            commands = []
            host_name = self.host.name
            for repo in self.env['runbot.repo'].search([]):
                commands.append((repo.name, repo._get_git_command(['gc', '--prune=all', '--quiet'])))
            self.env.cr.rollback()
            # gc commands can be slow; roll back so we don't keep a transaction idle for several minutes.
            messages = []
            for repo_name, command in commands:
                try:
                    start = time.time()
                    subprocess.check_output(command, stderr=subprocess.STDOUT).decode()
                    _logger.info('Git gc on %s took %ss', repo_name, time.time() - start)
                except subprocess.CalledProcessError as e:
                    message = f'git gc failed for {repo_name} on {host_name} with exit status {e.returncode} and message "{e.output[:60]} ..."'
                    messages.append(message)
            for message in messages:
                self.env['runbot.runbot']._warning(message)
            self.env.cr.commit()
            self.update_next_git_gc_date()
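
# Illustrative sketch (not part of this module): a concrete client only needs to
# implement loop_turn(), returning the number of seconds to sleep, and can then be
# launched through run() defined below.
#
#     class MyClient(RunbotClient):
#         def loop_turn(self):
#             ...  # one scheduling/building pass
#             return 10  # sleep 10 seconds before the next turn
#
#     run(MyClient)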


def run(client_class):
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('--odoo-path', help='Odoo sources path')
    parser.add_argument('--db_host')
    parser.add_argument('--db_port')
    parser.add_argument('--db_user')
    parser.add_argument('--db_password')
    parser.add_argument('-d', '--database', default='runbot', help='name of runbot db')
    parser.add_argument('--addons-path', type=str, dest="addons_path")
    parser.add_argument('--logfile', default=False)
    parser.add_argument('--forced-host-name', default=False)
    args = parser.parse_args()

    if args.logfile:
        dirname = os.path.dirname(args.logfile)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        handler = WatchedFileHandler(args.logfile)
        formatter = logging.Formatter(LOG_FORMAT)
        handler.setFormatter(formatter)
        logging.getLogger().addHandler(handler)

    # configure odoo
    sys.path.append(args.odoo_path)
    import odoo
    _logger.info("Starting scheduler on database %s", args.database)
    odoo.tools.config['db_host'] = args.db_host
    odoo.tools.config['db_port'] = args.db_port
    odoo.tools.config['db_user'] = args.db_user
    odoo.tools.config['db_password'] = args.db_password
    addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
    config_addons_path = args.addons_path or odoo.tools.config['addons_path']
    if addon_path not in config_addons_path:
        config_addons_path = ','.join([config_addons_path, addon_path])
    odoo.tools.config['addons_path'] = config_addons_path
    odoo.tools.config['forced_host_name'] = args.forced_host_name
    import odoo.modules

    # create environment
    registry = odoo.modules.registry.Registry(args.database)
    try:
        with registry.cursor() as cr:
            env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {})
            client = client_class(env)
            # run main loop
            client.main_loop()
    except Exception as e:
        _logger.exception(str(e))
        raise e
    _logger.info("Stopping gracefully")


def human_size(nb):
    for i, u in enumerate(['', ' KiB', ' MiB', ' GiB']):
        if nb < 1000 or i == 3:  # 1000 to avoid things like 1001 MiB
            break
        nb = nb / 1024
    return f'{nb:.2f}{u}'
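
# e.g. human_size(500) == '500.00', human_size(2048) == '2.00 KiB',
# human_size(3 * 1024 ** 3) == '3.00 GiB'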


def get_cpu_time():
    """Return the cumulative system CPU time (all states), in microseconds."""
    return sum(psutil.cpu_times()) * 10 ** 6


def get_docker_stats(container_id):
    """
    The cgroup v1 path is /sys/fs/cgroup/memory/docker/{container_id}/memory.usage_in_bytes
    https://docs.docker.com/config/containers/runmetrics/#find-the-cgroup-for-a-given-container
    On our runbot, we use cgroup v2 + the systemd driver.
    We could add a method to find the right path instead of hard coding it.

    Returns a tuple with:
    (time, current memory usage, cpu usage in microseconds, current system usage in microseconds)
    """
    try:
        memory_current = int(Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.current').read_text().strip())
    except Exception:  # logging the exception would help debugging but would spam the logs
        memory_current = 0
    try:
        cpu_stats = Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cpu.stat').read_text()
        cpu_usage_line = cpu_stats.split('\n')[0]
        usage_usec = int(cpu_usage_line.split()[1]) if cpu_usage_line else 0
    except Exception:
        usage_usec = 0
    return (time.time(), memory_current, usage_usec, get_cpu_time())
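
# Sketch of the path lookup suggested in the docstring above (an assumption: only
# the two layouts mentioned there are probed; this helper is not used by the code):
#
#     def _cgroup_dir(container_id):
#         for candidate in (
#             Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope'),  # cgroup v2 + systemd
#             Path(f'/sys/fs/cgroup/memory/docker/{container_id}'),              # cgroup v1
#         ):
#             if candidate.exists():
#                 return candidate
#         return None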


def prepare_stats_log(dest, previous_stats, current_stats):
    current_time, current_mem, current_cpu, current_cpu_time = current_stats
    if not previous_stats:
        return (current_time, 0, 0, current_cpu, current_cpu_time), ''

    _, logged_mem, logged_cpu_percent, previous_cpu, previous_cpu_time = previous_stats
    mem_ratio = (current_mem - logged_mem) / (logged_mem or 1) * 100
    cpu_delta = current_cpu - previous_cpu
    system_delta = (current_cpu_time - previous_cpu_time) / CPU_COUNT
    cpu_percent = cpu_delta / system_delta * 100
    cpu_percent_delta = (cpu_percent - logged_cpu_percent)
    cpu_percent_ratio = cpu_percent_delta / (logged_cpu_percent or 0.000001) * 100

    log_lines = []
    date_time = datetime.fromtimestamp(current_time).astimezone(tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
    if abs(mem_ratio) > 10:  # hard-coded values. We should find a way to configure.
        humanized = f'({human_size(current_mem)})'
        log_lines.append(f'{date_time} Memory: {current_mem:>11} {humanized:>11} ({mem_ratio:>+5.1f}%)')
        logged_mem = current_mem
    if abs(cpu_percent_ratio) > 20 and abs(cpu_percent_delta) > 2:
        log_lines.append(f'{date_time} CPU: {cpu_percent:>11.2f} ({cpu_percent_ratio:>+5.1f}%)')
        logged_cpu_percent = cpu_percent
    previous_stats = current_time, logged_mem, logged_cpu_percent, current_cpu, current_cpu_time
    return previous_stats, '\n'.join(log_lines)
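
# Worked example of the cpu computation above with CPU_COUNT = 8: if the whole
# system accumulated 64_000_000 usec of cpu time since the previous sample
# (system_delta = 8_000_000 per core) and the container used cpu_delta = 4_000_000
# usec, then cpu_percent = 4_000_000 / 8_000_000 * 100 = 50, i.e. half a core.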


def docker_monitoring_loop(builds_dir):
    builds_dir = Path(builds_dir)
    docker_client = docker.from_env()
    previous_stats_per_docker = {}
    _logger.info('Starting docker monitoring loop thread')
    while True:
        try:
            stats_per_docker = dict()
            for container in docker_client.containers.list(filters={'status': 'running'}):
                if re.match(r'^\d+-.+_.+', container.name):
                    dest, suffix = container.name.split('_', maxsplit=1)
                    container_log_dir = builds_dir / dest / 'logs'
                    if not container_log_dir.exists():
                        _logger.warning('Log dir not found: `%s`', container_log_dir)
                        continue
                    current_stats = get_docker_stats(container.id)
                    previous_stats = previous_stats_per_docker.get(container.name)
                    previous_stats, log_line = prepare_stats_log(dest, previous_stats, current_stats)
                    if log_line:
                        stat_log_file = container_log_dir / f'{suffix}-stats.txt'
                        with open(stat_log_file, mode='a') as f:
                            f.write(f'{log_line}\n')
                    stats_per_docker[container.name] = previous_stats
            previous_stats_per_docker = stats_per_docker
            time.sleep(1)
        except Exception as e:
            _logger.exception('Monitoring loop thread exception: %s', e)
            time.sleep(60)
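
# docker_monitoring_loop() never returns; the expected usage (an assumption, the
# caller lives outside this file) is to start it in a daemon thread next to the
# client main loop:
#
#     threading.Thread(
#         target=docker_monitoring_loop,
#         args=('/path/to/builds',),  # illustrative builds directory
#         daemon=True,
#     ).start()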