runbot/runbot_builder/tools.py


#!/usr/bin/python3
import argparse
import docker
import logging
import os
import psutil
import re
import sys
import threading
import time
import signal
import subprocess
from contextlib import nullcontext
from datetime import datetime, timedelta, timezone
from pathlib import Path
from logging.handlers import WatchedFileHandler
LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG)
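# custom level 25 ("!NFO") sits between logging.INFO (20) and logging.WARNING (30)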
logging.addLevelName(25, "!NFO")
CPU_COUNT = os.cpu_count()
_logger = logging.getLogger(__name__)


class RunbotClient():
    """Base class for runbot's long-running host processes.

    Subclasses implement loop_turn() and are started through run().
    """

    def __init__(self, env):
        self.env = env
        self.ask_interrupt = threading.Event()
        self.host = None
        self.count = 0
        self.max_count = 60

    def on_start(self):
        pass

    def main_loop(self):
        from odoo import fields
        from odoo.tools.profiler import Profiler
        self.on_start()
        signal.signal(signal.SIGINT, self.signal_handler)
        signal.signal(signal.SIGTERM, self.signal_handler)
        signal.signal(signal.SIGQUIT, self.dump_stack)
        self.host = self.env['runbot.host']._get_current()
        self.update_next_git_gc_date()
        self.host._bootstrap()
        logging.info(
            'Host %s running with %s slots on pid %s%s',
            self.host.name,
            self.host.nb_worker,
            os.getpid(),
            ' (assigned only)' if self.host.assigned_only else ''
        )
        while True:
            context_manager = Profiler(db=self.env.cr.dbname) if self.host.profile else nullcontext()
            with context_manager:
                try:
                    self.host.last_start_loop = fields.Datetime.now()
                    self.env.cr.commit()
                    if self.env.registry != self.env.registry.check_signaling():
                        self.env.reset()
                        self.env = self.env()
                    self.count = self.count % self.max_count
                    if self.host.paused:
                        sleep_time = 5
                    else:
                        sleep_time = self.loop_turn()
                    self.count += 1
                    self.host.last_end_loop = fields.Datetime.now()
                    self.env.cr.commit()
                    self.env.clear()
                    self.sleep(sleep_time)
                except Exception as e:
                    _logger.exception('Builder main loop failed with: %s', e)
                    self.env.cr.rollback()
                    self.env.clear()
                    self.sleep(10)
            if self.ask_interrupt.is_set():
                return

    def loop_turn(self):
        """Perform one iteration of work and return the number of seconds to
        sleep before the next one. Must be implemented by subclasses."""
        raise NotImplementedError()

    def signal_handler(self, _signal, _frame):
        if self.ask_interrupt.is_set():
            _logger.info("Second Interrupt detected, force exit")
            os._exit(1)
        _logger.info("Interrupt detected")
        self.ask_interrupt.set()

    def dump_stack(self, _signal, _frame):
        import odoo
        odoo.tools.misc.dumpstacks()

    def sleep(self, t):
        self.ask_interrupt.wait(t)

    def update_next_git_gc_date(self):
        now = datetime.now()
        gc_hour = int(self.env['ir.config_parameter'].sudo().get_param('runbot.git_gc_hour', '23'))
        gc_minutes = self.host.id % 60  # deterministic minutes
        self.next_git_gc_date = datetime(now.year, now.month, now.day, gc_hour, gc_minutes)
        while self.next_git_gc_date <= now:
            self.next_git_gc_date += timedelta(days=1)
        _logger.info('Next git gc scheduled on %s', self.next_git_gc_date)
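
    # Example: with the default runbot.git_gc_hour of 23 and a host id of 125, the
    # next gc is scheduled at 23:05 (125 % 60 = 5), spreading gc start times across
    # hosts instead of starting them all at the same minute.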

    def git_gc(self):
        """ git gc once a day """
        if self.next_git_gc_date < datetime.now():
            _logger.info('Starting git gc on repositories')
            commands = []
            host_name = self.host.name
            for repo in self.env['runbot.repo'].search([]):
                commands.append((repo.name, repo._get_git_command(['gc', '--prune=all', '--quiet'])))
            self.env.cr.rollback()
            # gc commands can be slow; roll back so we don't keep a transaction idle for several minutes.
            messages = []
            for repo_name, command in commands:
                try:
                    start = time.time()
                    subprocess.check_output(command, stderr=subprocess.STDOUT).decode()
                    _logger.info('Git gc on %s took %ss', repo_name, time.time() - start)
                except subprocess.CalledProcessError as e:
                    message = f'git gc failed for {repo_name} on {host_name} with exit status {e.returncode} and message "{e.output[:60]} ..."'
                    messages.append(message)
            for message in messages:
                self.env['runbot.runbot']._warning(message)
            self.env.cr.commit()
            self.update_next_git_gc_date()
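
# Illustrative sketch (not part of this module): a concrete client only needs to
# implement loop_turn(), returning the number of seconds to sleep, and can then be
# launched through run() defined below.
#
#     class MyClient(RunbotClient):
#         def loop_turn(self):
#             ...  # one scheduling/building pass
#             return 10  # sleep 10 seconds before the next turn
#
#     run(MyClient)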


def run(client_class):
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('--odoo-path', help='Odoo sources path')
    parser.add_argument('--db_host')
    parser.add_argument('--db_port')
    parser.add_argument('--db_user')
    parser.add_argument('--db_password')
    parser.add_argument('-d', '--database', default='runbot', help='name of runbot db')
    parser.add_argument('--addons-path', type=str, dest="addons_path")
    parser.add_argument('--logfile', default=False)
    parser.add_argument('--forced-host-name', default=False)
    args = parser.parse_args()

    if args.logfile:
        dirname = os.path.dirname(args.logfile)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)
        handler = WatchedFileHandler(args.logfile)
        formatter = logging.Formatter(LOG_FORMAT)
        handler.setFormatter(formatter)
        logging.getLogger().addHandler(handler)

    # configure odoo
    sys.path.append(args.odoo_path)
    import odoo
    _logger.info("Starting scheduler on database %s", args.database)
    odoo.tools.config['db_host'] = args.db_host
    odoo.tools.config['db_port'] = args.db_port
    odoo.tools.config['db_user'] = args.db_user
    odoo.tools.config['db_password'] = args.db_password
    addon_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
    config_addons_path = args.addons_path or odoo.tools.config['addons_path']
    if addon_path not in config_addons_path:
        config_addons_path = ','.join([config_addons_path, addon_path])
    odoo.tools.config['addons_path'] = config_addons_path
    odoo.tools.config['forced_host_name'] = args.forced_host_name
    import odoo.modules

    # create environment
    registry = odoo.modules.registry.Registry(args.database)
    try:
        with registry.cursor() as cr:
            env = odoo.api.Environment(cr, odoo.SUPERUSER_ID, {})
            client = client_class(env)
            # run main loop
            client.main_loop()
    except Exception as e:
        _logger.exception(str(e))
        raise e
    _logger.info("Stopping gracefully")


def human_size(nb):
    for i, u in enumerate(['', ' KiB', ' MiB', ' GiB']):
        if nb < 1000 or i == 3:  # 1000 to avoid things like 1001 MiB
            break
        nb = nb / 1024
    return f'{nb:.2f}{u}'
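
# e.g. human_size(500) == '500.00', human_size(2048) == '2.00 KiB',
# human_size(3 * 1024 ** 3) == '3.00 GiB'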


def get_cpu_time():
    """Return the cumulative system CPU time (all states), in microseconds."""
    return sum(psutil.cpu_times()) * 10 ** 6


def get_docker_stats(container_id):
    """
    The cgroup v1 path is /sys/fs/cgroup/memory/docker/{container_id}/memory.usage_in_bytes
    https://docs.docker.com/config/containers/runmetrics/#find-the-cgroup-for-a-given-container
    On our runbot, we use cgroup v2 + the systemd driver.
    We could add a method to find the right path instead of hard coding it.

    Returns a tuple with:
    (time, current memory usage, cpu usage in microseconds, current system usage in microseconds)
    """
    try:
        memory_current = int(Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.current').read_text().strip())
    except Exception:  # logging the exception would help debugging but would spam the logs
        memory_current = 0
    try:
        cpu_stats = Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cpu.stat').read_text()
        cpu_usage_line = cpu_stats.split('\n')[0]
        usage_usec = int(cpu_usage_line.split()[1]) if cpu_usage_line else 0
    except Exception:
        usage_usec = 0
    return (time.time(), memory_current, usage_usec, get_cpu_time())
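
# Sketch of the path lookup suggested in the docstring above (an assumption: only
# the two layouts mentioned there are probed; this helper is not used by the code):
#
#     def _cgroup_dir(container_id):
#         for candidate in (
#             Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope'),  # cgroup v2 + systemd
#             Path(f'/sys/fs/cgroup/memory/docker/{container_id}'),              # cgroup v1
#         ):
#             if candidate.exists():
#                 return candidate
#         return None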


def prepare_stats_log(dest, previous_stats, current_stats):
    current_time, current_mem, current_cpu, current_cpu_time = current_stats
    if not previous_stats:
        return (current_time, 0, 0, current_cpu, current_cpu_time), ''

    _, logged_mem, logged_cpu_percent, previous_cpu, previous_cpu_time = previous_stats
    mem_ratio = (current_mem - logged_mem) / (logged_mem or 1) * 100
    cpu_delta = current_cpu - previous_cpu
    system_delta = (current_cpu_time - previous_cpu_time) / CPU_COUNT
    cpu_percent = cpu_delta / system_delta * 100
    cpu_percent_delta = (cpu_percent - logged_cpu_percent)
    cpu_percent_ratio = cpu_percent_delta / (logged_cpu_percent or 0.000001) * 100

    log_lines = []
    date_time = datetime.fromtimestamp(current_time).astimezone(tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
    if abs(mem_ratio) > 10:  # hard-coded values. We should find a way to configure.
        humanized = f'({human_size(current_mem)})'
        log_lines.append(f'{date_time} Memory: {current_mem:>11} {humanized:>11} ({mem_ratio:>+5.1f}%)')
        logged_mem = current_mem
    if abs(cpu_percent_ratio) > 20 and abs(cpu_percent_delta) > 2:
        log_lines.append(f'{date_time} CPU: {cpu_percent:>11.2f} ({cpu_percent_ratio:>+5.1f}%)')
        logged_cpu_percent = cpu_percent
    previous_stats = current_time, logged_mem, logged_cpu_percent, current_cpu, current_cpu_time
    return previous_stats, '\n'.join(log_lines)
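
# Worked example of the cpu computation above with CPU_COUNT = 8: if the whole
# system accumulated 64_000_000 usec of cpu time since the previous sample
# (system_delta = 8_000_000 per core) and the container used cpu_delta = 4_000_000
# usec, then cpu_percent = 4_000_000 / 8_000_000 * 100 = 50, i.e. half a core.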


def docker_monitoring_loop(builds_dir):
    builds_dir = Path(builds_dir)
    docker_client = docker.from_env()
    previous_stats_per_docker = {}
    _logger.info('Starting docker monitoring loop thread')
    while True:
        try:
            stats_per_docker = dict()
            for container in docker_client.containers.list(filters={'status': 'running'}):
                if re.match(r'^\d+-.+_.+', container.name):
                    dest, suffix = container.name.split('_', maxsplit=1)
                    container_log_dir = builds_dir / dest / 'logs'
                    if not container_log_dir.exists():
                        _logger.warning('Log dir not found: `%s`', container_log_dir)
                        continue
                    current_stats = get_docker_stats(container.id)
                    previous_stats = previous_stats_per_docker.get(container.name)
                    previous_stats, log_line = prepare_stats_log(dest, previous_stats, current_stats)
                    if log_line:
                        stat_log_file = container_log_dir / f'{suffix}-stats.txt'
                        with open(stat_log_file, mode='a') as f:
                            f.write(f'{log_line}\n')
                    stats_per_docker[container.name] = previous_stats
            previous_stats_per_docker = stats_per_docker
            time.sleep(1)
        except Exception as e:
            _logger.exception('Monitoring loop thread exception: %s', e)
            time.sleep(60)
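
# docker_monitoring_loop() never returns; the expected usage (an assumption, the
# caller lives outside this file) is to start it in a daemon thread next to the
# client main loop:
#
#     threading.Thread(
#         target=docker_monitoring_loop,
#         args=('/path/to/builds',),  # illustrative builds directory
#         daemon=True,
#     ).start()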