mirror of
https://github.com/odoo/runbot.git
synced 2025-03-15 15:35:46 +07:00
[IMP] runbot: add a monitoring tool to the builder
At this moment it's difficult to monitor a build cpu and memory usage. This must be done from outside the container to avoid to kill the cat by opening the box. This commit adds a daemonic thread launched by the builder. This thread will read the memory and cpu stats of the running dockers and report their stats in a log file into the logs directory of the builds. The log file format is made to be easily mixed with the build logs and spot memory leaks or cpu overload during builds.
This commit is contained in:
parent
16fa0e1ad8
commit
794a6bf7e2
@ -1,7 +1,10 @@
|
||||
#!/usr/bin/python3
|
||||
import logging
|
||||
import threading
|
||||
|
||||
from tools import RunbotClient, run
|
||||
from pathlib import Path
|
||||
|
||||
from tools import RunbotClient, run, docker_monitoring_loop
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
@ -9,6 +12,10 @@ _logger = logging.getLogger(__name__)
|
||||
class BuilderClient(RunbotClient):
|
||||
|
||||
def on_start(self):
|
||||
builds_path = Path(self.env['runbot.runbot']._root()) / 'build'
|
||||
monitoring_thread = threading.Thread(target=docker_monitoring_loop, args=(builds_path,), daemon=True)
|
||||
monitoring_thread.start()
|
||||
|
||||
for repo in self.env['runbot.repo'].search([('mode', '!=', 'disabled')]):
|
||||
repo._update(force=True)
|
||||
|
||||
|
@ -1,13 +1,17 @@
|
||||
#!/usr/bin/python3
|
||||
import argparse
|
||||
import docker
|
||||
import logging
|
||||
import os
|
||||
import psutil
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
import random
|
||||
import time
|
||||
import signal
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from logging.handlers import WatchedFileHandler
|
||||
|
||||
LOG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s'
|
||||
@ -15,6 +19,7 @@ logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
|
||||
logging.getLogger('odoo.addons.runbot').setLevel(logging.DEBUG)
|
||||
logging.addLevelName(25, "!NFO")
|
||||
|
||||
CPU_COUNT = os.cpu_count()
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -149,3 +154,89 @@ def run(client_class):
|
||||
_logger.exception(str(e))
|
||||
raise e
|
||||
_logger.info("Stopping gracefully")
|
||||
|
||||
def human_size(nb):
|
||||
for i, u in enumerate(['', ' KiB', ' MiB', ' GiB']):
|
||||
if nb < 1000 or i == 3: # 1000 to avoid things like 1001 MiB
|
||||
break
|
||||
nb = nb / 1024
|
||||
return f'{nb:.2f}{u}'
|
||||
|
||||
def get_cpu_time():
|
||||
"""returns system cpus times in microseconds"""
|
||||
return sum([t for t in psutil.cpu_times()]) * 10 ** 6
|
||||
|
||||
def get_docker_stats(container_id):
|
||||
"""
|
||||
Cgroup V1 path is /sys/fs/cgroup/memory/docker/{container_id}/memory.usage_in_bytes'
|
||||
https://docs.docker.com/config/containers/runmetrics/#find-the-cgroup-for-a-given-container
|
||||
on our runbot, we use cgroup V2 + systemd driver
|
||||
We could add a method to find the right path instead of hard coding
|
||||
Returns a tupple with:
|
||||
(time, current memory usage, cpu usage in microseconds, current system usage microseconds)
|
||||
"""
|
||||
try:
|
||||
memory_current = int(Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.current').read_text().strip())
|
||||
except Exception: # should we log exception in order to debug ... but will spam the logs ...
|
||||
memory_current = 0
|
||||
try:
|
||||
cpu_stats = Path(f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cpu.stat').read_text()
|
||||
cpu_usage_line = cpu_stats.split('\n')[0]
|
||||
usage_usec = cpu_usage_line and int(cpu_usage_line.split()[1])
|
||||
except Exception:
|
||||
usage_usec = 0
|
||||
return (time.time(), memory_current, usage_usec, get_cpu_time())
|
||||
|
||||
def prepare_stats_log(dest, previous_stats, current_stats):
|
||||
current_time, current_mem, current_cpu, current_cpu_time = current_stats
|
||||
|
||||
if not previous_stats:
|
||||
return (current_time, 0, 0, current_cpu, current_cpu_time), ''
|
||||
|
||||
_, logged_mem, logged_cpu_percent, previous_cpu, previous_cpu_time, = previous_stats
|
||||
|
||||
mem_ratio = (current_mem - logged_mem) / (logged_mem or 1) * 100
|
||||
cpu_delta = current_cpu - previous_cpu
|
||||
system_delta = (current_cpu_time - previous_cpu_time) / CPU_COUNT
|
||||
cpu_percent = cpu_delta / system_delta * 100
|
||||
cpu_percent_delta = (cpu_percent - logged_cpu_percent)
|
||||
cpu_percent_ratio = cpu_percent_delta / (logged_cpu_percent or 0.000001) * 100
|
||||
|
||||
log_lines = []
|
||||
date_time = datetime.fromtimestamp(current_time).astimezone(tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S,%f')[:-3]
|
||||
if abs(mem_ratio) > 10: # hard-coded values. We should find a way to configure.
|
||||
humanized = f'({human_size(current_mem)})'
|
||||
log_lines.append(f'{date_time} Memory: {current_mem:>11} {humanized:>11} ({mem_ratio:>+5.1f}%)')
|
||||
logged_mem = current_mem
|
||||
if abs(cpu_percent_ratio) > 20 and abs(cpu_percent_delta) > 2:
|
||||
log_lines.append(f'{date_time} CPU: {cpu_percent:>11.2f} ({cpu_percent_ratio:>+5.1f}%)')
|
||||
logged_cpu_percent = cpu_percent
|
||||
previous_stats = current_time, logged_mem, logged_cpu_percent, current_cpu, current_cpu_time
|
||||
return previous_stats, '\n'.join(log_lines)
|
||||
|
||||
def docker_monitoring_loop(builds_dir):
|
||||
docker_client = docker.from_env()
|
||||
previous_stats_per_docker = {}
|
||||
_logger.info('Starting docker monitoring loop thread')
|
||||
while True:
|
||||
try:
|
||||
stats_per_docker = dict()
|
||||
for container in docker_client.containers.list(filters={'status': 'running'}):
|
||||
if re.match(r'^\d+-.+_.+', container.name):
|
||||
dest, suffix = container.name.split('_', maxsplit=1)
|
||||
container_log_dir = builds_dir / dest / 'logs'
|
||||
if not container_log_dir.exists():
|
||||
_logger.warning('Log dir not found: `%s`', container_log_dir)
|
||||
continue
|
||||
current_stats = get_docker_stats(container.id)
|
||||
previous_stats = previous_stats_per_docker.get(container.name)
|
||||
previous_stats, log_line = prepare_stats_log(dest, previous_stats, current_stats)
|
||||
if log_line:
|
||||
stat_log_file = container_log_dir / f'{suffix}-stats.txt'
|
||||
stat_log_file.open(mode='a').write(f'{log_line}\n')
|
||||
stats_per_docker[container.name] = previous_stats
|
||||
previous_stats_per_docker = stats_per_docker
|
||||
time.sleep(1)
|
||||
except Exception as e:
|
||||
_logger.exception('Monitoring loop thread exception: %s', e)
|
||||
time.sleep(60)
|
||||
|
Loading…
Reference in New Issue
Block a user