[WIP] runbot: monitoring tools

Add a new model runbot.host to keep info and configuration about
hosts (worker servers), like the number of workers, reserved or not,
ping times (last start loop, successful iteration, end loop, ...)
and also last errors, number of testing builds per host, psql connection
count, ...

A new monitoring frontend page is created, similar to glances
but with additional information like host states and
last_monitored builds (for nightly)

Later this model will be used for runbot_build host instead of char.

Hosts are automatically created when running _scheduler.
This commit is contained in:
Xavier-Do 2019-08-19 16:03:14 +02:00
parent ef24adad88
commit 02d2cc4528
10 changed files with 297 additions and 21 deletions

View File

@ -16,6 +16,7 @@
'views/repo_views.xml',
'views/branch_views.xml',
'views/build_views.xml',
'views/host_views.xml',
'views/config_views.xml',
'views/res_config_settings_views.xml',
'templates/frontend.xml',

View File

@ -273,8 +273,7 @@ class Runbot(Controller):
return request.render("runbot.sticky-dashboard", qctx)
@route('/runbot/glances', type='http', auth='public', website=True)
def glances(self, refresh=None):
def _glances_ctx(self):
repos = request.env['runbot.repo'].search([]) # respect record rules
default_config_id = request.env.ref('runbot.runbot_build_config_default').id
query = """
@ -302,16 +301,45 @@ class Runbot(Controller):
ctx = OrderedDict()
for row in cr.fetchall():
ctx.setdefault(row[0], []).append(row[1:])
return ctx
@route('/runbot/glances', type='http', auth='public', website=True)
def glances(self, refresh=None):
glances_ctx = self._glances_ctx()
pending = self._pending()
qctx = {
'refresh': refresh,
'pending_total': pending[0],
'pending_level': pending[1],
'data': ctx,
'glances_data': glances_ctx,
}
return request.render("runbot.glances", qctx)
@route('/runbot/monitoring', type='http', auth='user', website=True)
def monitoring(self, refresh=None):
    """Render the ``runbot.monitoring`` page.

    The rendering context combines the glances data, the pending build
    counters, every ``runbot.host`` record visible to the current user and,
    for each sticky branch, the most recent running/done build that uses
    the monitored config (``runbot.monitored_config_id`` parameter,
    defaulting to 1).

    :param refresh: forwarded as-is to the template, which uses it as the
        content of a ``<meta http-equiv="refresh">`` tag when set
    """
    glances_ctx = self._glances_ctx()
    pending = self._pending()
    hosts_data = request.env['runbot.host'].search([])

    icp = request.env['ir.config_parameter'].sudo()
    monitored_config_id = int(icp.get_param('runbot.monitored_config_id', 1))

    # DISTINCT ON + "ORDER BY branch_id ASC, id DESC" keeps only the highest
    # build id of each sticky branch.
    query = """SELECT DISTINCT ON (branch_id) branch_id, id FROM runbot_build
WHERE config_id = %s
AND global_state in ('running', 'done')
AND branch_id in (SELECT id FROM runbot_branch where sticky='t')
ORDER BY branch_id ASC, id DESC"""
    request.env.cr.execute(query, [monitored_config_id])
    build_ids = [build_id for _branch_id, build_id in request.env.cr.fetchall()]
    last_monitored = request.env['runbot.build'].browse(build_ids)

    qctx = dict(
        refresh=refresh,
        pending_total=pending[0],
        pending_level=pending[1],
        glances_data=glances_ctx,
        hosts_data=hosts_data,
        last_monitored=last_monitored,  # nightly
    )
    return request.render("runbot.monitoring", qctx)
@route(['/runbot/branch/<int:branch_id>', '/runbot/branch/<int:branch_id>/page/<int:page>'], website=True, auth='public', type='http')
def branch_builds(self, branch_id=None, search='', page=1, limit=50, refresh='', **kwargs):
""" list builds of a runbot branch """

View File

@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
from . import repo, branch, build, event, build_dependency, build_config, ir_cron
from . import repo, branch, build, event, build_dependency, build_config, ir_cron, host
from . import res_config_settings

60
runbot/models/host.py Normal file
View File

@ -0,0 +1,60 @@
import logging
from odoo import models, fields, api
from ..common import fqdn, local_pgadmin_cursor
_logger = logging.getLogger(__name__)
class RunboHost(models.Model):
    """Worker host: per-host configuration and monitoring information.

    A record is looked up (and created when missing) by fully qualified
    host name through :meth:`_get_current`.
    """
    _name = "runbot.host"
    _order = 'id'
    _inherit = 'mail.thread'

    # NOTE(review): ``unique`` is not a documented Odoo field option, so
    # uniqueness of the host name is presumably not enforced at database
    # level -- confirm, and consider an ``_sql_constraints`` entry.
    name = fields.Char('Host name', required=True, unique=True)
    disp_name = fields.Char('Display name')
    active = fields.Boolean('Active', default=True)
    # Timestamps of the host's scheduling loop.
    last_start_loop = fields.Datetime('Last start')
    last_end_loop = fields.Datetime('Last end')
    last_success = fields.Datetime('Last success')
    assigned_only = fields.Boolean('Only accept assigned build', default=False)
    nb_worker = fields.Integer('Number of max paralel build', help="0 to use icp value", default=0)
    # Non-stored counters, both filled by ``_compute_nb``.
    nb_testing = fields.Integer(compute='_compute_nb')
    nb_running = fields.Integer(compute='_compute_nb')
    last_exception = fields.Char('Last exception')
    exception_count = fields.Integer('Exception count')
    psql_conn_count = fields.Integer('SQL connections count', default=0)

    def _compute_nb(self):
        """Compute ``nb_testing`` and ``nb_running`` for each host in ``self``.

        One grouped query counts the builds whose ``host`` is one of the
        host names and whose ``local_state`` is ``testing`` or ``running``;
        hosts without any matching build get 0 for both counters.
        """
        groups = self.env['runbot.build'].read_group(
            [('host', 'in', self.mapped('name')), ('local_state', 'in', ('testing', 'running'))],
            ['host', 'local_state'],
            ['host', 'local_state'],
            lazy=False,
        )
        # Pre-seed every host so the lookups below never miss.
        count_by_host_state = {host.name: {} for host in self}
        for group in groups:
            count_by_host_state[group['host']][group['local_state']] = group['__count']
        for host in self:
            counts = count_by_host_state[host.name]
            host.nb_testing = counts.get('testing', 0)
            host.nb_running = counts.get('running', 0)

    @api.model
    def create(self, values):
        """Create a host, defaulting ``disp_name`` to ``name`` when not given.

        The default is applied on a copy so the caller's dictionary is
        left untouched.

        :param values: field values of the new record
        :return: the created record
        """
        if 'disp_name' not in values:
            values = dict(values, disp_name=values.get('name'))
        return super().create(values)

    @api.model
    def _get_current(self):
        """Return the host named after ``fqdn()``, creating it if not found."""
        name = fqdn()
        return self.search([('name', '=', name)]) or self.create({'name': name})

    def get_nb_worker(self):
        """Return the number of workers of this host.

        ``nb_worker`` is used when it is non-zero; otherwise the value of
        the ``runbot.runbot_workers`` config parameter (default 6) is used.
        """
        icp = self.env['ir.config_parameter']
        return self.nb_worker or int(icp.sudo().get_param('runbot.runbot_workers', default=6))

    def set_psql_conn_count(self):
        """Store the summed ``numbackends`` of ``pg_stat_database``.

        The query runs on the cursor given by ``local_pgadmin_cursor()``
        (presumably connected to the local PostgreSQL instance -- confirm
        against the helper). 0 is stored when no value is returned.
        """
        self.ensure_one()
        with local_pgadmin_cursor() as local_cr:
            local_cr.execute("SELECT sum(numbackends) FROM pg_stat_database;")
            res = local_cr.fetchone()
        # ``sum`` yields NULL (None) without rows, hence the trailing ``or 0``.
        self.psql_conn_count = (res[0] if res else 0) or 0

View File

@ -15,7 +15,7 @@ import shutil
from odoo.exceptions import UserError, ValidationError
from odoo.tools.misc import DEFAULT_SERVER_DATETIME_FORMAT
from odoo import models, fields, api
from odoo import models, fields, api, registry
from odoo.modules.module import get_module_resource
from odoo.tools import config
from ..common import fqdn, dt2time, Commit
@ -398,21 +398,20 @@ class runbot_repo(models.Model):
_logger.exception('Fail to update repo %s', repo.name)
@api.multi
def _scheduler(self):
def _scheduler(self, host=None):
"""Schedule builds for the repository"""
ids = self.ids
if not ids:
return
icp = self.env['ir.config_parameter']
host = fqdn()
settings_workers = int(icp.get_param('runbot.runbot_workers', default=6))
workers = int(icp.get_param('%s.workers' % host, default=settings_workers))
host = host or self.env['runbot.host']._get_current()
workers = host.get_nb_worker()
running_max = int(icp.get_param('runbot.runbot_running_max', default=75))
assigned_only = int(icp.get_param('%s.assigned_only' % host, default=False))
assigned_only = host.assigned_only
Build = self.env['runbot.build']
domain = [('repo_id', 'in', ids)]
domain_host = domain + [('host', '=', host)]
domain_host = domain + [('host', '=', host.name)]
# schedule jobs (transitions testing -> running, kill jobs, ...)
build_ids = Build.search(domain_host + ['|', ('local_state', 'in', ['testing', 'running']), ('requested_action', 'in', ['wake_up', 'deathrow'])])
@ -458,7 +457,7 @@ class runbot_repo(models.Model):
)
RETURNING id""" % where_clause
self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': fqdn(), 'limit': limit})
self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': host.name, 'limit': limit})
return self.env.cr.fetchall()
allocated = allocate_builds("""AND runbot_build.build_type != 'scheduled'""", assignable_slots)
@ -561,6 +560,10 @@ class runbot_repo(models.Model):
"""
if hostname != fqdn():
return 'Not for me'
host = self.env['runbot.host']._get_current()
host.set_psql_conn_count()
host.last_start_loop = fields.Datetime.now()
self.env.cr.commit()
start_time = time.time()
# 1. source cleanup
# -> Remove sources when no build is using them
@ -576,7 +579,8 @@ class runbot_repo(models.Model):
while time.time() - start_time < timeout:
repos = self.search([('mode', '!=', 'disabled')])
try:
repos._scheduler()
repos._scheduler(host)
host.last_success = fields.Datetime.now()
self.env.cr.commit()
self.env.reset()
self = self.env()[self._name]
@ -587,6 +591,21 @@ class runbot_repo(models.Model):
self.env.cr.rollback()
self.env.reset()
time.sleep(random.uniform(0, 1))
except Exception as e:
with registry(self._cr.dbname).cursor() as cr: # user another cursor since transaction will be rollbacked
message = str(e)
chost = host.with_env(self.env(cr=cr))
if chost.last_exception == message:
chost.exception_count += 1
else:
chost.with_env(self.env(cr=cr)).last_exception = str(e)
chost.exception_count = 1
raise
if host.last_exception:
host.last_exception = ""
host.exception_count = 0
host.last_end_loop = fields.Datetime.now()
def _source_cleanup(self):
try:

View File

@ -17,3 +17,6 @@ access_runbot_build_config_manager,runbot_build_config_manager,runbot.model_runb
access_runbot_build_config_step_order_user,runbot_build_config_step_order_user,runbot.model_runbot_build_config_step_order,group_user,1,0,0,0
access_runbot_build_config_step_order_manager,runbot_build_config_step_order_manager,runbot.model_runbot_build_config_step_order,runbot.group_build_config_user,1,1,1,1
access_runbot_host_user,runbot_host_user,runbot.model_runbot_host,group_user,1,0,0,0
access_runbot_host_manager,runbot_host_manager,runbot.model_runbot_host,runbot.group_runbot_admin,1,1,1,1

1 id name model_id:id group_id:id perm_read perm_write perm_create perm_unlink
17 access_runbot_host_user runbot_host_user runbot.model_runbot_host group_user 1 0 0 0
18 access_runbot_host_manager runbot_host_manager runbot.model_runbot_host runbot.group_runbot_admin 1 1 1 1
19
20
21
22

View File

@ -76,7 +76,7 @@
</t>
</template>
<template id="runbot.glances">
<t t-call='website.layout'>
<t t-call='portal.frontend_layout'>
<t t-set="head">
<t t-if="refresh">
<meta http-equiv="refresh" t-att-content="refresh"/>
@ -98,10 +98,10 @@
<div>
<span t-attf-class="label label-{{pending_level}}">Pending: <t t-esc="pending_total"/></span>
</div>
<t t-foreach="data.keys()" t-as="repo">
<t t-foreach="glances_data.keys()" t-as="repo">
<h4><t t-esc="repo"/>
</h4>
<t t-foreach="data[repo]" t-as="br">
<t t-foreach="glances_data[repo]" t-as="br">
<t t-if="br[1] == 'ko'"><t t-set="klass">danger</t></t>
<t t-if="br[1] == 'warn'"><t t-set="klass">warning</t></t>
<t t-if="br[1] == 'ok'"><t t-set="klass">success</t></t>
@ -114,5 +114,104 @@
</div>
</t>
</template>
<template id="frontend_no_nav" inherit_id="portal.frontend_layout" primary="True">
<xpath expr="//header" position="replace">
</xpath>
</template>
<template id="runbot.monitoring">
    <!--
        Monitoring page. Expects in the rendering context: refresh,
        pending_total, pending_level, glances_data (repo -> list of rows)
        and hosts_data (runbot.host recordset).
    -->
    <t t-call="runbot.frontend_no_nav">
        <t t-set="head">
            <t t-if="refresh">
                <meta http-equiv="refresh" t-att-content="refresh"/>
            </t>
            <style>
                .label-killed {
                    background-color: #aaa;
                }
                h4 {
                    padding: 3px 0;
                    border-bottom: 1px solid grey;
                }
                .r-mb02 { margin-bottom: 0.2em; }
            </style>
        </t>
        <div class="container-fluid">
            <div class="row">
                <div class="col-md-12">
                    <div>
                        <span t-attf-class="label label-{{pending_level}}">Pending: <t t-esc="pending_total"/></span>
                        <!-- Totals are real integers so they stay usable when there is no host at all. -->
                        <t t-set="testing" t-value="0"/>
                        <t t-set="workers" t-value="0"/>
                        <t t-foreach="hosts_data.sorted(key=lambda h:h.name)" t-as="host">
                            <t t-set="testing" t-value="testing + host.nb_testing"/>
                            <t t-set="workers" t-value="workers + host.sudo().get_nb_worker()"/>
                        </t>
                        <!-- Without any worker the load is "full" as soon as something is testing, idle otherwise. -->
                        <t t-set="load" t-value="testing / workers if workers else (1 if testing else 0)"/>
                        <t t-set="klass">success</t>
                        <t t-if="load > 0"><t t-set="klass">info</t></t>
                        <t t-if="load > 0.75"><t t-set="klass">warning</t></t>
                        <t t-if="load >= 1"><t t-set="klass">danger</t></t>
                        <span t-attf-class="label label-{{klass}}">Testing: <t t-esc="testing"/>/<t t-esc="workers"/></span>
                    </div>
                    <t t-foreach="glances_data.keys()" t-as="repo">
                        <div>
                            <span t-esc="repo"/>
                            <t t-foreach="glances_data[repo]" t-as="br">
                                <t t-if="br[1] == 'ko'"><t t-set="klass">danger</t></t>
                                <t t-if="br[1] == 'warn'"><t t-set="klass">warning</t></t>
                                <t t-if="br[1] == 'ok'"><t t-set="klass">success</t></t>
                                <t t-if="br[1] == 'killed'"><t t-set="klass">killed</t></t>
                                <span t-attf-class="label label-{{klass}}"><t t-esc="br[0]"/></span>
                            </t>
                        </div>
                    </t>
                    <t t-foreach="hosts_data.sorted(key=lambda h:h.name)" t-as="host">
                        <div>
                            <span t-esc="host.name.split('.')[0]"/>
                            <t t-set="host_workers" t-value="host.sudo().get_nb_worker()"/>
                            <t t-if="host.nb_testing == 0"><t t-set="klass">success</t></t>
                            <t t-if="host.nb_testing > 0"><t t-set="klass">info</t></t>
                            <t t-if="host.nb_testing == host_workers"><t t-set="klass">warning</t></t>
                            <t t-if="host.nb_testing > host_workers"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="host.nb_testing"/>/<span t-esc="host_workers"/></span>
                            <t t-esc="host.nb_running"/>
                            <!--
                                Ages in seconds of the three loop timestamps. A timestamp that was
                                never written yields None instead of making strptime fail.
                                Assumes the Datetime fields are read as '%Y-%m-%d %H:%M:%S' strings.
                            -->
                            <t t-set="now" t-value="datetime.datetime.now().timestamp()"/>
                            <t t-set="success_time" t-value="int(now - datetime.datetime.strptime(host.last_success, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_success else None"/>
                            <t t-set="start_time" t-value="int(now - datetime.datetime.strptime(host.last_start_loop, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_start_loop else None"/>
                            <t t-set="end_time" t-value="int(now - datetime.datetime.strptime(host.last_end_loop, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_end_loop else None"/>
                            <t t-set="cron_time" t-value="end_time - start_time if (start_time is not None and end_time is not None) else None"/>

                            <t t-if="success_time is None"><span class="label label-default">n/a</span></t>
                            <t t-if="success_time is not None">
                                <t t-set="klass">success</t>
                                <t t-if="success_time > 30"><t t-set="klass">info</t></t>
                                <t t-if="success_time > 180"><t t-set="klass">danger</t></t>
                                <span t-attf-class="label label-{{klass}}"><span t-esc="success_time"/></span>
                            </t>

                            <t t-if="start_time is None"><span class="label label-default">n/a</span></t>
                            <t t-if="start_time is not None">
                                <t t-set="klass">success</t>
                                <t t-if="start_time > 60*10"><t t-set="klass">info</t></t>
                                <t t-if="start_time > 60*15"><t t-set="klass">danger</t></t>
                                <span t-attf-class="label label-{{klass}}"><span t-esc="start_time"/></span>
                            </t>

                            <t t-if="end_time is None"><span class="label label-default">n/a</span></t>
                            <t t-if="end_time is not None">
                                <t t-set="klass">success</t>
                                <t t-if="end_time > 60*10"><t t-set="klass">info</t></t>
                                <t t-if="end_time > 60*15"><t t-set="klass">danger</t></t>
                                <span t-attf-class="label label-{{klass}}"><span t-esc="end_time"/></span>
                            </t>

                            <t t-if="cron_time is None"><span class="label label-default">n/a</span></t>
                            <t t-if="cron_time is not None">
                                <t t-set="klass">success</t>
                                <t t-if="abs(cron_time) > 10"><t t-set="klass">info</t></t>
                                <t t-if="abs(cron_time) > 60"><t t-set="klass">danger</t></t>
                                <span t-attf-class="label label-{{klass}}"><span t-esc="cron_time"/></span>
                            </t>
                        </div>
                    </t>
                </div>
            </div>
        </div>
    </t>
</template>
</data>
</odoo>

View File

@ -48,13 +48,15 @@ class Test_Cron(common.TransactionCase):
mock_update.assert_called_with(force=False)
mock_create.assert_called_with()
@patch('odoo.addons.runbot.models.host.fqdn')
@patch('odoo.addons.runbot.models.repo.runbot_repo._get_cron_period')
@patch('odoo.addons.runbot.models.repo.runbot_repo._reload_nginx')
@patch('odoo.addons.runbot.models.repo.runbot_repo._scheduler')
@patch('odoo.addons.runbot.models.repo.fqdn')
def test_cron_build(self, mock_fqdn, mock_scheduler, mock_reload, mock_cron_period):
def test_cron_build(self, mock_fqdn, mock_scheduler, mock_reload, mock_cron_period, mock_host_fqdn):
""" test that cron_fetch_and_build do its work """
mock_fqdn.return_value = 'runbotx.foo.com'
hostname = 'runbotx.foo.com'
mock_fqdn.return_value = mock_host_fqdn.return_value = hostname
mock_cron_period.return_value = 2
self.env['ir.config_parameter'].sudo().set_param('runbot.runbot_update_frequency', 1)
self.Repo.create({'name': '/path/somewhere/disabled.git', 'mode': 'disabled'}) # create a disabled
@ -64,3 +66,6 @@ class Test_Cron(common.TransactionCase):
self.assertEqual(None, ret)
mock_scheduler.assert_called()
self.assertTrue(mock_reload.called)
host = self.env['runbot.host'].search([('name', '=', 'runbotx.foo.com')])
self.assertEqual(host.name, hostname, 'A new host should have been created')
self.assertGreater(host.psql_conn_count, 0, 'A least one connection should exist on the current psql instance')

View File

@ -201,9 +201,9 @@ class Test_Repo_Scheduler(common.TransactionCase):
@patch('odoo.addons.runbot.models.build.runbot_build._reap')
@patch('odoo.addons.runbot.models.build.runbot_build._kill')
@patch('odoo.addons.runbot.models.build.runbot_build._schedule')
@patch('odoo.addons.runbot.models.repo.fqdn')
def test_repo_scheduler(self, mock_repo_fqdn, mock_schedule, mock_kill, mock_reap):
mock_repo_fqdn.return_value = 'test_host'
@patch('odoo.addons.runbot.models.host.fqdn')
def test_repo_scheduler(self, mock_fqdn, mock_schedule, mock_kill, mock_reap):
mock_fqdn.return_value = 'test_host'
self.env['ir.config_parameter'].set_param('runbot.runbot_workers', 6)
Build_model = self.env['runbot.build']
builds = []
@ -237,6 +237,7 @@ class Test_Repo_Scheduler(common.TransactionCase):
'local_state': 'pending',
})
builds.append(build)
self.foo_repo._scheduler()
build.invalidate_cache()

View File

@ -0,0 +1,60 @@
<odoo>
<data>
<record id="host_form" model="ir.ui.view">
<field name="name">runbot.host.form</field>
<field name="model">runbot.host</field>
<field name="arch" type="xml">
<form string="Host">
<sheet>
<group>
<field name="name" readonly='1'/>
<field name="disp_name"/>
<field name="active"/>
<field name="last_start_loop" readonly='1'/>
<field name="last_end_loop" readonly='1'/>
<field name="last_success" readonly='1'/>
<field name="assigned_only"/>
<field name="nb_worker"/>
<field name="nb_testing"/>
<field name="nb_running"/>
<field name="last_exception" readonly='1'/>
<field name="exception_count" readonly='1'/>
</group>
</sheet>
<div class="oe_chatter">
<field name="message_follower_ids" widget="mail_followers"/>
<field name="message_ids" widget="mail_thread"/>
</div>
</form>
</field>
</record>
<!-- List view of the build hosts (runbot.host). -->
<record id="view_host_tree" model="ir.ui.view">
    <field name="name">runbot.host.tree</field>
    <field name="model">runbot.host</field>
    <field name="arch" type="xml">
        <tree string="Hosts">
            <field name="name"/>
            <field name="disp_name"/>
            <field name="assigned_only"/>
            <field name="nb_worker"/>
        </tree>
    </field>
</record>
<record id="open_view_host_tree" model="ir.actions.act_window">
<field name="name">Host</field>
<field name="res_model">runbot.host</field>
<field name="view_mode">tree,form</field>
</record>
<menuitem
name="Build Hosts"
id="runbot_menu_host_tree"
parent="runbot_menu_root"
sequence="32"
action="open_view_host_tree"
/>
</data>
</odoo>