[WIP] runbot: monitoring tools

Add a new model runbot.host to keep info and configuration about
hosts (worker servers), like number of workers, reserved or not,
ping times (last start loop, successful iteration, end loop, ...)
and also last errors, number of testing per host, psql connection
count, ...

A new monitoring frontend page is created, similar to glances
but with additional information like hosts states and
last_monitored builds (for nightly)

Later this model will be used for runbot_build host instead of char.

Hosts are automatically created when running _scheduler.
This commit is contained in:
Xavier-Do 2019-08-19 16:03:14 +02:00
parent ef24adad88
commit 02d2cc4528
10 changed files with 297 additions and 21 deletions

View File

@ -16,6 +16,7 @@
'views/repo_views.xml', 'views/repo_views.xml',
'views/branch_views.xml', 'views/branch_views.xml',
'views/build_views.xml', 'views/build_views.xml',
'views/host_views.xml',
'views/config_views.xml', 'views/config_views.xml',
'views/res_config_settings_views.xml', 'views/res_config_settings_views.xml',
'templates/frontend.xml', 'templates/frontend.xml',

View File

@ -273,8 +273,7 @@ class Runbot(Controller):
return request.render("runbot.sticky-dashboard", qctx) return request.render("runbot.sticky-dashboard", qctx)
@route('/runbot/glances', type='http', auth='public', website=True) def _glances_ctx(self):
def glances(self, refresh=None):
repos = request.env['runbot.repo'].search([]) # respect record rules repos = request.env['runbot.repo'].search([]) # respect record rules
default_config_id = request.env.ref('runbot.runbot_build_config_default').id default_config_id = request.env.ref('runbot.runbot_build_config_default').id
query = """ query = """
@ -302,16 +301,45 @@ class Runbot(Controller):
ctx = OrderedDict() ctx = OrderedDict()
for row in cr.fetchall(): for row in cr.fetchall():
ctx.setdefault(row[0], []).append(row[1:]) ctx.setdefault(row[0], []).append(row[1:])
return ctx
@route('/runbot/glances', type='http', auth='public', website=True)
def glances(self, refresh=None):
glances_ctx = self._glances_ctx()
pending = self._pending() pending = self._pending()
qctx = { qctx = {
'refresh': refresh, 'refresh': refresh,
'pending_total': pending[0], 'pending_total': pending[0],
'pending_level': pending[1], 'pending_level': pending[1],
'data': ctx, 'glances_data': glances_ctx,
} }
return request.render("runbot.glances", qctx) return request.render("runbot.glances", qctx)
@route('/runbot/monitoring', type='http', auth='user', website=True)
def monitoring(self, refresh=None):
    """Render the ``runbot.monitoring`` page for logged-in users.

    The rendering context combines the glances data, the pending counters,
    every ``runbot.host`` record visible to the user and, for each sticky
    branch, the most recent running/done build of the monitored config.

    :param refresh: passed through to the template as ``refresh``
    """
    env = request.env
    glances_ctx = self._glances_ctx()
    pending_total, pending_level = self._pending()[:2]
    hosts_data = env['runbot.host'].search([])

    # The monitored config is read from a system parameter (defaults to 1).
    icp = env['ir.config_parameter'].sudo()
    monitored_config_id = int(icp.get_param('runbot.monitored_config_id', 1))

    # DISTINCT ON + "id DESC" keeps only the highest build id per branch.
    env.cr.execute("""SELECT DISTINCT ON (branch_id) branch_id, id FROM runbot_build
WHERE config_id = %s
AND global_state in ('running', 'done')
AND branch_id in (SELECT id FROM runbot_branch where sticky='t')
ORDER BY branch_id ASC, id DESC""", [monitored_config_id])
    build_ids = [row[1] for row in env.cr.fetchall()]
    last_monitored = env['runbot.build'].browse(build_ids)

    return request.render("runbot.monitoring", {
        'refresh': refresh,
        'pending_total': pending_total,
        'pending_level': pending_level,
        'glances_data': glances_ctx,
        'hosts_data': hosts_data,
        'last_monitored': last_monitored  # nightly
    })
@route(['/runbot/branch/<int:branch_id>', '/runbot/branch/<int:branch_id>/page/<int:page>'], website=True, auth='public', type='http') @route(['/runbot/branch/<int:branch_id>', '/runbot/branch/<int:branch_id>/page/<int:page>'], website=True, auth='public', type='http')
def branch_builds(self, branch_id=None, search='', page=1, limit=50, refresh='', **kwargs): def branch_builds(self, branch_id=None, search='', page=1, limit=50, refresh='', **kwargs):
""" list builds of a runbot branch """ """ list builds of a runbot branch """

View File

@ -1,4 +1,4 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from . import repo, branch, build, event, build_dependency, build_config, ir_cron from . import repo, branch, build, event, build_dependency, build_config, ir_cron, host
from . import res_config_settings from . import res_config_settings

60
runbot/models/host.py Normal file
View File

@ -0,0 +1,60 @@
import logging
from odoo import models, fields, api
from ..common import fqdn, local_pgadmin_cursor
_logger = logging.getLogger(__name__)
class RunboHost(models.Model):
    """Worker host (server) on which runbot builds are run.

    Stores per-host configuration (worker count, assigned-only flag) and
    monitoring data (loop timestamps, last exception, PostgreSQL connection
    count). The record of the local machine is looked up, or created on the
    fly, by :meth:`_get_current` using ``fqdn()`` as the host name.
    """
    _name = "runbot.host"
    _order = 'id'
    _inherit = 'mail.thread'

    # NOTE(review): ``unique`` does not appear to be an attribute the ORM
    # enforces; if uniqueness must be guaranteed at database level a
    # ``_sql_constraints`` entry is probably needed -- TODO confirm.
    name = fields.Char('Host name', required=True, unique=True)
    disp_name = fields.Char('Display name')
    active = fields.Boolean('Active', default=True)
    last_start_loop = fields.Datetime('Last start')
    last_end_loop = fields.Datetime('Last end')
    last_success = fields.Datetime('Last success')
    assigned_only = fields.Boolean('Only accept assigned build', default=False)
    nb_worker = fields.Integer('Number of max paralel build', help="0 to use icp value", default=0)
    nb_testing = fields.Integer(compute='_compute_nb')
    nb_running = fields.Integer(compute='_compute_nb')
    last_exception = fields.Char('Last exception')
    exception_count = fields.Integer('Exception count')
    psql_conn_count = fields.Integer('SQL connections count', default=0)

    def _compute_nb(self):
        """Compute ``nb_testing`` and ``nb_running`` for every host in ``self``.

        Builds are counted with a single ``read_group`` on ``runbot.build``,
        grouped by ``host`` (matched on the host name) and ``local_state``.
        """
        groups = self.env['runbot.build'].read_group(
            [('host', 'in', self.mapped('name')), ('local_state', 'in', ('testing', 'running'))],
            ['host', 'local_state'],
            ['host', 'local_state'],
            lazy=False
        )
        # Pre-seed every host so hosts without any matching build get 0.
        count_by_host_state = {host.name: {} for host in self}
        for group in groups:
            count_by_host_state[group['host']][group['local_state']] = group['__count']
        for host in self:
            counts = count_by_host_state[host.name]
            host.nb_testing = counts.get('testing', 0)
            host.nb_running = counts.get('running', 0)

    @api.model
    def create(self, values):
        """Create a host, defaulting ``disp_name`` to ``name`` when not given.

        :param values: dict of field values for the new record
        :return: the created record
        """
        # Work on a copy so the caller's dict is not modified, and use
        # ``get`` so a missing ``name`` is reported by the ORM's own
        # required-field handling instead of a bare KeyError here.
        values = dict(values)
        values.setdefault('disp_name', values.get('name'))
        return super().create(values)

    @api.model
    def _get_current(self):
        """Return the host whose name is ``fqdn()``, creating it if not found."""
        name = fqdn()
        return self.search([('name', '=', name)]) or self.create({'name': name})

    def get_nb_worker(self):
        """Return the host's ``nb_worker``, or the ``runbot.runbot_workers``
        system parameter (default 6) when ``nb_worker`` is 0/unset.
        """
        icp = self.env['ir.config_parameter']
        return self.nb_worker or int(icp.sudo().get_param('runbot.runbot_workers', default=6))

    def set_psql_conn_count(self):
        """Store the sum of ``numbackends`` over ``pg_stat_database`` in
        ``psql_conn_count``, read through ``local_pgadmin_cursor()``.
        """
        self.ensure_one()
        with local_pgadmin_cursor() as local_cr:
            local_cr.execute("SELECT sum(numbackends) FROM pg_stat_database;")
            res = local_cr.fetchone()
        # No row, or a NULL sum, is stored as 0.
        self.psql_conn_count = res[0] if res and res[0] else 0

View File

@ -15,7 +15,7 @@ import shutil
from odoo.exceptions import UserError, ValidationError from odoo.exceptions import UserError, ValidationError
from odoo.tools.misc import DEFAULT_SERVER_DATETIME_FORMAT from odoo.tools.misc import DEFAULT_SERVER_DATETIME_FORMAT
from odoo import models, fields, api from odoo import models, fields, api, registry
from odoo.modules.module import get_module_resource from odoo.modules.module import get_module_resource
from odoo.tools import config from odoo.tools import config
from ..common import fqdn, dt2time, Commit from ..common import fqdn, dt2time, Commit
@ -398,21 +398,20 @@ class runbot_repo(models.Model):
_logger.exception('Fail to update repo %s', repo.name) _logger.exception('Fail to update repo %s', repo.name)
@api.multi @api.multi
def _scheduler(self): def _scheduler(self, host=None):
"""Schedule builds for the repository""" """Schedule builds for the repository"""
ids = self.ids ids = self.ids
if not ids: if not ids:
return return
icp = self.env['ir.config_parameter'] icp = self.env['ir.config_parameter']
host = fqdn() host = host or self.env['runbot.host']._get_current()
settings_workers = int(icp.get_param('runbot.runbot_workers', default=6)) workers = host.get_nb_worker()
workers = int(icp.get_param('%s.workers' % host, default=settings_workers))
running_max = int(icp.get_param('runbot.runbot_running_max', default=75)) running_max = int(icp.get_param('runbot.runbot_running_max', default=75))
assigned_only = int(icp.get_param('%s.assigned_only' % host, default=False)) assigned_only = host.assigned_only
Build = self.env['runbot.build'] Build = self.env['runbot.build']
domain = [('repo_id', 'in', ids)] domain = [('repo_id', 'in', ids)]
domain_host = domain + [('host', '=', host)] domain_host = domain + [('host', '=', host.name)]
# schedule jobs (transitions testing -> running, kill jobs, ...) # schedule jobs (transitions testing -> running, kill jobs, ...)
build_ids = Build.search(domain_host + ['|', ('local_state', 'in', ['testing', 'running']), ('requested_action', 'in', ['wake_up', 'deathrow'])]) build_ids = Build.search(domain_host + ['|', ('local_state', 'in', ['testing', 'running']), ('requested_action', 'in', ['wake_up', 'deathrow'])])
@ -458,7 +457,7 @@ class runbot_repo(models.Model):
) )
RETURNING id""" % where_clause RETURNING id""" % where_clause
self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': fqdn(), 'limit': limit}) self.env.cr.execute(query, {'repo_ids': tuple(ids), 'host': host.name, 'limit': limit})
return self.env.cr.fetchall() return self.env.cr.fetchall()
allocated = allocate_builds("""AND runbot_build.build_type != 'scheduled'""", assignable_slots) allocated = allocate_builds("""AND runbot_build.build_type != 'scheduled'""", assignable_slots)
@ -561,6 +560,10 @@ class runbot_repo(models.Model):
""" """
if hostname != fqdn(): if hostname != fqdn():
return 'Not for me' return 'Not for me'
host = self.env['runbot.host']._get_current()
host.set_psql_conn_count()
host.last_start_loop = fields.Datetime.now()
self.env.cr.commit()
start_time = time.time() start_time = time.time()
# 1. source cleanup # 1. source cleanup
# -> Remove sources when no build is using them # -> Remove sources when no build is using them
@ -576,7 +579,8 @@ class runbot_repo(models.Model):
while time.time() - start_time < timeout: while time.time() - start_time < timeout:
repos = self.search([('mode', '!=', 'disabled')]) repos = self.search([('mode', '!=', 'disabled')])
try: try:
repos._scheduler() repos._scheduler(host)
host.last_success = fields.Datetime.now()
self.env.cr.commit() self.env.cr.commit()
self.env.reset() self.env.reset()
self = self.env()[self._name] self = self.env()[self._name]
@ -587,6 +591,21 @@ class runbot_repo(models.Model):
self.env.cr.rollback() self.env.cr.rollback()
self.env.reset() self.env.reset()
time.sleep(random.uniform(0, 1)) time.sleep(random.uniform(0, 1))
except Exception as e:
with registry(self._cr.dbname).cursor() as cr: # user another cursor since transaction will be rollbacked
message = str(e)
chost = host.with_env(self.env(cr=cr))
if chost.last_exception == message:
chost.exception_count += 1
else:
chost.with_env(self.env(cr=cr)).last_exception = str(e)
chost.exception_count = 1
raise
if host.last_exception:
host.last_exception = ""
host.exception_count = 0
host.last_end_loop = fields.Datetime.now()
def _source_cleanup(self): def _source_cleanup(self):
try: try:

View File

@ -17,3 +17,6 @@ access_runbot_build_config_manager,runbot_build_config_manager,runbot.model_runb
access_runbot_build_config_step_order_user,runbot_build_config_step_order_user,runbot.model_runbot_build_config_step_order,group_user,1,0,0,0 access_runbot_build_config_step_order_user,runbot_build_config_step_order_user,runbot.model_runbot_build_config_step_order,group_user,1,0,0,0
access_runbot_build_config_step_order_manager,runbot_build_config_step_order_manager,runbot.model_runbot_build_config_step_order,runbot.group_build_config_user,1,1,1,1 access_runbot_build_config_step_order_manager,runbot_build_config_step_order_manager,runbot.model_runbot_build_config_step_order,runbot.group_build_config_user,1,1,1,1
access_runbot_host_user,runbot_host_user,runbot.model_runbot_host,group_user,1,0,0,0
access_runbot_host_manager,runbot_host_manager,runbot.model_runbot_host,runbot.group_runbot_admin,1,1,1,1

1 id name model_id:id group_id:id perm_read perm_write perm_create perm_unlink
17 access_runbot_host_user runbot_host_user runbot.model_runbot_host group_user 1 0 0 0
18 access_runbot_host_manager runbot_host_manager runbot.model_runbot_host runbot.group_runbot_admin 1 1 1 1
19
20
21
22

View File

@ -76,7 +76,7 @@
</t> </t>
</template> </template>
<template id="runbot.glances"> <template id="runbot.glances">
<t t-call='website.layout'> <t t-call='portal.frontend_layout'>
<t t-set="head"> <t t-set="head">
<t t-if="refresh"> <t t-if="refresh">
<meta http-equiv="refresh" t-att-content="refresh"/> <meta http-equiv="refresh" t-att-content="refresh"/>
@ -98,10 +98,10 @@
<div> <div>
<span t-attf-class="label label-{{pending_level}}">Pending: <t t-esc="pending_total"/></span> <span t-attf-class="label label-{{pending_level}}">Pending: <t t-esc="pending_total"/></span>
</div> </div>
<t t-foreach="data.keys()" t-as="repo"> <t t-foreach="glances_data.keys()" t-as="repo">
<h4><t t-esc="repo"/> <h4><t t-esc="repo"/>
</h4> </h4>
<t t-foreach="data[repo]" t-as="br"> <t t-foreach="glances_data[repo]" t-as="br">
<t t-if="br[1] == 'ko'"><t t-set="klass">danger</t></t> <t t-if="br[1] == 'ko'"><t t-set="klass">danger</t></t>
<t t-if="br[1] == 'warn'"><t t-set="klass">warning</t></t> <t t-if="br[1] == 'warn'"><t t-set="klass">warning</t></t>
<t t-if="br[1] == 'ok'"><t t-set="klass">success</t></t> <t t-if="br[1] == 'ok'"><t t-set="klass">success</t></t>
@ -114,5 +114,104 @@
</div> </div>
</t> </t>
</template> </template>
<!-- Primary copy of portal.frontend_layout with its <header> element removed. -->
<template id="frontend_no_nav" inherit_id="portal.frontend_layout" primary="True">
    <xpath expr="//header" position="replace"/>
</template>
<template id="runbot.monitoring">
    <!--
        Monitoring page. Expects in the rendering context: refresh,
        pending_total, pending_level, glances_data (mapping repo -> rows)
        and hosts_data (runbot.host recordset).

        Hosts whose loop timestamps are not set yet are shown as "n/a" with
        the neutral "killed" label instead of failing on strptime, and the
        global load ratio is guarded against a zero worker total.
    -->
    <t t-call="runbot.frontend_no_nav">
        <t t-set="head">
            <t t-if="refresh">
                <meta http-equiv="refresh" t-att-content="refresh"/>
            </t>
            <style>
                .label-killed {
                    background-color: #aaa;
                }
                h4 {
                    padding: 3px 0;
                    border-bottom: 1px solid grey;
                }
                .r-mb02 { margin-bottom: 0.2em; }
            </style>
        </t>
        <div class="container-fluid">
            <div class="row">
                <div class="col-md-12">
                    <div>
                        <span t-attf-class="label label-{{pending_level}}">Pending: <t t-esc="pending_total"/></span>
                        <t t-set="testing">0</t>
                        <t t-set="workers">0</t>
                        <t t-foreach="hosts_data.sorted(key=lambda h:h.name)" t-as="host">
                            <t t-set="testing" t-value="int(testing) + host.nb_testing"/>
                            <t t-set="workers" t-value="int(workers) + host.sudo().get_nb_worker()"/>
                        </t>
                        <t t-set="load" t-value="int(testing) / int(workers) if int(workers) else 0"/>
                        <t t-set="klass">success</t>
                        <t t-if="load > 0"><t t-set="klass">info</t></t>
                        <t t-if="load > 0.75"><t t-set="klass">warning</t></t>
                        <t t-if="load >= 1"><t t-set="klass">danger</t></t>
                        <span t-attf-class="label label-{{klass}}">Testing: <t t-esc="testing"/>/<t t-esc="workers"/></span>
                    </div>
                    <t t-foreach="glances_data.keys()" t-as="repo">
                        <div>
                            <span t-esc="repo"/>
                            <t t-foreach="glances_data[repo]" t-as="br">
                                <t t-if="br[1] == 'ko'"><t t-set="klass">danger</t></t>
                                <t t-if="br[1] == 'warn'"><t t-set="klass">warning</t></t>
                                <t t-if="br[1] == 'ok'"><t t-set="klass">success</t></t>
                                <t t-if="br[1] == 'killed'"><t t-set="klass">killed</t></t>
                                <span t-attf-class="label label-{{klass}}"><t t-esc="br[0]"/></span>
                            </t>
                        </div>
                    </t>
                    <t t-foreach="hosts_data.sorted(key=lambda h:h.name)" t-as="host">
                        <div>
                            <span t-esc="host.name.split('.')[0]"/>
                            <t t-if="host.nb_testing == 0"><t t-set="klass">success</t></t>
                            <t t-if="host.nb_testing > 0"><t t-set="klass">info</t></t>
                            <t t-if="host.nb_testing == host.sudo().get_nb_worker()"><t t-set="klass">warning</t></t>
                            <t t-if="host.nb_testing > host.sudo().get_nb_worker()"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="host.nb_testing"/>/<span t-esc="host.sudo().get_nb_worker()"/></span>
                            <t t-esc="host.nb_running"/>
                            <t t-set="succes_time" t-value="int(datetime.datetime.now().timestamp() - datetime.datetime.strptime(host.last_success, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_success else None"/>
                            <t t-set="start_time" t-value="int(datetime.datetime.now().timestamp() - datetime.datetime.strptime(host.last_start_loop, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_start_loop else None"/>
                            <t t-set="end_time" t-value="int(datetime.datetime.now().timestamp() - datetime.datetime.strptime(host.last_end_loop, '%Y-%m-%d %H:%M:%S').timestamp()) if host.last_end_loop else None"/>
                            <t t-set="klass">success</t>
                            <t t-if="succes_time is None"><t t-set="klass">killed</t></t>
                            <t t-if="succes_time is not None and succes_time > 30"><t t-set="klass">info</t></t>
                            <t t-if="succes_time is not None and succes_time > 180"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="'n/a' if succes_time is None else succes_time"/></span>
                            <t t-set="klass">success</t>
                            <t t-if="start_time is None"><t t-set="klass">killed</t></t>
                            <t t-if="start_time is not None and start_time > 60*10"><t t-set="klass">info</t></t>
                            <t t-if="start_time is not None and start_time > 60*15"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="'n/a' if start_time is None else start_time"/></span>
                            <t t-set="klass">success</t>
                            <t t-if="end_time is None"><t t-set="klass">killed</t></t>
                            <t t-if="end_time is not None and end_time > 60*10"><t t-set="klass">info</t></t>
                            <t t-if="end_time is not None and end_time > 60*15"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="'n/a' if end_time is None else end_time"/></span>
                            <t t-set="cron_time" t-value="end_time - start_time if end_time is not None and start_time is not None else None"/>
                            <t t-set="klass">success</t>
                            <t t-if="cron_time is None"><t t-set="klass">killed</t></t>
                            <t t-if="cron_time is not None and abs(cron_time) > 10"><t t-set="klass">info</t></t>
                            <t t-if="cron_time is not None and abs(cron_time) > 60"><t t-set="klass">danger</t></t>
                            <span t-attf-class="label label-{{klass}}"><span t-esc="'n/a' if cron_time is None else cron_time"/></span>
                        </div>
                    </t>
                </div>
            </div>
        </div>
    </t>
</template>
</data> </data>
</odoo> </odoo>

View File

@ -48,13 +48,15 @@ class Test_Cron(common.TransactionCase):
mock_update.assert_called_with(force=False) mock_update.assert_called_with(force=False)
mock_create.assert_called_with() mock_create.assert_called_with()
@patch('odoo.addons.runbot.models.host.fqdn')
@patch('odoo.addons.runbot.models.repo.runbot_repo._get_cron_period') @patch('odoo.addons.runbot.models.repo.runbot_repo._get_cron_period')
@patch('odoo.addons.runbot.models.repo.runbot_repo._reload_nginx') @patch('odoo.addons.runbot.models.repo.runbot_repo._reload_nginx')
@patch('odoo.addons.runbot.models.repo.runbot_repo._scheduler') @patch('odoo.addons.runbot.models.repo.runbot_repo._scheduler')
@patch('odoo.addons.runbot.models.repo.fqdn') @patch('odoo.addons.runbot.models.repo.fqdn')
def test_cron_build(self, mock_fqdn, mock_scheduler, mock_reload, mock_cron_period): def test_cron_build(self, mock_fqdn, mock_scheduler, mock_reload, mock_cron_period, mock_host_fqdn):
""" test that cron_fetch_and_build do its work """ """ test that cron_fetch_and_build do its work """
mock_fqdn.return_value = 'runbotx.foo.com' hostname = 'runbotx.foo.com'
mock_fqdn.return_value = mock_host_fqdn.return_value = hostname
mock_cron_period.return_value = 2 mock_cron_period.return_value = 2
self.env['ir.config_parameter'].sudo().set_param('runbot.runbot_update_frequency', 1) self.env['ir.config_parameter'].sudo().set_param('runbot.runbot_update_frequency', 1)
self.Repo.create({'name': '/path/somewhere/disabled.git', 'mode': 'disabled'}) # create a disabled self.Repo.create({'name': '/path/somewhere/disabled.git', 'mode': 'disabled'}) # create a disabled
@ -64,3 +66,6 @@ class Test_Cron(common.TransactionCase):
self.assertEqual(None, ret) self.assertEqual(None, ret)
mock_scheduler.assert_called() mock_scheduler.assert_called()
self.assertTrue(mock_reload.called) self.assertTrue(mock_reload.called)
host = self.env['runbot.host'].search([('name', '=', 'runbotx.foo.com')])
self.assertEqual(host.name, hostname, 'A new host should have been created')
self.assertGreater(host.psql_conn_count, 0, 'A least one connection should exist on the current psql instance')

View File

@ -201,9 +201,9 @@ class Test_Repo_Scheduler(common.TransactionCase):
@patch('odoo.addons.runbot.models.build.runbot_build._reap') @patch('odoo.addons.runbot.models.build.runbot_build._reap')
@patch('odoo.addons.runbot.models.build.runbot_build._kill') @patch('odoo.addons.runbot.models.build.runbot_build._kill')
@patch('odoo.addons.runbot.models.build.runbot_build._schedule') @patch('odoo.addons.runbot.models.build.runbot_build._schedule')
@patch('odoo.addons.runbot.models.repo.fqdn') @patch('odoo.addons.runbot.models.host.fqdn')
def test_repo_scheduler(self, mock_repo_fqdn, mock_schedule, mock_kill, mock_reap): def test_repo_scheduler(self, mock_fqdn, mock_schedule, mock_kill, mock_reap):
mock_repo_fqdn.return_value = 'test_host' mock_fqdn.return_value = 'test_host'
self.env['ir.config_parameter'].set_param('runbot.runbot_workers', 6) self.env['ir.config_parameter'].set_param('runbot.runbot_workers', 6)
Build_model = self.env['runbot.build'] Build_model = self.env['runbot.build']
builds = [] builds = []
@ -237,6 +237,7 @@ class Test_Repo_Scheduler(common.TransactionCase):
'local_state': 'pending', 'local_state': 'pending',
}) })
builds.append(build) builds.append(build)
self.foo_repo._scheduler() self.foo_repo._scheduler()
build.invalidate_cache() build.invalidate_cache()

View File

@ -0,0 +1,60 @@
<odoo>
<data>
<!-- Form view of a runbot.host record, with the mail chatter below the sheet. -->
<record id="host_form" model="ir.ui.view">
    <field name="name">runbot.host.form</field>
    <field name="model">runbot.host</field>
    <field name="arch" type="xml">
        <form string="Host">
            <sheet>
                <group>
                    <field name="name" readonly="1"/>
                    <field name="disp_name"/>
                    <field name="active"/>
                    <field name="last_start_loop" readonly="1"/>
                    <field name="last_end_loop" readonly="1"/>
                    <field name="last_success" readonly="1"/>
                    <field name="assigned_only"/>
                    <field name="nb_worker"/>
                    <field name="nb_testing"/>
                    <field name="nb_running"/>
                    <field name="last_exception" readonly="1"/>
                    <field name="exception_count" readonly="1"/>
                </group>
            </sheet>
            <div class="oe_chatter">
                <field name="message_follower_ids" widget="mail_followers"/>
                <field name="message_ids" widget="mail_thread"/>
            </div>
        </form>
    </field>
</record>
<!-- List view of runbot.host records (title fixed: it lists hosts, not builds). -->
<record id="view_host_tree" model="ir.ui.view">
    <field name="name">runbot.host.tree</field>
    <field name="model">runbot.host</field>
    <field name="arch" type="xml">
        <tree string="Hosts">
            <field name="name"/>
            <field name="disp_name"/>
            <field name="assigned_only"/>
            <field name="nb_worker"/>
        </tree>
    </field>
</record>
<!-- Window action opening runbot.host records in list then form view. -->
<record id="open_view_host_tree" model="ir.actions.act_window">
    <field name="res_model">runbot.host</field>
    <field name="view_mode">tree,form</field>
    <field name="name">Host</field>
</record>
<!-- "Build Hosts" menu entry under runbot_menu_root, bound to open_view_host_tree. -->
<menuitem id="runbot_menu_host_tree"
          name="Build Hosts"
          action="open_view_host_tree"
          parent="runbot_menu_root"
          sequence="32"/>
</data>
</odoo>