# Part of Odoo. See LICENSE file for full copyright and licensing details.

import base64
import html
import io
import json
import logging
import re
import tarfile
from urllib.parse import urljoin, urlparse

import requests

from odoo import api, fields, models
from odoo.addons.iap.tools.iap_tools import iap_jsonrpc
from odoo.exceptions import AccessError
from odoo.tools import LazyTranslate

_lt = LazyTranslate(__name__)

DEFAULT_WSS_ENDPOINT = 'https://iap-scraper.odoo.com/'
GET_RESULT_TIMEOUT_SECONDS = 3600  # 1 hour
STATUS_MESSAGES = {
    'success': _lt("Success"),
    'processing': _lt("Processing"),
    'waiting': _lt("Waiting for the server to process the request"),
    'done': _lt("Done, website generated"),
    'error_maintenance': _lt("Server is currently under maintenance. Please retry later"),
    'error_internal': _lt("An error occurred"),
    'error_invalid_url': _lt("Invalid url"),
    'error_banned_url': _lt("Banned url"),
    'error_invalid_dbuuid': _lt("Invalid dbuuid"),
    'error_too_many_pages': _lt("The request asks for too many pages"),
    'error_unsupported_version': _lt("Version is unsupported"),
    'error_invalid_token': _lt("Invalid token"),
    'error_concurrent_request': _lt("Number of concurrent requests exceeded"),
    'error_allowed_request_exhausted': _lt("Number of allowed requests exhausted"),
    'error_invalid_import_products': _lt("Invalid import products"),
    'error_invalid_request_uuid': _lt("Could not fetch result, invalid output uuid or result expired"),
    'error_request_still_processing': _lt("Request is still processing, result not available yet"),
    'error_attachment_not_found': _lt("Attachment not found"),
    'error_website_not_supported': _lt("Website not supported"),
    'error_website_blocked': _lt("Website blocked or unreachable"),
}

logger = logging.getLogger(__name__)


class WebsiteGeneratorRequest(models.Model):
    _name = 'website_generator.request'
    _description = "Website Generator Request"

    target_url = fields.Char(string="URL to scrape", required=True)
    additional_urls = fields.Char(string="Additional URLs")
    page_count = fields.Integer(string="Number of pages")
    uuid = fields.Char(string="Output UUID generated from Website Scraper Server")
    status = fields.Char(string="Status", default='waiting')
    status_message = fields.Char(string="Status Message", compute='_compute_status_message')
    version = fields.Char(string="Version", default='1.0.0')
    website_id = fields.Many2one('website', string="Website", ondelete='cascade')
    notified = fields.Boolean(string="Notified", default=False)
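
    # Usage sketch (assumption, not part of the module): the scraper endpoint
    # defaults to DEFAULT_WSS_ENDPOINT but can be pointed at another server
    # through the `website_scraper_endpoint` system parameter, e.g.:
    #
    #   env['ir.config_parameter'].sudo().set_param(
    #       'website_scraper_endpoint', 'https://iap-scraper.test.odoo.com/')
    #
    # The staging URL above is made up for illustration only.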

    @api.model_create_multi
    def create(self, vals_list):
        wg_requests = super().create(vals_list)
        for req in wg_requests:
            # If there is already a uuid, it means the request was already
            # created via the odoo.com start trial.
            if not req.uuid:
                ws_endpoint = self.env['ir.config_parameter'].sudo().get_param('website_scraper_endpoint', DEFAULT_WSS_ENDPOINT)
                url = urljoin(ws_endpoint, f'/website_scraper/{req.version}/scrape')
                response = iap_jsonrpc(url, params=req._get_call_params())
                if response.get('status') == 'accepted':
                    req.uuid = response['uuid']
                    req.status = 'waiting'
                else:
                    req.status = response.get('status', 'error_internal')
                    logger.warning("Error calling WS server: %s", req.status)
        self.env.ref("website_generator.cron_get_result").toggle(model=self._name, domain=[])
        return wg_requests

    def write(self, values):
        res = super().write(values)
        pending_requests = self.search([
            ('status', 'in', ['waiting', 'error_request_still_processing', 'error_maintenance']),
        ])
        if not pending_requests:
            self.env.ref("website_generator.cron_get_result").active = False
            logger.info("Website Generator: No more pending requests, disabling 'cron_get_result' cron")
        return res

    @api.depends('status')
    def _compute_status_message(self):
        for record in self:
            record.status_message = self.env._(
                STATUS_MESSAGES.get(record.status, STATUS_MESSAGES['error_internal']))  # pylint: disable=gettext-variable

    def _get_call_params(self):
        ICP = self.env['ir.config_parameter'].sudo()
        params = {
            'url': self.target_url,
            'additional_urls': self.additional_urls,
            'token': ICP.get_param('website_generator.token', None),
            'dbuuid': ICP.get_param('database.uuid'),
        }
        if self.page_count:
            params['page_count'] = self.page_count
        return params

    @api.model
    def get_result_waiting_requests(self):
        """ This method is called by the CRON job which is started by the
        webhook (``/result_ready``).
        """
        ready_requests = self.search([
            ('status', 'in', ['waiting', 'error_request_still_processing', 'error_maintenance']),
        ])
        for request in ready_requests:
            request._call_server_get_result()
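
    # Illustrative sketch (assumed values, not part of the module): creating a
    # request and forcing one polling pass instead of waiting for the
    # `/result_ready` webhook to enable the `cron_get_result` cron:
    #
    #   request = env['website_generator.request'].create({
    #       'target_url': 'https://www.example.com',
    #       'page_count': 5,
    #   })
    #   env['website_generator.request'].get_result_waiting_requests()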

    def _call_server_get_result(self):
        # Don't inline this method in `get_result_waiting_requests()`, it's
        # needed for ease of development (overridden in custom dev module)
        logger.info("Website Generator: Getting result for request uuid: %s", self.uuid)
        ICP = self.env['ir.config_parameter'].sudo()
        data = {
            'uuid': self.uuid,
            'dbuuid': ICP.get_param('database.uuid'),
            # This is called by a CRON, we are not in a website context, so
            # this will always shortcut to ICP, so we can at least have a
            # domain to go back to.
            'db_url': self.get_base_url(),
        }
        ws_endpoint = ICP.get_param('website_scraper_endpoint', DEFAULT_WSS_ENDPOINT)
        url = urljoin(ws_endpoint, f'/website_scraper/{self.version}/get_result')
        response = requests.get(url, params=data, timeout=GET_RESULT_TIMEOUT_SECONDS)
        # `/get_result` is not protected by token
        data['token'] = ICP.get_param('website_generator.token', None)

        # Check the response
        try:
            if response.status_code != 200:
                self.status = 'error_internal'
                logger.warning("Error calling WS server: Status code %s", response.status_code)
                return
            # TODO: Find a better way to check if the response failed or not
            elif response.headers.get('Content-Type', '') != 'application/x-tar':
                # An error occurred, getting the status from the response.
                # On top of real errors, some "normal" statuses could be
                # returned here, such as `error_request_still_processing` if
                # the scraping is not yet finished.
                self.status = response.json().get('status', 'error_internal')
                return
        except Exception:
            # If the response is not JSON, it means the request was successful
            pass

        try:
            tar_gz_file = io.BytesIO(response.content)
            with tarfile.open(fileobj=tar_gz_file, mode='r:gz') as tar:
                website, _ = self._generate_site(tar)
            self.status = 'done'

            # Send email to the client.
            website = self.env['website'].get_current_website()
            mail_template = self.env.ref('website_generator.email_template_website_scrapped')
            email_values = {'email_to': self.env.company.email_formatted, 'website_url': website.get_base_url()}
            mail_template.with_context(email_values).send_mail(
                website.id,
                force_send=True,
                email_values=None,
            )

            # Report OK to IAP (success)
            logger.info("Website Generator: Reporting OK for request uuid: %s", self.uuid)
            url = urljoin(ws_endpoint, f'/website_scraper/{self.version}/report_ok')
            self._report_to_iap(url, data)
        except Exception as e:
            # Defensive programming: if necessary info is missing, stop and warn IAP
            # (should not happen, but just in case of future changes in the WS server).
            # Rollback the transaction to avoid the creation of the website.
            self.env.cr.rollback()
            self.status = 'error_internal'
            logger.exception("Error building the website: %s", e)
            # Report KO to IAP (useful for spotting critical errors)
            logger.info("Website Generator: Reporting KO for request uuid: %s", self.uuid)
            url = urljoin(ws_endpoint, f'/website_scraper/{self.version}/report_ko')
            self._report_to_iap(url, data)

    def _report_to_iap(self, url, data):
        try:
            resp = iap_jsonrpc(url, params=data)
            if resp.get('status') != 'ok':
                logger.warning("Error reporting to WS server: %s", resp.get('status'))
        except AccessError as e:
            logger.warning("Error reporting to WS server: %s", e)

    def _generate_site(self, tar):
        odoo_blocks = self._load_input(tar)
        odoo_blocks['direct_html_replacements_mapping'] = {}
        odoo_blocks['regex_html_replacements_mapping'] = {}
        website = self._get_website(odoo_blocks, tar)
        # Generate the images attachments (modifies odoo_blocks in place)
        self._save_images_as_attachments(odoo_blocks, tar)
        # Create redirects, modifies odoo_blocks in place
        self._create_model_records(tar, odoo_blocks)
        self._apply_all_html_replacements(odoo_blocks)
        self._generate_pages(website, odoo_blocks)
        return website, odoo_blocks

    def _apply_all_html_replacements(self, odoo_blocks):
        direct_html_replacements_mapping = odoo_blocks.get('direct_html_replacements_mapping', {})
        regex_html_replacements_mapping = odoo_blocks.get('regex_html_replacements_mapping', {})
        sorted_original_html = sorted(direct_html_replacements_mapping.keys(), key=len, reverse=True)

        homepage = odoo_blocks['homepage']
        homepage['body_html'] = self._apply_html_replacements(
            homepage.get('body_html', []), sorted_original_html,
            direct_html_replacements_mapping, regex_html_replacements_mapping)
        footer = homepage.get('footer', [])
        if footer:
            homepage['footer'] = self._apply_html_replacements(
                footer, sorted_original_html,
                direct_html_replacements_mapping, regex_html_replacements_mapping)
        header_buttons = homepage.get('header', {}).get('buttons', [])
        for button in header_buttons:
            if button.get('href') in direct_html_replacements_mapping:
                button['href'] = direct_html_replacements_mapping[button['href']]
        # Update the html urls for all pages
        for page_name, page_dict in odoo_blocks.get('pages', {}).items():
            odoo_blocks['pages'][page_name]['body_html'] = self._apply_html_replacements(
                page_dict.get('body_html', []), sorted_original_html,
                direct_html_replacements_mapping, regex_html_replacements_mapping)
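
    # Shape sketch of the two replacement mappings used above (all values are
    # hypothetical): direct replacements map escaped original URLs to the new
    # attachment `image_src`, regex replacements map an <img ... data-ws_id=...>
    # pattern to a rebuilt customized <img> tag:
    #
    #   odoo_blocks['direct_html_replacements_mapping'] = {
    #       'https://www.example.com/img/hero.png': '/web/image/42-abc123/hero.png',
    #   }
    #   odoo_blocks['regex_html_replacements_mapping'] = {
    #       r'<img[^>]*data-ws_id\s*=\s*["\']?7["\']?[^>]*>':
    #           '<img style="" class="" src="/web/image/43-def456/customized_hero_7.png"/>',
    #   }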

    def _create_model_records(self, tar, odoo_blocks):
        # Each override will call super() and create its model records, as
        # well as any redirects it needs.
        pass

    def _load_input(self, tar):
        # Don't inline this method in `_generate_site()`, it's needed for ease
        # of development (overridden in custom dev module)
        return json.load(tar.extractfile('out.json'))

    def _generate_pages(self, website, odoo_blocks):
        # Create pages
        for page_url, page_data in odoo_blocks.get('pages', {}).items():
            if 'body_html' in page_data:
                # Create page
                new_page_info = website.with_context(website_id=website.id).new_page(page_data['name'])
                new_page = self.env['website.page'].browse(new_page_info['page_id'])
                # Force the url to the one provided, don't use the slugified one
                new_page.url = page_url
                new_page.is_published = True
                # Create page content
                new_page._construct_page(page_data)

        # Remove the default homepage
        homepage_url = website.homepage_url or '/'
        self.env['website.page'].search([('website_id', '=', website.id), ('url', '=', homepage_url)]).unlink()
        homepage_info = website.with_context(website_id=website.id).new_page('Home')
        homepage = self.env['website.page'].browse(homepage_info['page_id'])
        homepage.write({
            'url': '/',
            'is_published': True,
        })
        # Create home page content
        homepage._construct_homepage(odoo_blocks['homepage'])

    def _get_website(self, odoo_blocks, tar):
        website = self.env['website'].get_current_website()
        self.write({'website_id': website.id})
        website_info = odoo_blocks.get('website')
        if not website_info:
            raise ValueError("Website info not found in the input")
        homepage_url = odoo_blocks.get('homepage', {}).get('url')
        if not homepage_url:
            raise ValueError("Homepage url not found in the input")
        website_name = urlparse(homepage_url).netloc.removeprefix('www.')
        website_values = {'name': website_name, **website_info.get('social_media_links', {})}
        # Add logo
        logo_filename = website_info.get('logo')
        if logo_filename:
            image = self._get_image_data(tar, logo_filename)
            if image:
                website_values['logo'] = base64.b64encode(image).decode()
        website.update(website_values)
        return website

    def _save_images_as_attachments(self, odoo_blocks, tar):
        def populate_image_customization_mapping(customized_image_mappings, ws_id, customization):
            # Replace the image with the cropped one
            url = customization.get('url')
            data_mimetype = customization['data_mimetype']
            # Check that an attachment was created.
            if not customized_attachments_url_src.get(ws_id) or not attachments_url_src.get(url):
                return customized_image_mappings
            attributes = {
                'src': customized_attachments_url_src[ws_id].image_src,
                'data-original-id': attachments_url_src[url].id,
                'data-original-src': attachments_url_src[url].image_src,
                'data-mimetype': data_mimetype,
                'data-mimetype-before-conversion': data_mimetype,
            }
            # Apply the cropping attributes
            cropping_dimensions = customization.get('cropping_coords', {})
            if cropping_dimensions:
                attributes.update({
                    'data-x': cropping_dimensions['x'],
                    'data-y': cropping_dimensions['y'],
                    'data-width': cropping_dimensions['width'],
                    'data-height': cropping_dimensions['height'],
                    'data-scale-x': 1,
                    'data-scale-y': 1,
                    'data-aspect-ratio': '0/0',
                })
            color_filter = customization.get('filter', {})
            if color_filter:
                rgba = f'rgba({int(color_filter["coords"][0] * 255)}, {int(color_filter["coords"][1] * 255)}, {int(color_filter["coords"][2] * 255)}, {color_filter["alpha"]})'
                attributes.update({
                    'data-gl-filter': 'custom',
                    'data-filter-options': f'{{"filterColor":"{rgba}"}}',
                })
            if url and (cropping_dimensions or color_filter) and ws_id:
                pattern = rf'<img[^>]*data-ws_id\s*=\s*["\']?{ws_id}["\']?[^>]*>'
                # The 'style="" class=""' is needed and will be replaced by the
                # class and style attributes of the original image.
                customized_img_string = '<img style="" class="" ' + ' '.join(
                    f'{key}="{value}"' for key, value in attributes.items()) + '/>'
                customized_image_mappings[pattern] = customized_img_string
            return customized_image_mappings

        all_images = odoo_blocks['website'].get('all_images', {})
        # Create attachments for all images (uncropped)
        attachments_url_src = {}
        for img_url, img_name in all_images.items():
            attachments_url_src = self.try_create_image_attachment(img_name, img_url, attachments_url_src, tar)
        odoo_blocks['direct_html_replacements_mapping'].update(
            {html.escape(k): v.image_src for k, v in attachments_url_src.items()})

        # Create attachments for all images (cropped)
        customized_attachments_url_src = {}
        customized_image_mappings = {}
        for page_dict in [odoo_blocks['homepage']] + list(odoo_blocks.get('pages', {}).values()):
            customized_images = page_dict.get('images_to_customize', {})
            for ws_id, image_customizations in customized_images.items():
                img_name = self._get_custom_image_name(all_images, ws_id, image_customizations)
                # Note: we give the 'ws_id' as the image_url because we may have
                # multiple images with the same url but cropped differently
                # (where the image_url is the downloaded image url from the
                # original website).
                customized_attachments_url_src = self.try_create_image_attachment(img_name, ws_id, customized_attachments_url_src, tar)
                customized_image_mappings = populate_image_customization_mapping(customized_image_mappings, ws_id, image_customizations)
        odoo_blocks['regex_html_replacements_mapping'].update(customized_image_mappings)

    def try_create_image_attachment(self, img_name, img_url, attachments_url_src, tar):
        try:
            # Read from tar
            image_data = self._get_image_data(tar, img_name)
            if not image_data:
                return attachments_url_src
            # Create a new attachment
            att = self.env['ir.attachment'].create({
                'name': img_name,
                'raw': image_data,
                'public': True,
                'res_model': 'ir.ui.view',
                'res_id': 0,  # shared between website's pages
            })
            if att and att.image_src:
                attachments_url_src[img_url] = att
        except (AttributeError, TypeError, ValueError) as e:
            # Defensive programming: skip the image if it's invalid
            # (image extension not supported, corrupted metadata, etc.)
            logger.warning("Error attaching image %r : %s", img_url, e)
        return attachments_url_src
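
    # Minimal usage sketch (made-up data): building the url -> attachment map;
    # images missing from the tarball or rejected by ir.attachment are skipped
    # and the mapping is returned unchanged:
    #
    #   attachments = {}
    #   for url, name in {'https://www.example.com/a.png': 'a.png'}.items():
    #       attachments = request.try_create_image_attachment(name, url, attachments, tar)
    #   # attachments['https://www.example.com/a.png'].image_src -> '/web/image/...'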
logger.warning("Error attaching image %r : %s", img_url, e) return attachments_url_src def _get_custom_image_name(self, all_images, ws_id, image_customizations): original_img_url = image_customizations.get('url') original_img_name = all_images.get(original_img_url, '') # We keep the same mimetype as the original image supported_mimetypes = { 'png': 'image/png', 'jpg': 'image/jpeg', 'webp': 'image/webp', 'svg': 'image/svg+xml', 'gif': 'image/gif', } # Split the image name and the image extension based on the last dot original_img_name_base, separator, original_img_name_extension = original_img_name.rpartition('.') image_extension = original_img_name_extension.lower() if separator else '' if not image_extension or image_extension not in supported_mimetypes: image_extension = 'png' image_customizations['data_mimetype'] = supported_mimetypes[image_extension] if original_img_name: img_name = f'customized_{original_img_name_base}_{ws_id}.{image_extension}' image_customizations['filename'] = img_name return img_name return '' def _apply_html_replacements(self, body_html, sorted_list_replacement_mapping, direct_replacement_mapping, regex_replacement_mapping): new_block_list = [] for block_html in body_html: page_html = self._replace_in_string(block_html, sorted_list_replacement_mapping, direct_replacement_mapping) page_html = self._replace_in_string_regex(page_html, regex_replacement_mapping) new_block_list.append(page_html) return new_block_list def _find_or_create(self, model, domain, vals): record = self.env[model].search(domain, limit=1) if not record: record = self.env[model].create(vals) return record def _get_image_data(self, tar, image_name): if not image_name: return None try: image_data = tar.extractfile('images/' + image_name).read() return image_data except (KeyError, AttributeError) as e: logger.warning("Image %s not found : %s", image_name, e) return None def _get_image_info(self, tar, images, image_file_mappings): all_images_info = [] for image in images: image_filename = image_file_mappings.get(image, '') image_data = self._get_image_data(tar, image_filename) if image_data: all_images_info.append({ 'name': image_filename, 'raw': image_data, 'base64': base64.b64encode(image_data).decode(), }) return all_images_info @staticmethod def _replace_in_string_regex(page_html, regex_replacement_mapping): # Since we need to have a mapping of the regex to the replacement # and not a mapping of the matched string to the replacement, # we have to do the sub on each iteration, rather than one group sub. re_class_patern = re.compile(r'class="[^"]*"') re_style_patern = re.compile(r'style="[^"]*"') for pattern, replacement in regex_replacement_mapping.items(): def replace_but_keep_class_and_style(match): # Replaces the matched string but keeps the original class and style attribute (if found). 

    @staticmethod
    def _replace_in_string_regex(page_html, regex_replacement_mapping):
        # Since we need to have a mapping of the regex to the replacement
        # and not a mapping of the matched string to the replacement,
        # we have to do the sub on each iteration, rather than one group sub.
        re_class_patern = re.compile(r'class="[^"]*"')
        re_style_patern = re.compile(r'style="[^"]*"')
        for pattern, replacement in regex_replacement_mapping.items():
            def replace_but_keep_class_and_style(match):
                # Replaces the matched string but keeps the original class and
                # style attributes (if found).
                result = replacement
                class_match = re_class_patern.search(match.group(0))
                if class_match:
                    prev_class = class_match.group(0)
                    result = re_class_patern.sub(prev_class, result, count=1)
                style_match = re_style_patern.search(match.group(0))
                if style_match:
                    prev_style = style_match.group(0)
                    result = re_style_patern.sub(prev_style, result, count=1)
                return result

            page_html = re.sub(pattern, replace_but_keep_class_and_style, page_html)
        return page_html

    @staticmethod
    def _replace_in_string(string, sorted_list_replacements, replacements):
        if not replacements or not sorted_list_replacements:
            return string
        # Use a regular expression to match any of the replacements
        pattern = r'(' + '|'.join(map(re.escape, sorted_list_replacements)) + r')'

        def replace_callback(match):
            # Having this callback function is useful for verifying which URLs
            # were replaced.
            matched_url = match.group(0)
            replacement = replacements.get(matched_url)
            if not replacement:
                replacement = matched_url
                logger.warning("Match found but URL %r not found in attachments", matched_url)
            return replacement

        # Replace all matches with their corresponding replacement
        replaced_string = re.sub(pattern, replace_callback, string)
        return replaced_string

    @api.model
    def convert_scraping_request_ICP(self):
        ICP = self.env['ir.config_parameter'].sudo()
        ws_uuid = ICP.get_param('website_generator.iap_ws_uuid', None)
        ws_target_url = ICP.get_param('website_generator.iap_ws_target_url', None)
        if not (ws_uuid and ws_target_url):
            # TODO: return website configurator?
            return {
                'type': 'ir.actions.act_url',
                'url': '/odoo',
                'target': 'self',
            }
        self.env['website_generator.request'].create({
            'uuid': ws_uuid,
            'target_url': ws_target_url,
        })
        ICP.set_param('website_generator.iap_ws_uuid', None)
        ICP.set_param('website_generator.iap_ws_target_url', None)
        return {
            'type': 'ir.actions.act_url',
            'url': "/odoo/action-website_generator.website_generator_screen?reload=true",
            'target': 'self',
        }
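
# Behaviour sketch for `_replace_in_string` (made-up URLs): sorting the
# candidate URLs longest-first before building the alternation prevents a
# shorter URL from matching inside a longer one that shares its prefix:
#
#   mapping = {
#       'https://www.example.com/a.png': '/web/image/1-aaa/a.png',
#       'https://www.example.com/a.png?v=2': '/web/image/2-bbb/a.png',
#   }
#   ordered = sorted(mapping, key=len, reverse=True)
#   WebsiteGeneratorRequest._replace_in_string(
#       '<img src="https://www.example.com/a.png?v=2"/>', ordered, mapping)
#   # -> '<img src="/web/image/2-bbb/a.png"/>'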