# -*- coding: utf-8 -*-
"""
Mimetypes-related utilities

# TODO: reexport stdlib mimetypes?
"""
import collections
import functools
import io
import logging
import mimetypes
import re
import zipfile

__all__ = ['guess_mimetype']

_logger = logging.getLogger(__name__)

# We define our own guess_mimetype implementation and if magic is available we
# use it instead.

# discriminants for zip-based file formats: top-level directory of the
# archive -> mimetype of the document
_ooxml_dirs = {
    'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    # PowerPoint packages store their parts under "ppt/" (not "pt/")
    'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
}


def _check_ooxml(data):
    """Identify an Office Open XML document from the content of a zip archive.

    :param bytes data: the full content of the zip archive
    :returns: the matching mimetype from ``_ooxml_dirs``, or ``False`` when
        the archive does not look like an OOXML document
    """
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        filenames = z.namelist()
        # OOXML documents should have a [Content_Types].xml file for early
        # check that we're interested in this thing at all
        if '[Content_Types].xml' not in filenames:
            return False

        # then there is a directory whose name denotes the type of the file:
        # word, ppt (powerpoint) or xl (excel)
        for dirname, mime in _ooxml_dirs.items():
            if any(entry.startswith(dirname) for entry in filenames):
                return mime

        return False


# checks that a string looks kinda sorta like a mimetype
_mime_validator = re.compile(r"""
    [\w-]+ # type-name
    / # subtype separator
    [\w-]+ # registration facet or subtype
    (?:\.[\w-]+)* # optional faceted name
    (?:\+[\w-]+)? # optional structured syntax specifier
""", re.VERBOSE)


def _check_open_container_format(data):
    """Identify an OpenDocument package from its ``mimetype`` member.

    :param bytes data: the full content of the zip archive
    :returns: the mimetype declared by the package, or ``False`` when there
        is no ``mimetype`` member or its content does not look like one
    """
    # Open Document Format for Office Applications (OpenDocument) Version 1.2
    #
    # Part 3: Packages
    # 3 Packages
    # 3.3 MIME Media Type
    with io.BytesIO(data) as f, zipfile.ZipFile(f) as z:
        # If a MIME media type for a document exists, then an OpenDocument
        # package should contain a file with name "mimetype".
        if 'mimetype' not in z.namelist():
            return False

        # The content of this file shall be the ASCII encoded MIME media type
        # associated with the document.
        marcel = z.read('mimetype').decode('ascii')
        # check that it's not too long (RFC6838 § 4.2 restricts type and
        # subtype to 127 characters each + separator, strongly recommends
        # limiting them to 64 but does not require it) and that it looks a lot
        # like a valid mime type
        if len(marcel) < 256 and _mime_validator.match(marcel):
            return marcel

        return False


# NOTE(review): the pattern is a non-raw bytes literal compiled with
# re.VERBOSE, so the literal whitespace bytes it contains (\x09, \x20) are
# discarded as insignificant whitespace; that may be why it "doesn't seem to
# work" (see _check_olecf). It is currently unused and left untouched.
_xls_pattern = re.compile(b"""
    \x09\x08\x10\x00\x00\x06\x05\x00
  | \xFD\xFF\xFF\xFF(\x10|\x1F|\x20|"|\\#|\\(|\\))
""", re.VERBOSE)
_ppt_pattern = re.compile(b"""
    \x00\x6E\x1E\xF0
  | \x0F\x00\xE8\x03
  | \xA0\x46\x1D\xF0
  | \xFD\xFF\xFF\xFF(\x0E|\x1C|\x43)\x00\x00\x00
""", re.VERBOSE)


def _check_olecf(data):
    """ Pre-OOXML Office formats are OLE Compound Files which all use the same
    file signature ("magic bytes") and should have a subheader at offset 512
    (0x200).

    Subheaders taken from http://www.garykessler.net/library/file_sigs.html
    according to which Mac office files *may* have different subheaders. We'll
    ignore that.

    :param bytes data: the full content of the file
    :returns: the mimetype of the recognized Office format, else ``False``
    """
    offset = 0x200
    if data.startswith(b'\xEC\xA5\xC1\x00', offset):
        return 'application/msword'
    # the _xls_pattern stuff doesn't seem to work correctly (the test file
    # only has a bunch of \xf* at offset 0x200), that apparently works
    elif b'Microsoft Excel' in data:
        return 'application/vnd.ms-excel'
    elif _ppt_pattern.match(data, offset):
        return 'application/vnd.ms-powerpoint'
    return False


# NOTE(review): in the reviewed copy of this file, everything between the
# start of the condition in ``_check_svg`` and the text-sniffing tail of
# ``_odoo_guess_mimetype`` was lost (it looks like markup stripping ate the
# span starting at the "<" of the svg tag). The body of ``_check_svg``, the
# ``_Entry`` / ``_mime_mappings`` table and the head of
# ``_odoo_guess_mimetype`` below are a reconstruction -- verify them against
# version control before merging.
def _check_svg(data):
    """This simply checks the existence of the opening and ending SVG tags"""
    if b'<svg' in data and b'/svg' in data:
        return 'image/svg+xml'
    return False


# for "master" formats with many subformats, discriminants is a list of
# functions, tried in order and the first non-falsy value returned is the
# selected mime type. If all functions return falsy values, the master
# mimetype is returned.
_Entry = collections.namedtuple('_Entry', ['mimetype', 'signatures', 'discriminants'])
_mime_mappings = (
    # pdf
    _Entry('application/pdf', [b'%PDF'], []),
    # jpg, jpeg, png, gif, bmp, jfif
    _Entry('image/jpeg', [
        b'\xFF\xD8\xFF\xE0',
        b'\xFF\xD8\xFF\xE2',
        b'\xFF\xD8\xFF\xE3',
        b'\xFF\xD8\xFF\xE1',
        b'\xFF\xD8\xFF\xDB',
    ], []),
    _Entry('image/png', [b'\x89PNG\r\n\x1A\n'], []),
    _Entry('image/gif', [b'GIF87a', b'GIF89a'], []),
    _Entry('image/bmp', [b'BM'], []),
    _Entry('application/xml', [b'<'], [
        _check_svg,
    ]),
    _Entry('image/x-icon', [b'\x00\x00\x01\x00'], []),
    # OLECF files in general (Word, Excel, PPT, default to word because why not?)
    _Entry('application/msword', [b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1', b'\x0D\x44\x4F\x43'], [
        _check_olecf,
    ]),
    # zip, but will include jar, odt, ods, odp, docx, xlsx, pptx, apk
    _Entry('application/zip', [b'PK\x03\x04'], [_check_ooxml, _check_open_container_format]),
)


def _odoo_guess_mimetype(bin_data, default='application/octet-stream'):
    """ Attempts to guess the mime type of the provided binary data, *will*
    fail and return ``default`` if it's not able to do so.

    :param bytes bin_data: binary data to try and guess a mime type for
    :param str default: value returned when nothing could be guessed
    :returns: matched mimetype or ``default`` if none matched
    """
    # by default, guess the type using the magic number of the file signature
    # (like magic, but more limited)
    for entry in _mime_mappings:
        for signature in entry.signatures:
            if bin_data.startswith(signature):
                for discriminant in entry.discriminants:
                    try:
                        guess = discriminant(bin_data)
                        if guess:
                            return guess
                    except Exception:
                        # best effort: log and try the next discriminant
                        _logger.getChild('guess_mimetype').warning(
                            "Sub-checker '%s' of type '%s' failed",
                            discriminant.__name__, entry.mimetype,
                            exc_info=True,
                        )
                # if no discriminant or no discriminant matches, return
                # primary mime type
                return entry.mimetype
    try:
        # if all characters are printable, guess it's a text file
        if all(c >= ' ' or c in '\t\n\r' for c in bin_data[:1024].decode()):
            return 'text/plain'
    except ValueError:
        # not decodable with the default codec: not text as far as we can tell
        pass
    return default


try:
    import magic
except ImportError:
    magic = None

# Callable turning a bytes buffer into a mimetype, when a usable "magic"
# library is installed; stays None otherwise.
_guesser = None
if magic:
    # There are 2 python libs named 'magic' with incompatible api.
    # magic from pypi https://pypi.python.org/pypi/python-magic/
    if hasattr(magic, 'from_buffer'):
        _guesser = functools.partial(magic.from_buffer, mime=True)
    # magic from file(1) https://packages.debian.org/squeeze/python-magic
    elif hasattr(magic, 'open'):
        ms = magic.open(magic.MAGIC_MIME_TYPE)
        ms.load()
        _guesser = ms.buffer

if _guesser:
    def guess_mimetype(bin_data, default=None):
        """Guess the mimetype of ``bin_data`` using the ``magic`` library.

        Only the first 1024 bytes are handed to the library.

        :param bytes bin_data: binary data to guess a mimetype for
        :param default: accepted for signature compatibility with the
            built-in guesser, not used by this implementation
        :returns: the mimetype reported by ``magic``
        """
        mimetype = _guesser(bin_data[:1024])
        # upgrade incorrect mimetype to official one, fixed upstream
        # https://github.com/file/file/commit/1a08bb5c235700ba623ffa6f3c95938fe295b262
        if mimetype == 'image/svg':
            return 'image/svg+xml'
        return mimetype
else:
    # no "magic" module, or one exposing neither of the two known APIs:
    # use the built-in signature table instead of failing at call time
    guess_mimetype = _odoo_guess_mimetype


def neuter_mimetype(mimetype, user):
    """Downgrade markup-like mimetypes to ``text/plain`` for regular users.

    :param str mimetype: the mimetype to check
    :param user: object exposing ``_is_system()``
    :returns: ``'text/plain'`` when ``mimetype`` contains ``ht``, ``xml`` or
        ``svg`` and ``user._is_system()`` is falsy, else ``mimetype``
    """
    wrong_type = 'ht' in mimetype or 'xml' in mimetype or 'svg' in mimetype
    if wrong_type and not user._is_system():
        return 'text/plain'
    return mimetype


def get_extension(filename):
    """Return the lowercased extension of ``filename`` (dot included).

    :param str filename: the name of the file
    :returns: the extension, or ``''`` when the file has no (known) extension
    """
    # A file has no extension if it has no dot (ignoring the leading one
    # of hidden files) or that what follow the last dot is not a single
    # word, e.g. "Mr. Doe"
    _stem, dot, ext = filename.lstrip('.').rpartition('.')
    if not dot or not ext.isalnum():
        return ''

    # Assume all 4-chars extensions to be valid extensions even if it is
    # not known from the mimetypes database. In /etc/mime.types, only 7%
    # known extensions are longer.
    if len(ext) <= 4:
        return f'.{ext}'.lower()

    # Use the mimetype database to determine whether the extension is known.
    # guess_type() returns (type, encoding): the second item is the name of
    # an encoding program (e.g. "gzip"), never an extension, so it must not
    # be returned as such.
    guessed_mimetype, guessed_encoding = mimetypes.guess_type(filename)
    if guessed_mimetype or guessed_encoding:
        return f'.{ext}'.lower()

    # Unknown extension.
    return ''


def fix_filename_extension(filename, mimetype):
    """
    Make sure the filename ends with an extension of the mimetype.

    :param str filename: the filename with an unsafe extension
    :param str mimetype: the mimetype detected reading the file's content
    :returns: the same filename if its extension matches the detected
        mimetype, otherwise the same filename with the mimetype's
        extension added at the end.
    """
    if mimetypes.guess_type(filename)[0] == mimetype:
        return filename

    if extension := mimetypes.guess_extension(mimetype):
        _logger.warning("File %r has an invalid extension for mimetype %r, adding %r", filename, mimetype, extension)
        return filename + extension

    # no extension is registered for that mimetype: leave the name alone
    _logger.warning("File %r has an unknown extension for mimetype %r", filename, mimetype)
    return filename