Source code for plaso.lib.utils

# -*- coding: utf-8 -*-
"""This file contains utility functions."""

from __future__ import unicode_literals

import logging

from plaso.lib import py2to3


def IsText(bytes_in, encoding=None):
    """Determine whether a byte sequence looks like text.

    Parsers need a quick and at least semi-reliable way of discovering
    whether a particular byte stream is text. Text parsers can use this,
    for instance, to decide whether a file is a text file.

    The checks are tried in this order, and the first that succeeds wins:

    1. Every byte is a printable ASCII code point (32 through 127).
    2. The bytes decode as UTF-8.
    3. The bytes decode with the caller-supplied encoding, if any.

    No other encoding (for example UTF-16) is tried unless it is passed
    explicitly via ``encoding``. If every check fails, the byte sequence
    is assumed to be binary data.

    Args:
      bytes_in (bytes): byte stream to examine. A Unicode (text) string
          is also accepted and is always considered text.
      encoding (Optional[str]): additional encoding to test after ASCII
          and UTF-8 have been tried.

    Returns:
      bool: True if the byte stream contains text. An empty byte stream
          is considered text.
    """
    # TODO: Improve speed and accuracy of this method.

    # A Unicode string is text by definition. This must be checked before
    # the byte-wise scan, since bytearray() rejects text strings. type(u'')
    # is the text type on both Python 2 (unicode) and Python 3 (str).
    if isinstance(bytes_in, type(u'')):
        return True

    # Fast path: printable ASCII only. Iterating a bytearray yields integers
    # on both Python 2 and Python 3, whereas iterating bytes yields 1-char
    # strings on Python 2 and integers on Python 3 (where ord() would fail).
    if all(31 < byte_value < 128 for byte_value in bytearray(bytes_in)):
        return True

    # Check if this is UTF-8. This also accepts ASCII data that contains
    # whitespace or control characters rejected by the fast path above.
    try:
        bytes_in.decode('utf-8')
        return True
    except UnicodeDecodeError:
        pass

    if encoding:
        try:
            bytes_in.decode(encoding)
            return True
        except LookupError:
            # The encoding name is unknown to the codec registry; report it
            # and treat the data as non-text rather than failing the caller.
            logging.error('Unsupported encoding: {0:s}'.format(encoding))
        except UnicodeDecodeError:
            pass

    return False