Source code for plaso.engine.extractors

# -*- coding: utf-8 -*-
"""The extractor class definitions.

An extractor is a class used to extract information from "raw" data.
"""

from __future__ import unicode_literals

import copy
import hashlib

import pysigscan

from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.resolver import resolver as path_spec_resolver

from plaso.engine import logger
from plaso.lib import errors
from plaso.parsers import interface as parsers_interface
from plaso.parsers import manager as parsers_manager


[docs]class EventExtractor(object):
  """Event extractor.

  An event extractor extracts events from event sources.
  """

  _PARSE_RESULT_FAILURE = 1
  _PARSE_RESULT_SUCCESS = 2
  _PARSE_RESULT_UNSUPPORTED = 3

  def __init__(self, parser_filter_expression=None):
    """Initializes an event extractor.

    Args:
      parser_filter_expression (Optional[str]): the parser filter expression,
          None represents all parsers and plugins.

          The parser filter expression is a comma separated value string that
          denotes a list of parser names to include and/or exclude. Each entry
          can have the value of:

          * An exact match of a list of parsers, or a preset (see
            plaso/parsers/presets.py for a full list of available presets).
          * A name of a single parser (case insensitive), e.g. msiecf.
          * A glob name for a single parser, e.g. '*msie*' (case insensitive).
    """
    super(EventExtractor, self).__init__()
    self._file_scanner = None
    self._filestat_parser = None
    self._formats_with_signatures = None
    self._mft_parser = None
    self._non_sigscan_parser_names = None
    self._parsers = None
    self._parsers_profiler = None
    self._usnjrnl_parser = None

    self._InitializeParserObjects(
        parser_filter_expression=parser_filter_expression)

  def _CheckParserCanProcessFileEntry(self, parser, file_entry):
    """Determines if a parser can process a file entry.

    Args:
      file_entry (dfvfs.FileEntry): file entry.
      parser (BaseParser): parser.

    Returns:
      bool: True if the file entry can be processed by the parser object.
    """
    for filter_object in parser.FILTERS:
      if filter_object.Match(file_entry):
        return True

    return False

  def _GetSignatureMatchParserNames(self, file_object):
    """Determines if a file-like object matches one of the known signatures.

    Args:
      file_object (file): file-like object whose contents will be checked
          for known signatures.

    Returns:
      list[str]: parser names for which the contents of the file-like object
          matches their known signatures.
    """
    parser_names = []
    scan_state = pysigscan.scan_state()
    self._file_scanner.scan_file_object(scan_state, file_object)

    for scan_result in iter(scan_state.scan_results):
      format_specification = (
          self._formats_with_signatures.GetSpecificationBySignature(
              scan_result.identifier))

      if format_specification.identifier not in parser_names:
        parser_names.append(format_specification.identifier)

    return parser_names

  def _InitializeParserObjects(self, parser_filter_expression=None):
    """Initializes the parser objects.

    Args:
      parser_filter_expression (Optional[str]): the parser filter expression,
          None represents all parsers and plugins.

          The parser filter expression is a comma separated value string that
          denotes a list of parser names to include and/or exclude. Each entry
          can have the value of:

          * An exact match of a list of parsers, or a preset (see
            plaso/parsers/presets.py for a full list of available presets).
          * A name of a single parser (case insensitive), e.g. msiecf.
          * A glob name for a single parser, e.g. '*msie*' (case insensitive).
    """
    self._formats_with_signatures, non_sigscan_parser_names = (
        parsers_manager.ParsersManager.GetFormatsWithSignatures(
            parser_filter_expression=parser_filter_expression))

    self._non_sigscan_parser_names = []
    for parser_name in non_sigscan_parser_names:
      if parser_name not in ('filestat', 'usnjrnl'):
        self._non_sigscan_parser_names.append(parser_name)

    self._file_scanner = parsers_manager.ParsersManager.CreateSignatureScanner(
        self._formats_with_signatures)

    self._parsers = parsers_manager.ParsersManager.GetParserObjects(
        parser_filter_expression=parser_filter_expression)

    active_parser_names = ', '.join(sorted(self._parsers.keys()))
    logger.debug('Active parsers: {0:s}'.format(active_parser_names))

    self._filestat_parser = self._parsers.get('filestat', None)
    if 'filestat' in self._parsers:
      del self._parsers['filestat']

    self._mft_parser = self._parsers.get('mft', None)

    self._usnjrnl_parser = self._parsers.get('usnjrnl', None)
    if 'usnjrnl' in self._parsers:
      del self._parsers['usnjrnl']

  def _ParseDataStreamWithParser(
      self, parser_mediator, parser, file_entry, data_stream_name):
    """Parses a data stream of a file entry with a specific parser.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      parser (BaseParser): parser.
      file_entry (dfvfs.FileEntry): file entry.
      data_stream_name (str): data stream name.

    Raises:
      RuntimeError: if the file-like object is missing.
    """
    file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
    if not file_object:
      raise RuntimeError(
          'Unable to retrieve file-like object from file entry.')

    try:
      self._ParseFileEntryWithParser(
          parser_mediator, parser, file_entry, file_object=file_object)

    finally:
      file_object.close()

  def _ParseFileEntryWithParser(
      self, parser_mediator, parser, file_entry, file_object=None):
    """Parses a file entry with a specific parser.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      parser (BaseParser): parser.
      file_entry (dfvfs.FileEntry): file entry.
      file_object (Optional[file]): file-like object to parse.
          If not set the parser will use the parser mediator to open
          the file entry's default data stream as a file-like object.

    Returns:
      int: parse result which is _PARSE_RESULT_FAILURE if the file entry
          could not be parsed, _PARSE_RESULT_SUCCESS if the file entry
          successfully was parsed or _PARSE_RESULT_UNSUPPORTED when
          UnableToParseFile was raised.

    Raises:
      TypeError: if parser object is not a supported parser type.
    """
    if not isinstance(parser, (
        parsers_interface.FileEntryParser, parsers_interface.FileObjectParser)):
      raise TypeError('Unsupported parser object type.')

    parser_mediator.ClearParserChain()

    reference_count = (
        parser_mediator.resolver_context.GetFileObjectReferenceCount(
            file_entry.path_spec))

    parser_mediator.SampleStartTiming(parser.NAME)

    try:
      if isinstance(parser, parsers_interface.FileEntryParser):
        parser.Parse(parser_mediator)
      elif isinstance(parser, parsers_interface.FileObjectParser):
        parser.Parse(parser_mediator, file_object)
      result = self._PARSE_RESULT_SUCCESS

    # We catch IOError so we can determine the parser that generated the error.
    except (IOError, dfvfs_errors.BackEndError) as exception:
      display_name = parser_mediator.GetDisplayName(file_entry)
      logger.warning(
          '{0:s} unable to parse file: {1:s} with error: {2!s}'.format(
              parser.NAME, display_name, exception))
      result = self._PARSE_RESULT_FAILURE

    except errors.UnableToParseFile as exception:
      display_name = parser_mediator.GetDisplayName(file_entry)
      logger.debug(
          '{0:s} unable to parse file: {1:s} with error: {2!s}'.format(
              parser.NAME, display_name, exception))
      result = self._PARSE_RESULT_UNSUPPORTED

    finally:
      parser_mediator.SampleStopTiming(parser.NAME)
      parser_mediator.SampleMemoryUsage(parser.NAME)

      new_reference_count = (
          parser_mediator.resolver_context.GetFileObjectReferenceCount(
              file_entry.path_spec))
      if reference_count != new_reference_count:
        display_name = parser_mediator.GetDisplayName(file_entry)
        logger.warning((
            '[{0:s}] did not explicitly close file-object for file: '
            '{1:s}.').format(parser.NAME, display_name))

    return result

  def _ParseFileEntryWithParsers(
      self, parser_mediator, parser_names, file_entry, file_object=None):
    """Parses a file entry with a specific parsers.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      parser_names (list[str]): names of parsers.
      file_entry (dfvfs.FileEntry): file entry.
      file_object (Optional[file]): file-like object to parse.
          If not set the parser will use the parser mediator to open
          the file entry's default data stream as a file-like object.

    Returns:
      int: parse result which is _PARSE_RESULT_FAILURE if the file entry
          could not be parsed, _PARSE_RESULT_SUCCESS if the file entry
          successfully was parsed or _PARSE_RESULT_UNSUPPORTED when
          UnableToParseFile was raised or no names of parser were provided.

    Raises:
      RuntimeError: if the parser object is missing.
    """
    parse_results = self._PARSE_RESULT_UNSUPPORTED
    for parser_name in parser_names:
      parser = self._parsers.get(parser_name, None)
      if not parser:
        raise RuntimeError(
            'Parser object missing for parser: {0:s}'.format(parser_name))

      if parser.FILTERS:
        if not self._CheckParserCanProcessFileEntry(parser, file_entry):
          parse_results = self._PARSE_RESULT_SUCCESS
          continue

      display_name = parser_mediator.GetDisplayName(file_entry)
      logger.debug((
          '[ParseFileEntryWithParsers] parsing file: {0:s} with parser: '
          '{1:s}').format(display_name, parser_name))

      parse_result = self._ParseFileEntryWithParser(
          parser_mediator, parser, file_entry, file_object=file_object)
      if parse_result == self._PARSE_RESULT_FAILURE:
        return self._PARSE_RESULT_FAILURE

      elif parse_result == self._PARSE_RESULT_SUCCESS:
        parse_results = self._PARSE_RESULT_SUCCESS

    return parse_results

[docs]  def ParseDataStream(self, parser_mediator, file_entry, data_stream_name):
    """Parses a data stream of a file entry with the enabled parsers.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      file_entry (dfvfs.FileEntry): file entry.
      data_stream_name (str): data stream name.

    Raises:
      RuntimeError: if the file-like object or the parser object is missing.
    """
    file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
    if not file_object:
      raise RuntimeError(
          'Unable to retrieve file-like object from file entry.')

    try:
      parser_names = self._GetSignatureMatchParserNames(file_object)

      parse_with_non_sigscan_parsers = True
      if parser_names:
        parse_result = self._ParseFileEntryWithParsers(
            parser_mediator, parser_names, file_entry, file_object=file_object)
        if parse_result in (
            self._PARSE_RESULT_FAILURE, self._PARSE_RESULT_SUCCESS):
          parse_with_non_sigscan_parsers = False

      if parse_with_non_sigscan_parsers:
        self._ParseFileEntryWithParsers(
            parser_mediator, self._non_sigscan_parser_names, file_entry,
            file_object=file_object)

    finally:
      file_object.close()

[docs]  def ParseFileEntryMetadata(self, parser_mediator, file_entry):
    """Parses the file entry metadata e.g. file system data.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      file_entry (dfvfs.FileEntry): file entry.
    """
    if self._filestat_parser:
      self._ParseFileEntryWithParser(
          parser_mediator, self._filestat_parser, file_entry)

[docs]  def ParseMetadataFile(
      self, parser_mediator, file_entry, data_stream_name):
    """Parses a metadata file.

    Args:
      parser_mediator (ParserMediator): parser mediator.
      file_entry (dfvfs.FileEntry): file entry.
      data_stream_name (str): data stream name.
    """
    parent_path_spec = getattr(file_entry.path_spec, 'parent', None)
    filename_upper = file_entry.name.upper()
    if (self._mft_parser and parent_path_spec and
        filename_upper in ('$MFT', '$MFTMIRR') and not data_stream_name):
      self._ParseDataStreamWithParser(
          parser_mediator, self._mft_parser, file_entry, '')

    elif (self._usnjrnl_parser and parent_path_spec and
          filename_upper == '$USNJRNL' and data_stream_name == '$J'):
      # To be able to ignore the sparse data ranges the UsnJrnl parser
      # needs to read directly from the volume.
      volume_file_object = path_spec_resolver.Resolver.OpenFileObject(
          parent_path_spec, resolver_context=parser_mediator.resolver_context)

      try:
        self._ParseFileEntryWithParser(
            parser_mediator, self._usnjrnl_parser, file_entry,
            file_object=volume_file_object)
      finally:
        volume_file_object.close()


[docs]class PathSpecExtractor(object):
  """Path specification extractor.

  A path specification extractor extracts path specification from a source
  directory, file or storage media device or image.
  """

  _MAXIMUM_DEPTH = 255

  def __init__(self, duplicate_file_check=False):
    """Initializes a path specification extractor.

    The source collector discovers all the file entries in the source.
    The source can be a single file, directory or a volume within
    a storage media image or device.

    Args:
      duplicate_file_check (Optional[bool]): True if duplicate files should
          be ignored.
    """
    super(PathSpecExtractor, self).__init__()
    self._duplicate_file_check = duplicate_file_check
    self._hashlist = {}

  def _CalculateNTFSTimeHash(self, file_entry):
    """Calculates an MD5 from the date and time value of a NTFS file entry.

    Args:
      file_entry (dfvfs.FileEntry): file entry.

    Returns:
      str: hexadecimal representation of the MD5 hash value of the date and
          time values of the file entry.
    """
    date_time_values = []

    access_time = getattr(file_entry, 'access_time', None)
    if access_time:
      date_time_string = access_time.CopyToDateTimeString()
      date_time_values.append('atime:{0:s}'.format(date_time_string))

    creation_time = getattr(file_entry, 'creation_time', None)
    if creation_time:
      date_time_string = creation_time.CopyToDateTimeString()
      date_time_values.append('crtime:{0:s}'.format(date_time_string))

    modification_time = getattr(file_entry, 'modification_time', None)
    if modification_time:
      date_time_string = modification_time.CopyToDateTimeString()
      date_time_values.append('mtime:{0:s}'.format(date_time_string))

    # file_entry.change_time is an alias of file_entry.entry_modification_time.
    change_time = getattr(file_entry, 'change_time', None)
    if change_time:
      date_time_string = change_time.CopyToDateTimeString()
      date_time_values.append('ctime:{0:s}'.format(date_time_string))

    date_time_values = ''.join(date_time_values)
    date_time_values = date_time_values.encode('ascii')

    hash_value = hashlib.md5()
    hash_value.update(date_time_values)
    return hash_value.hexdigest()

  def _ExtractPathSpecs(
      self, path_spec, find_specs=None, recurse_file_system=True,
      resolver_context=None):
    """Extracts path specification from a specific source.

    Args:
      path_spec (dfvfs.PathSpec): path specification.
      find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
          used in path specification extraction.
      recurse_file_system (Optional[bool]): True if extraction should
          recurse into a file system.
      resolver_context (Optional[dfvfs.Context]): resolver context.

    Yields:
      dfvfs.PathSpec: path specification of a file entry found in the source.
    """
    try:
      file_entry = path_spec_resolver.Resolver.OpenFileEntry(
          path_spec, resolver_context=resolver_context)
    except (
        dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
        dfvfs_errors.PathSpecError) as exception:
      logger.error(
          'Unable to open file entry with error: {0!s}'.format(exception))
      return

    if not file_entry:
      logger.warning('Unable to open: {0:s}'.format(path_spec.comparable))
      return

    if (not file_entry.IsDirectory() and not file_entry.IsFile() and
        not file_entry.IsDevice()):
      logger.warning((
          'Source path specification not a device, file or directory.\n'
          '{0:s}').format(path_spec.comparable))
      return

    if file_entry.IsFile():
      yield path_spec

    else:
      for extracted_path_spec in self._ExtractPathSpecsFromFileSystem(
          path_spec, find_specs=find_specs,
          recurse_file_system=recurse_file_system,
          resolver_context=resolver_context):
        yield extracted_path_spec

  def _ExtractPathSpecsFromDirectory(self, file_entry, depth=0):
    """Extracts path specification from a directory.

    Args:
      file_entry (dfvfs.FileEntry): file entry that refers to the directory.
      depth (Optional[int]): current depth where 0 represents the file system
          root.

    Yields:
      dfvfs.PathSpec: path specification of a file entry found in the directory.
    """
    if depth >= self._MAXIMUM_DEPTH:
      raise errors.MaximumRecursionDepth('Maximum recursion depth reached.')

    # Need to do a breadth-first search otherwise we'll hit the Python
    # maximum recursion depth.
    sub_directories = []

    for sub_file_entry in file_entry.sub_file_entries:
      try:
        if not sub_file_entry.IsAllocated() or sub_file_entry.IsLink():
          continue
      except dfvfs_errors.BackEndError as exception:
        logger.warning(
            'Unable to process file: {0:s} with error: {1!s}'.format(
                sub_file_entry.path_spec.comparable.replace(
                    '\n', ';'), exception))
        continue

      # For TSK-based file entries only, ignore the virtual /$OrphanFiles
      # directory.
      if sub_file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK:
        if file_entry.IsRoot() and sub_file_entry.name == '$OrphanFiles':
          continue

      if sub_file_entry.IsDirectory():
        sub_directories.append(sub_file_entry)

      elif sub_file_entry.IsFile():
        # If we are dealing with a VSS we want to calculate a hash
        # value based on available timestamps and compare that to previously
        # calculated hash values, and only include the file into the queue if
        # the hash does not match.
        if self._duplicate_file_check:
          hash_value = self._CalculateNTFSTimeHash(sub_file_entry)

          inode = getattr(sub_file_entry.path_spec, 'inode', 0)
          if inode in self._hashlist:
            if hash_value in self._hashlist[inode]:
              continue

          self._hashlist.setdefault(inode, []).append(hash_value)

      for path_spec in self._ExtractPathSpecsFromFile(sub_file_entry):
        yield path_spec

    for sub_file_entry in sub_directories:
      try:
        for path_spec in self._ExtractPathSpecsFromDirectory(
            sub_file_entry, depth=(depth + 1)):
          yield path_spec

      except (
          IOError, dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
          dfvfs_errors.PathSpecError) as exception:
        logger.warning('{0!s}'.format(exception))

  def _ExtractPathSpecsFromFile(self, file_entry):
    """Extracts path specification from a file.

    Args:
      file_entry (dfvfs.FileEntry): file entry that refers to the file.

    Yields:
      dfvfs.PathSpec: path specification of a file entry found in the file.
    """
    produced_main_path_spec = False
    for data_stream in file_entry.data_streams:
      # Make a copy so we don't make the changes on a path specification
      # directly. Otherwise already produced path specifications can be
      # altered in the process.
      path_spec = copy.deepcopy(file_entry.path_spec)
      if data_stream.name:
        setattr(path_spec, 'data_stream', data_stream.name)
      yield path_spec

      if not data_stream.name:
        produced_main_path_spec = True

    if not produced_main_path_spec:
      yield file_entry.path_spec

  def _ExtractPathSpecsFromFileSystem(
      self, path_spec, find_specs=None, recurse_file_system=True,
      resolver_context=None):
    """Extracts path specification from a file system within a specific source.

    Args:
      path_spec (dfvfs.PathSpec): path specification of the root of
          the file system.
      find_specs (Optional[list[dfvfs.FindSpec]]): find specifications.
      recurse_file_system (Optional[bool]): True if extraction should
          recurse into a file system.
      resolver_context (Optional[dfvfs.Context]): resolver context.

    Yields:
      dfvfs.PathSpec: path specification of a file entry found in
          the file system.
    """
    try:
      file_system = path_spec_resolver.Resolver.OpenFileSystem(
          path_spec, resolver_context=resolver_context)
    except (
        dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
        dfvfs_errors.PathSpecError) as exception:
      logger.error(
          'Unable to open file system with error: {0!s}'.format(exception))
      return

    try:
      if find_specs:
        searcher = file_system_searcher.FileSystemSearcher(
            file_system, path_spec)
        for extracted_path_spec in searcher.Find(find_specs=find_specs):
          yield extracted_path_spec

      elif recurse_file_system:
        file_entry = file_system.GetFileEntryByPathSpec(path_spec)
        if file_entry:
          for extracted_path_spec in self._ExtractPathSpecsFromDirectory(
              file_entry):
            yield extracted_path_spec

      else:
        yield path_spec

    except (
        dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
        dfvfs_errors.PathSpecError) as exception:
      logger.warning('{0!s}'.format(exception))

    finally:
      file_system.Close()

[docs]  def ExtractPathSpecs(
      self, path_specs, find_specs=None, recurse_file_system=True,
      resolver_context=None):
    """Extracts path specification from a specific source.

    Args:
      path_specs (Optional[list[dfvfs.PathSpec]]): path specifications.
      find_specs (Optional[list[dfvfs.FindSpec]]): find specifications.
      recurse_file_system (Optional[bool]): True if extraction should
          recurse into a file system.
      resolver_context (Optional[dfvfs.Context]): resolver context.

    Yields:
      dfvfs.PathSpec: path specification of a file entry found in the source.
    """
    for path_spec in path_specs:
      for extracted_path_spec in self._ExtractPathSpecs(
          path_spec, find_specs=find_specs,
          recurse_file_system=recurse_file_system,
          resolver_context=resolver_context):
        yield extracted_path_spec