Source code for plaso.engine.worker

# -*- coding: utf-8 -*-
"""The event extraction worker."""

from __future__ import unicode_literals

import copy
import os
import re
import time

from dfvfs.analyzer import analyzer
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver

from plaso.analyzers import hashing_analyzer
from plaso.analyzers import manager as analyzers_manager
from plaso.containers import event_sources
from plaso.engine import extractors
from plaso.engine import logger
from plaso.lib import definitions
from plaso.lib import errors


[docs]class EventExtractionWorker(object):
  """Event extraction worker.

  The event extraction worker determines which parsers are suitable for parsing
  a particular file entry or data stream. The parsers extract relevant data from
  file system and or file content data. All extracted data is passed to the
  parser mediator for further processing.

  Attributes:
    last_activity_timestamp (int): timestamp received that indicates the last
        time activity was observed.
    processing_status (str): human readable status indication such as:
        'Extracting', 'Hashing'.
  """

  # TSK metadata files that need special handling.
  _METADATA_FILE_LOCATIONS_TSK = frozenset([
      # NTFS
      '/$AttrDef',
      '/$BadClus',
      '/$Bitmap',
      '/$Boot',
      '/$Extend/$ObjId',
      '/$Extend/$Quota',
      '/$Extend/$Reparse',
      '/$Extend/$RmMetadata/$Repair',
      '/$Extend/$RmMetadata/$TxfLog/$Tops',
      '/$Extend/$UsnJrnl',
      '/$LogFile',
      '/$MFT',
      '/$MFTMirr',
      '/$Secure',
      '/$UpCase',
      '/$Volume',
      # HFS+/HFSX
      '/$ExtentsFile',
      '/$CatalogFile',
      '/$BadBlockFile',
      '/$AllocationFile',
      '/$AttributesFile',
  ])

  # TODO: make this filtering solution more generic. Also see:
  # https://github.com/log2timeline/plaso/issues/467
  _CHROME_CACHE_DATA_FILE_RE = re.compile(r'^[fF]_[0-9a-fA-F]{6}$')
  _FIREFOX_CACHE_DATA_FILE_RE = re.compile(r'^[0-9a-fA-F]{5}[dm][0-9]{2}$')
  _FIREFOX_CACHE2_DATA_FILE_RE = re.compile(r'^[0-9a-fA-F]{40}$')

  _TYPES_WITH_ROOT_METADATA = frozenset([
      dfvfs_definitions.TYPE_INDICATOR_GZIP])

  def __init__(self, parser_filter_expression=None):
    """Initializes an event extraction worker.

    Args:
      parser_filter_expression (Optional[str]): parser filter expression,
          where None represents all parsers and plugins.

          The parser filter expression is a comma separated value string that
          denotes a list of parser names to include and/or exclude. Each entry
          can have the value of:

          * An exact match of a list of parsers, or a preset (see
            plaso/parsers/presets.py for a full list of available presets).
          * A name of a single parser (case insensitive), e.g. msiecf.
          * A glob name for a single parser, e.g. '*msie*' (case insensitive).
    """
    super(EventExtractionWorker, self).__init__()
    self._abort = False
    self._analyzers = []
    self._event_extractor = extractors.EventExtractor(
        parser_filter_expression=parser_filter_expression)
    self._hasher_file_size_limit = None
    self._path_spec_extractor = extractors.PathSpecExtractor()
    self._process_archives = None
    self._process_compressed_streams = None
    self._processing_profiler = None

    self.last_activity_timestamp = 0.0
    self.processing_status = definitions.PROCESSING_STATUS_IDLE

  def _AnalyzeDataStream(self, mediator, file_entry, data_stream_name):
    """Analyzes the contents of a specific data stream of a file entry.

    The results of the analyzers are set in the parser mediator as attributes
    that are added to produced event objects. Note that some file systems
    allow directories to have data streams, e.g. NTFS.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry whose data stream is to be
          analyzed.
      data_stream_name (str): name of the data stream.

    Raises:
      RuntimeError: if the file-like object cannot be retrieved from
          the file entry.
    """
    display_name = mediator.GetDisplayName()
    logger.debug('[AnalyzeDataStream] analyzing file: {0:s}'.format(
        display_name))

    if self._processing_profiler:
      self._processing_profiler.StartTiming('analyzing')

    try:
      file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
      if not file_object:
        raise RuntimeError((
            'Unable to retrieve file-like object for file entry: '
            '{0:s}.').format(display_name))

      try:
        self._AnalyzeFileObject(mediator, file_object)
      finally:
        file_object.close()

    finally:
      if self._processing_profiler:
        self._processing_profiler.StopTiming('analyzing')

    logger.debug(
        '[AnalyzeDataStream] completed analyzing file: {0:s}'.format(
            display_name))

  def _AnalyzeFileObject(self, mediator, file_object):
    """Processes a file-like object with analyzers.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_object (dfvfs.FileIO): file-like object to process.
    """
    maximum_read_size = max([
        analyzer_object.SIZE_LIMIT for analyzer_object in self._analyzers])

    hashers_only = True
    for analyzer_object in self._analyzers:
      if not isinstance(analyzer_object, hashing_analyzer.HashingAnalyzer):
        hashers_only = False
        break

    file_size = file_object.get_size()

    if (hashers_only and self._hasher_file_size_limit and
        file_size > self._hasher_file_size_limit):
      return

    file_object.seek(0, os.SEEK_SET)

    data = file_object.read(maximum_read_size)
    while data:
      if self._abort:
        break

      for analyzer_object in self._analyzers:
        if self._abort:
          break

        if (not analyzer_object.INCREMENTAL_ANALYZER and
            file_size > analyzer_object.SIZE_LIMIT):
          continue

        if (isinstance(analyzer_object, hashing_analyzer.HashingAnalyzer) and
            self._hasher_file_size_limit and
            file_size > self._hasher_file_size_limit):
          continue

        self.processing_status = analyzer_object.PROCESSING_STATUS_HINT

        analyzer_object.Analyze(data)

        self.last_activity_timestamp = time.time()

      data = file_object.read(maximum_read_size)

    display_name = mediator.GetDisplayName()
    for analyzer_object in self._analyzers:
      if self._abort:
        break

      for result in analyzer_object.GetResults():
        logger.debug((
            '[AnalyzeFileObject] attribute {0:s}:{1:s} calculated for '
            'file: {2:s}.').format(
                result.attribute_name, result.attribute_value, display_name))

        mediator.AddEventAttribute(
            result.attribute_name, result.attribute_value)

      analyzer_object.Reset()

    self.processing_status = definitions.PROCESSING_STATUS_RUNNING

  def _CanSkipDataStream(self, file_entry, data_stream):
    """Determines if analysis and extraction of a data stream can be skipped.

    This is used to prevent Plaso trying to run analyzers or extract content
    from a pipe or socket it encounters while processing a mounted filesystem.

    Args:
      file_entry (dfvfs.FileEntry): file entry to consider for skipping.
      data_stream (dfvfs.DataStream): data stream to consider for skipping.

    Returns:
      bool: True if the data stream can be skipped.
    """
    if file_entry.IsFile():
      return False

    if data_stream.IsDefault():
      return True

    return False

  def _CanSkipContentExtraction(self, file_entry):
    """Determines if content extraction of a file entry can be skipped.

    Args:
      file_entry (dfvfs.FileEntry): file entry of which to determine content
          extraction can be skipped.

    Returns:
      bool: True if content extraction can be skipped.
    """
    # TODO: make this filtering solution more generic. Also see:
    # https://github.com/log2timeline/plaso/issues/467
    location = getattr(file_entry.path_spec, 'location', None)
    if not location:
      return False

    data_stream_name = getattr(file_entry.path_spec, 'data_stream', None)
    if data_stream_name:
      return False

    file_system = file_entry.GetFileSystem()

    path_segments = file_system.SplitPath(location)
    if not path_segments:
      return False

    if self._CHROME_CACHE_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-1]
      location_segments.append('index')
      location = file_system.JoinPath(location_segments)
      index_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)

      if file_system.FileEntryExistsByPathSpec(index_path_spec):
        # TODO: improve this check if "index" is a Chrome Cache index file.
        return True

    elif self._FIREFOX_CACHE_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-4]
      location_segments.append('_CACHE_MAP_')
      location = file_system.JoinPath(location_segments)
      cache_map_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)

      if file_system.FileEntryExistsByPathSpec(cache_map_path_spec):
        # TODO: improve this check if "_CACHE_MAP_" is a Firefox Cache
        # version 1 cache map file.
        return True

    elif self._FIREFOX_CACHE2_DATA_FILE_RE.match(path_segments[-1]):
      location_segments = path_segments[:-2]
      location_segments.append('index')
      location = file_system.JoinPath(location_segments)
      index_path_spec = path_spec_factory.Factory.NewPathSpec(
          file_entry.type_indicator, location=location,
          parent=file_entry.path_spec.parent)

      if file_system.FileEntryExistsByPathSpec(index_path_spec):
        # TODO: improve this check if "index" is a Firefox Cache version 2
        # index file.
        return True

    elif len(path_segments) == 1 and path_segments[0].lower() in (
        'hiberfil.sys', 'pagefile.sys', 'swapfile.sys'):
      return True

    return False

  def _ExtractContentFromDataStream(
      self, mediator, file_entry, data_stream_name):
    """Extracts content from a data stream.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry to extract its content.
      data_stream_name (str): name of the data stream whose content is to be
          extracted.
    """
    self.processing_status = definitions.PROCESSING_STATUS_EXTRACTING

    if self._processing_profiler:
      self._processing_profiler.StartTiming('extracting')

    self._event_extractor.ParseDataStream(
        mediator, file_entry, data_stream_name)

    if self._processing_profiler:
      self._processing_profiler.StopTiming('extracting')

    self.processing_status = definitions.PROCESSING_STATUS_RUNNING

    self.last_activity_timestamp = time.time()

  def _ExtractMetadataFromFileEntry(self, mediator, file_entry, data_stream):
    """Extracts metadata from a file entry.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry to extract metadata from.
      data_stream (dfvfs.DataStream): data stream or None if the file entry
          has no data stream.
    """
    # Do not extract metadata from the root file entry when it is virtual.
    if file_entry.IsRoot() and file_entry.type_indicator not in (
        self._TYPES_WITH_ROOT_METADATA):
      return

    # We always want to extract the file entry metadata but we only want
    # to parse it once per file entry, so we only use it if we are
    # processing the default data stream of regular files.
    if data_stream and not data_stream.IsDefault():
      return

    display_name = mediator.GetDisplayName()
    logger.debug(
        '[ExtractMetadataFromFileEntry] processing file entry: {0:s}'.format(
            display_name))

    self.processing_status = definitions.PROCESSING_STATUS_EXTRACTING

    if self._processing_profiler:
      self._processing_profiler.StartTiming('extracting')

    self._event_extractor.ParseFileEntryMetadata(mediator, file_entry)

    if self._processing_profiler:
      self._processing_profiler.StopTiming('extracting')

    self.processing_status = definitions.PROCESSING_STATUS_RUNNING

  def _GetArchiveTypes(self, mediator, path_spec):
    """Determines if a data stream contains an archive such as: TAR or ZIP.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification of the data stream.

    Returns:
      list[str]: dfVFS archive type indicators found in the data stream.
    """
    try:
      type_indicators = analyzer.Analyzer.GetArchiveTypeIndicators(
          path_spec, resolver_context=mediator.resolver_context)
    except IOError as exception:
      type_indicators = []

      error_message = (
          'analyzer failed to determine archive type indicators '
          'with error: {0!s}').format(exception)
      mediator.ProduceExtractionError(error_message, path_spec=path_spec)

    return type_indicators

  def _GetCompressedStreamTypes(self, mediator, path_spec):
    """Determines if a data stream contains a compressed stream such as: gzip.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification of the data stream.

    Returns:
      list[str]: dfVFS compressed stream type indicators found in
          the data stream.
    """
    try:
      type_indicators = analyzer.Analyzer.GetCompressedStreamTypeIndicators(
          path_spec, resolver_context=mediator.resolver_context)
    except IOError as exception:
      type_indicators = []

      error_message = (
          'analyzer failed to determine compressed stream type indicators '
          'with error: {0!s}').format(exception)
      mediator.ProduceExtractionError(error_message, path_spec=path_spec)

    return type_indicators

  def _IsMetadataFile(self, file_entry):
    """Determines if the file entry is a metadata file.

    Args:
      file_entry (dfvfs.FileEntry): a file entry object.

    Returns:
      bool: True if the file entry is a metadata file.
    """
    if (file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK and
        file_entry.path_spec.location in self._METADATA_FILE_LOCATIONS_TSK):
      return True

    return False

  def _ProcessArchiveTypes(self, mediator, path_spec, type_indicators):
    """Processes a data stream containing archive types such as: TAR or ZIP.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicators(list[str]): dfVFS archive type indicators found in
          the data stream.
    """
    number_of_type_indicators = len(type_indicators)
    if number_of_type_indicators == 0:
      return

    self.processing_status = definitions.PROCESSING_STATUS_COLLECTING

    if number_of_type_indicators > 1:
      display_name = mediator.GetDisplayName()
      logger.debug((
          'Found multiple format type indicators: {0:s} for '
          'archive file: {1:s}').format(type_indicators, display_name))

    for type_indicator in type_indicators:
      if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TAR:
        archive_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_TAR, location='/',
            parent=path_spec)

      elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_ZIP:
        archive_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_ZIP, location='/',
            parent=path_spec)

      else:
        archive_path_spec = None

        error_message = (
            'unsupported archive format type indicator: {0:s}').format(
                type_indicator)
        mediator.ProduceExtractionError(
            error_message, path_spec=path_spec)

      if archive_path_spec:
        try:
          path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
              [archive_path_spec], resolver_context=mediator.resolver_context)

          for generated_path_spec in path_spec_generator:
            if self._abort:
              break

            event_source = event_sources.FileEntryEventSource(
                path_spec=generated_path_spec)
            event_source.file_entry_type = (
                dfvfs_definitions.FILE_ENTRY_TYPE_FILE)
            mediator.ProduceEventSource(event_source)

            self.last_activity_timestamp = time.time()

        except (IOError, errors.MaximumRecursionDepth) as exception:
          error_message = (
              'unable to process archive file with error: {0!s}').format(
                  exception)
          mediator.ProduceExtractionError(
              error_message, path_spec=generated_path_spec)

  def _ProcessCompressedStreamTypes(self, mediator, path_spec, type_indicators):
    """Processes a data stream containing compressed stream types such as: bz2.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicators(list[str]): dfVFS archive type indicators found in
          the data stream.
    """
    number_of_type_indicators = len(type_indicators)
    if number_of_type_indicators == 0:
      return

    self.processing_status = definitions.PROCESSING_STATUS_COLLECTING

    if number_of_type_indicators > 1:
      display_name = mediator.GetDisplayName()
      logger.debug((
          'Found multiple format type indicators: {0:s} for '
          'compressed stream file: {1:s}').format(
              type_indicators, display_name))

    for type_indicator in type_indicators:
      if type_indicator == dfvfs_definitions.TYPE_INDICATOR_BZIP2:
        compressed_stream_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_COMPRESSED_STREAM,
            compression_method=dfvfs_definitions.COMPRESSION_METHOD_BZIP2,
            parent=path_spec)

      elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_GZIP:
        compressed_stream_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_GZIP, parent=path_spec)

      else:
        compressed_stream_path_spec = None

        error_message = (
            'unsupported compressed stream format type indicators: '
            '{0:s}').format(type_indicator)
        mediator.ProduceExtractionError(
            error_message, path_spec=path_spec)

      if compressed_stream_path_spec:
        event_source = event_sources.FileEntryEventSource(
            path_spec=compressed_stream_path_spec)
        event_source.file_entry_type = dfvfs_definitions.FILE_ENTRY_TYPE_FILE
        mediator.ProduceEventSource(event_source)

        self.last_activity_timestamp = time.time()

  def _ProcessDirectory(self, mediator, file_entry):
    """Processes a directory file entry.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry of the directory.
    """
    self.processing_status = definitions.PROCESSING_STATUS_COLLECTING

    if self._processing_profiler:
      self._processing_profiler.StartTiming('collecting')

    for sub_file_entry in file_entry.sub_file_entries:
      if self._abort:
        break

      try:
        if not sub_file_entry.IsAllocated():
          continue

      except dfvfs_errors.BackEndError as exception:
        error_message = (
            'unable to process directory entry: {0:s} with error: '
            '{1!s}').format(sub_file_entry.name, exception)
        mediator.ProduceExtractionError(
            error_message, path_spec=file_entry.path_spec)
        continue

      # For TSK-based file entries only, ignore the virtual /$OrphanFiles
      # directory.
      if sub_file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK:
        if file_entry.IsRoot() and sub_file_entry.name == '$OrphanFiles':
          continue

      event_source = event_sources.FileEntryEventSource(
          path_spec=sub_file_entry.path_spec)

      # TODO: move this into a dfVFS file entry property.
      stat_object = sub_file_entry.GetStat()
      if stat_object:
        event_source.file_entry_type = stat_object.type

      mediator.ProduceEventSource(event_source)

      self.last_activity_timestamp = time.time()

    if self._processing_profiler:
      self._processing_profiler.StopTiming('collecting')

    self.processing_status = definitions.PROCESSING_STATUS_RUNNING

  def _ProcessFileEntry(self, mediator, file_entry):
    """Processes a file entry.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry.
    """
    display_name = mediator.GetDisplayName()
    logger.debug(
        '[ProcessFileEntry] processing file entry: {0:s}'.format(display_name))

    reference_count = mediator.resolver_context.GetFileObjectReferenceCount(
        file_entry.path_spec)

    try:
      if self._IsMetadataFile(file_entry):
        self._ProcessMetadataFile(mediator, file_entry)

      else:
        file_entry_processed = False
        for data_stream in file_entry.data_streams:
          if self._abort:
            break

          if self._CanSkipDataStream(file_entry, data_stream):
            logger.debug(
                ('[ProcessFileEntry] Skipping datastream {0:s} '
                 'for {1:s}: {2:s}').format(
                     data_stream.name, file_entry.type, display_name))
            continue

          self._ProcessFileEntryDataStream(mediator, file_entry, data_stream)

          file_entry_processed = True

        if not file_entry_processed:
          # For when the file entry does not contain a data stream.
          self._ProcessFileEntryDataStream(mediator, file_entry, None)

    finally:
      new_reference_count = (
          mediator.resolver_context.GetFileObjectReferenceCount(
              file_entry.path_spec))
      if reference_count != new_reference_count:
        # Clean up after parsers that do not call close explicitly.
        if mediator.resolver_context.ForceRemoveFileObject(
            file_entry.path_spec):
          logger.warning(
              'File-object not explicitly closed for file: {0:s}'.format(
                  display_name))

    logger.debug(
        '[ProcessFileEntry] done processing file entry: {0:s}'.format(
            display_name))

  def _ProcessFileEntryDataStream(self, mediator, file_entry, data_stream):
    """Processes a specific data stream of a file entry.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry containing the data stream.
      data_stream (dfvfs.DataStream): data stream or None if the file entry
          has no data stream.
    """
    display_name = mediator.GetDisplayName()
    data_stream_name = getattr(data_stream, 'name', '') or ''
    logger.debug((
        '[ProcessFileEntryDataStream] processing data stream: "{0:s}" of '
        'file entry: {1:s}').format(data_stream_name, display_name))

    mediator.ClearEventAttributes()

    if data_stream and self._analyzers:
      # Since AnalyzeDataStream generates event attributes it needs to be
      # called before producing events.
      self._AnalyzeDataStream(mediator, file_entry, data_stream.name)

    self._ExtractMetadataFromFileEntry(mediator, file_entry, data_stream)

    # Not every file entry has a data stream. In such cases we want to
    # extract the metadata only.
    if not data_stream:
      return

    # Determine if the content of the file entry should not be extracted.
    skip_content_extraction = self._CanSkipContentExtraction(file_entry)
    if skip_content_extraction:
      display_name = mediator.GetDisplayName()
      logger.debug(
          'Skipping content extraction of: {0:s}'.format(display_name))
      self.processing_status = definitions.PROCESSING_STATUS_IDLE
      return

    path_spec = copy.deepcopy(file_entry.path_spec)
    if data_stream and not data_stream.IsDefault():
      path_spec.data_stream = data_stream.name

    archive_types = []
    compressed_stream_types = []

    if self._process_compressed_streams:
      compressed_stream_types = self._GetCompressedStreamTypes(
          mediator, path_spec)

    if not compressed_stream_types:
      archive_types = self._GetArchiveTypes(mediator, path_spec)

    if archive_types:
      if self._process_archives:
        self._ProcessArchiveTypes(mediator, path_spec, archive_types)

      if dfvfs_definitions.TYPE_INDICATOR_ZIP in archive_types:
        # ZIP files are the base of certain file formats like docx.
        self._ExtractContentFromDataStream(
            mediator, file_entry, data_stream.name)

    elif compressed_stream_types:
      self._ProcessCompressedStreamTypes(
          mediator, path_spec, compressed_stream_types)

    else:
      self._ExtractContentFromDataStream(
          mediator, file_entry, data_stream.name)

  def _ProcessMetadataFile(self, mediator, file_entry):
    """Processes a metadata file.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      file_entry (dfvfs.FileEntry): file entry of the metadata file.
    """
    self.processing_status = definitions.PROCESSING_STATUS_EXTRACTING

    self._event_extractor.ParseFileEntryMetadata(mediator, file_entry)
    for data_stream in file_entry.data_streams:
      if self._abort:
        break
      self.last_activity_timestamp = time.time()

      self._event_extractor.ParseMetadataFile(
          mediator, file_entry, data_stream.name)

  def _SetHashers(self, hasher_names_string):
    """Sets the hasher names.

    Args:
      hasher_names_string (str): comma separated names of the hashers
          to enable, where 'none' disables the hashing analyzer.
    """
    if not hasher_names_string or hasher_names_string == 'none':
      return

    analyzer_object = analyzers_manager.AnalyzersManager.GetAnalyzerInstance(
        'hashing')
    analyzer_object.SetHasherNames(hasher_names_string)
    self._analyzers.append(analyzer_object)

  def _SetYaraRules(self, yara_rules_string):
    """Sets the Yara rules.

    Args:
      yara_rules_string (str): unparsed Yara rule definitions.
    """
    if not yara_rules_string:
      return

    analyzer_object = analyzers_manager.AnalyzersManager.GetAnalyzerInstance(
        'yara')
    analyzer_object.SetRules(yara_rules_string)
    self._analyzers.append(analyzer_object)

[docs]  def GetAnalyzerNames(self):
    """Gets the names of the active analyzers.

    Returns:
      list[str]: names of active analyzers.
    """
    return [analyzer_instance.NAME for analyzer_instance in self._analyzers]

[docs]  def ProcessPathSpec(self, mediator, path_spec):
    """Processes a path specification.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification.
    """
    self.last_activity_timestamp = time.time()
    self.processing_status = definitions.PROCESSING_STATUS_RUNNING

    file_entry = path_spec_resolver.Resolver.OpenFileEntry(
        path_spec, resolver_context=mediator.resolver_context)

    if file_entry is None:
      display_name = mediator.GetDisplayNameForPathSpec(path_spec)
      logger.warning(
          'Unable to open file entry with path spec: {0:s}'.format(
              display_name))
      self.processing_status = definitions.PROCESSING_STATUS_IDLE
      return

    mediator.SetFileEntry(file_entry)

    try:
      if file_entry.IsDirectory():
        self._ProcessDirectory(mediator, file_entry)
      self._ProcessFileEntry(mediator, file_entry)

    finally:
      mediator.ResetFileEntry()

      self.last_activity_timestamp = time.time()
      self.processing_status = definitions.PROCESSING_STATUS_IDLE

  # TODO: move the functionality of this method into the constructor.
[docs]  def SetExtractionConfiguration(self, configuration):
    """Sets the extraction configuration settings.

    Args:
      configuration (ExtractionConfiguration): extraction configuration.
    """
    self._hasher_file_size_limit = configuration.hasher_file_size_limit
    self._SetHashers(configuration.hasher_names_string)
    self._process_archives = configuration.process_archives
    self._process_compressed_streams = configuration.process_compressed_streams
    self._SetYaraRules(configuration.yara_rules_string)

[docs]  def SetProcessingProfiler(self, processing_profiler):
    """Sets the parsers profiler.

    Args:
      processing_profiler (ProcessingProfiler): processing profile.
    """
    self._processing_profiler = processing_profiler

[docs]  def SignalAbort(self):
    """Signals the extraction worker to abort."""
    self._abort = True