Source code for plaso.engine.extractors
# -*- coding: utf-8 -*-
"""The extractor class definitions.
An extractor is a class used to extract information from "raw" data.
"""
from __future__ import unicode_literals
import copy
import hashlib
import pysigscan
from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.resolver import resolver as path_spec_resolver
from plaso.engine import logger
from plaso.lib import errors
from plaso.parsers import interface as parsers_interface
from plaso.parsers import manager as parsers_manager
[docs]class EventExtractor(object):
"""Event extractor.
An event extractor extracts events from event sources.
"""
_PARSE_RESULT_FAILURE = 1
_PARSE_RESULT_SUCCESS = 2
_PARSE_RESULT_UNSUPPORTED = 3
def __init__(self, parser_filter_expression=None):
"""Initializes an event extractor.
Args:
parser_filter_expression (Optional[str]): the parser filter expression,
None represents all parsers and plugins.
The parser filter expression is a comma separated value string that
denotes a list of parser names to include and/or exclude. Each entry
can have the value of:
* An exact match of a list of parsers, or a preset (see
plaso/parsers/presets.py for a full list of available presets).
* A name of a single parser (case insensitive), e.g. msiecf.
* A glob name for a single parser, e.g. '*msie*' (case insensitive).
"""
super(EventExtractor, self).__init__()
self._file_scanner = None
self._filestat_parser = None
self._formats_with_signatures = None
self._mft_parser = None
self._non_sigscan_parser_names = None
self._parsers = None
self._parsers_profiler = None
self._usnjrnl_parser = None
self._InitializeParserObjects(
parser_filter_expression=parser_filter_expression)
def _CheckParserCanProcessFileEntry(self, parser, file_entry):
"""Determines if a parser can process a file entry.
Args:
file_entry (dfvfs.FileEntry): file entry.
parser (BaseParser): parser.
Returns:
bool: True if the file entry can be processed by the parser object.
"""
for filter_object in parser.FILTERS:
if filter_object.Match(file_entry):
return True
return False
def _GetSignatureMatchParserNames(self, file_object):
"""Determines if a file-like object matches one of the known signatures.
Args:
file_object (file): file-like object whose contents will be checked
for known signatures.
Returns:
list[str]: parser names for which the contents of the file-like object
matches their known signatures.
"""
parser_names = []
scan_state = pysigscan.scan_state()
self._file_scanner.scan_file_object(scan_state, file_object)
for scan_result in iter(scan_state.scan_results):
format_specification = (
self._formats_with_signatures.GetSpecificationBySignature(
scan_result.identifier))
if format_specification.identifier not in parser_names:
parser_names.append(format_specification.identifier)
return parser_names
def _InitializeParserObjects(self, parser_filter_expression=None):
"""Initializes the parser objects.
Args:
parser_filter_expression (Optional[str]): the parser filter expression,
None represents all parsers and plugins.
The parser filter expression is a comma separated value string that
denotes a list of parser names to include and/or exclude. Each entry
can have the value of:
* An exact match of a list of parsers, or a preset (see
plaso/parsers/presets.py for a full list of available presets).
* A name of a single parser (case insensitive), e.g. msiecf.
* A glob name for a single parser, e.g. '*msie*' (case insensitive).
"""
self._formats_with_signatures, non_sigscan_parser_names = (
parsers_manager.ParsersManager.GetFormatsWithSignatures(
parser_filter_expression=parser_filter_expression))
self._non_sigscan_parser_names = []
for parser_name in non_sigscan_parser_names:
if parser_name not in ('filestat', 'usnjrnl'):
self._non_sigscan_parser_names.append(parser_name)
self._file_scanner = parsers_manager.ParsersManager.CreateSignatureScanner(
self._formats_with_signatures)
self._parsers = parsers_manager.ParsersManager.GetParserObjects(
parser_filter_expression=parser_filter_expression)
active_parser_names = ', '.join(sorted(self._parsers.keys()))
logger.debug('Active parsers: {0:s}'.format(active_parser_names))
self._filestat_parser = self._parsers.get('filestat', None)
if 'filestat' in self._parsers:
del self._parsers['filestat']
self._mft_parser = self._parsers.get('mft', None)
self._usnjrnl_parser = self._parsers.get('usnjrnl', None)
if 'usnjrnl' in self._parsers:
del self._parsers['usnjrnl']
def _ParseDataStreamWithParser(
self, parser_mediator, parser, file_entry, data_stream_name):
"""Parses a data stream of a file entry with a specific parser.
Args:
parser_mediator (ParserMediator): parser mediator.
parser (BaseParser): parser.
file_entry (dfvfs.FileEntry): file entry.
data_stream_name (str): data stream name.
Raises:
RuntimeError: if the file-like object is missing.
"""
file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
if not file_object:
raise RuntimeError(
'Unable to retrieve file-like object from file entry.')
try:
self._ParseFileEntryWithParser(
parser_mediator, parser, file_entry, file_object=file_object)
finally:
file_object.close()
def _ParseFileEntryWithParser(
self, parser_mediator, parser, file_entry, file_object=None):
"""Parses a file entry with a specific parser.
Args:
parser_mediator (ParserMediator): parser mediator.
parser (BaseParser): parser.
file_entry (dfvfs.FileEntry): file entry.
file_object (Optional[file]): file-like object to parse.
If not set the parser will use the parser mediator to open
the file entry's default data stream as a file-like object.
Returns:
int: parse result which is _PARSE_RESULT_FAILURE if the file entry
could not be parsed, _PARSE_RESULT_SUCCESS if the file entry
successfully was parsed or _PARSE_RESULT_UNSUPPORTED when
UnableToParseFile was raised.
Raises:
TypeError: if parser object is not a supported parser type.
"""
if not isinstance(parser, (
parsers_interface.FileEntryParser, parsers_interface.FileObjectParser)):
raise TypeError('Unsupported parser object type.')
parser_mediator.ClearParserChain()
reference_count = (
parser_mediator.resolver_context.GetFileObjectReferenceCount(
file_entry.path_spec))
parser_mediator.SampleStartTiming(parser.NAME)
try:
if isinstance(parser, parsers_interface.FileEntryParser):
parser.Parse(parser_mediator)
elif isinstance(parser, parsers_interface.FileObjectParser):
parser.Parse(parser_mediator, file_object)
result = self._PARSE_RESULT_SUCCESS
# We catch IOError so we can determine the parser that generated the error.
except (IOError, dfvfs_errors.BackEndError) as exception:
display_name = parser_mediator.GetDisplayName(file_entry)
logger.warning(
'{0:s} unable to parse file: {1:s} with error: {2!s}'.format(
parser.NAME, display_name, exception))
result = self._PARSE_RESULT_FAILURE
except errors.UnableToParseFile as exception:
display_name = parser_mediator.GetDisplayName(file_entry)
logger.debug(
'{0:s} unable to parse file: {1:s} with error: {2!s}'.format(
parser.NAME, display_name, exception))
result = self._PARSE_RESULT_UNSUPPORTED
finally:
parser_mediator.SampleStopTiming(parser.NAME)
parser_mediator.SampleMemoryUsage(parser.NAME)
new_reference_count = (
parser_mediator.resolver_context.GetFileObjectReferenceCount(
file_entry.path_spec))
if reference_count != new_reference_count:
display_name = parser_mediator.GetDisplayName(file_entry)
logger.warning((
'[{0:s}] did not explicitly close file-object for file: '
'{1:s}.').format(parser.NAME, display_name))
return result
def _ParseFileEntryWithParsers(
self, parser_mediator, parser_names, file_entry, file_object=None):
"""Parses a file entry with a specific parsers.
Args:
parser_mediator (ParserMediator): parser mediator.
parser_names (list[str]): names of parsers.
file_entry (dfvfs.FileEntry): file entry.
file_object (Optional[file]): file-like object to parse.
If not set the parser will use the parser mediator to open
the file entry's default data stream as a file-like object.
Returns:
int: parse result which is _PARSE_RESULT_FAILURE if the file entry
could not be parsed, _PARSE_RESULT_SUCCESS if the file entry
successfully was parsed or _PARSE_RESULT_UNSUPPORTED when
UnableToParseFile was raised or no names of parser were provided.
Raises:
RuntimeError: if the parser object is missing.
"""
parse_results = self._PARSE_RESULT_UNSUPPORTED
for parser_name in parser_names:
parser = self._parsers.get(parser_name, None)
if not parser:
raise RuntimeError(
'Parser object missing for parser: {0:s}'.format(parser_name))
if parser.FILTERS:
if not self._CheckParserCanProcessFileEntry(parser, file_entry):
parse_results = self._PARSE_RESULT_SUCCESS
continue
display_name = parser_mediator.GetDisplayName(file_entry)
logger.debug((
'[ParseFileEntryWithParsers] parsing file: {0:s} with parser: '
'{1:s}').format(display_name, parser_name))
parse_result = self._ParseFileEntryWithParser(
parser_mediator, parser, file_entry, file_object=file_object)
if parse_result == self._PARSE_RESULT_FAILURE:
return self._PARSE_RESULT_FAILURE
elif parse_result == self._PARSE_RESULT_SUCCESS:
parse_results = self._PARSE_RESULT_SUCCESS
return parse_results
[docs] def ParseDataStream(self, parser_mediator, file_entry, data_stream_name):
"""Parses a data stream of a file entry with the enabled parsers.
Args:
parser_mediator (ParserMediator): parser mediator.
file_entry (dfvfs.FileEntry): file entry.
data_stream_name (str): data stream name.
Raises:
RuntimeError: if the file-like object or the parser object is missing.
"""
file_object = file_entry.GetFileObject(data_stream_name=data_stream_name)
if not file_object:
raise RuntimeError(
'Unable to retrieve file-like object from file entry.')
try:
parser_names = self._GetSignatureMatchParserNames(file_object)
parse_with_non_sigscan_parsers = True
if parser_names:
parse_result = self._ParseFileEntryWithParsers(
parser_mediator, parser_names, file_entry, file_object=file_object)
if parse_result in (
self._PARSE_RESULT_FAILURE, self._PARSE_RESULT_SUCCESS):
parse_with_non_sigscan_parsers = False
if parse_with_non_sigscan_parsers:
self._ParseFileEntryWithParsers(
parser_mediator, self._non_sigscan_parser_names, file_entry,
file_object=file_object)
finally:
file_object.close()
[docs] def ParseFileEntryMetadata(self, parser_mediator, file_entry):
"""Parses the file entry metadata e.g. file system data.
Args:
parser_mediator (ParserMediator): parser mediator.
file_entry (dfvfs.FileEntry): file entry.
"""
if self._filestat_parser:
self._ParseFileEntryWithParser(
parser_mediator, self._filestat_parser, file_entry)
[docs] def ParseMetadataFile(
self, parser_mediator, file_entry, data_stream_name):
"""Parses a metadata file.
Args:
parser_mediator (ParserMediator): parser mediator.
file_entry (dfvfs.FileEntry): file entry.
data_stream_name (str): data stream name.
"""
parent_path_spec = getattr(file_entry.path_spec, 'parent', None)
filename_upper = file_entry.name.upper()
if (self._mft_parser and parent_path_spec and
filename_upper in ('$MFT', '$MFTMIRR') and not data_stream_name):
self._ParseDataStreamWithParser(
parser_mediator, self._mft_parser, file_entry, '')
elif (self._usnjrnl_parser and parent_path_spec and
filename_upper == '$USNJRNL' and data_stream_name == '$J'):
# To be able to ignore the sparse data ranges the UsnJrnl parser
# needs to read directly from the volume.
volume_file_object = path_spec_resolver.Resolver.OpenFileObject(
parent_path_spec, resolver_context=parser_mediator.resolver_context)
try:
self._ParseFileEntryWithParser(
parser_mediator, self._usnjrnl_parser, file_entry,
file_object=volume_file_object)
finally:
volume_file_object.close()
[docs]class PathSpecExtractor(object):
"""Path specification extractor.
A path specification extractor extracts path specification from a source
directory, file or storage media device or image.
"""
_MAXIMUM_DEPTH = 255
def __init__(self, duplicate_file_check=False):
"""Initializes a path specification extractor.
The source collector discovers all the file entries in the source.
The source can be a single file, directory or a volume within
a storage media image or device.
Args:
duplicate_file_check (Optional[bool]): True if duplicate files should
be ignored.
"""
super(PathSpecExtractor, self).__init__()
self._duplicate_file_check = duplicate_file_check
self._hashlist = {}
def _CalculateNTFSTimeHash(self, file_entry):
"""Calculates an MD5 from the date and time value of a NTFS file entry.
Args:
file_entry (dfvfs.FileEntry): file entry.
Returns:
str: hexadecimal representation of the MD5 hash value of the date and
time values of the file entry.
"""
date_time_values = []
access_time = getattr(file_entry, 'access_time', None)
if access_time:
date_time_string = access_time.CopyToDateTimeString()
date_time_values.append('atime:{0:s}'.format(date_time_string))
creation_time = getattr(file_entry, 'creation_time', None)
if creation_time:
date_time_string = creation_time.CopyToDateTimeString()
date_time_values.append('crtime:{0:s}'.format(date_time_string))
modification_time = getattr(file_entry, 'modification_time', None)
if modification_time:
date_time_string = modification_time.CopyToDateTimeString()
date_time_values.append('mtime:{0:s}'.format(date_time_string))
# file_entry.change_time is an alias of file_entry.entry_modification_time.
change_time = getattr(file_entry, 'change_time', None)
if change_time:
date_time_string = change_time.CopyToDateTimeString()
date_time_values.append('ctime:{0:s}'.format(date_time_string))
date_time_values = ''.join(date_time_values)
date_time_values = date_time_values.encode('ascii')
hash_value = hashlib.md5()
hash_value.update(date_time_values)
return hash_value.hexdigest()
def _ExtractPathSpecs(
self, path_spec, find_specs=None, recurse_file_system=True,
resolver_context=None):
"""Extracts path specification from a specific source.
Args:
path_spec (dfvfs.PathSpec): path specification.
find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
used in path specification extraction.
recurse_file_system (Optional[bool]): True if extraction should
recurse into a file system.
resolver_context (Optional[dfvfs.Context]): resolver context.
Yields:
dfvfs.PathSpec: path specification of a file entry found in the source.
"""
try:
file_entry = path_spec_resolver.Resolver.OpenFileEntry(
path_spec, resolver_context=resolver_context)
except (
dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
dfvfs_errors.PathSpecError) as exception:
logger.error(
'Unable to open file entry with error: {0!s}'.format(exception))
return
if not file_entry:
logger.warning('Unable to open: {0:s}'.format(path_spec.comparable))
return
if (not file_entry.IsDirectory() and not file_entry.IsFile() and
not file_entry.IsDevice()):
logger.warning((
'Source path specification not a device, file or directory.\n'
'{0:s}').format(path_spec.comparable))
return
if file_entry.IsFile():
yield path_spec
else:
for extracted_path_spec in self._ExtractPathSpecsFromFileSystem(
path_spec, find_specs=find_specs,
recurse_file_system=recurse_file_system,
resolver_context=resolver_context):
yield extracted_path_spec
def _ExtractPathSpecsFromDirectory(self, file_entry, depth=0):
"""Extracts path specification from a directory.
Args:
file_entry (dfvfs.FileEntry): file entry that refers to the directory.
depth (Optional[int]): current depth where 0 represents the file system
root.
Yields:
dfvfs.PathSpec: path specification of a file entry found in the directory.
"""
if depth >= self._MAXIMUM_DEPTH:
raise errors.MaximumRecursionDepth('Maximum recursion depth reached.')
# Need to do a breadth-first search otherwise we'll hit the Python
# maximum recursion depth.
sub_directories = []
for sub_file_entry in file_entry.sub_file_entries:
try:
if not sub_file_entry.IsAllocated() or sub_file_entry.IsLink():
continue
except dfvfs_errors.BackEndError as exception:
logger.warning(
'Unable to process file: {0:s} with error: {1!s}'.format(
sub_file_entry.path_spec.comparable.replace(
'\n', ';'), exception))
continue
# For TSK-based file entries only, ignore the virtual /$OrphanFiles
# directory.
if sub_file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK:
if file_entry.IsRoot() and sub_file_entry.name == '$OrphanFiles':
continue
if sub_file_entry.IsDirectory():
sub_directories.append(sub_file_entry)
elif sub_file_entry.IsFile():
# If we are dealing with a VSS we want to calculate a hash
# value based on available timestamps and compare that to previously
# calculated hash values, and only include the file into the queue if
# the hash does not match.
if self._duplicate_file_check:
hash_value = self._CalculateNTFSTimeHash(sub_file_entry)
inode = getattr(sub_file_entry.path_spec, 'inode', 0)
if inode in self._hashlist:
if hash_value in self._hashlist[inode]:
continue
self._hashlist.setdefault(inode, []).append(hash_value)
for path_spec in self._ExtractPathSpecsFromFile(sub_file_entry):
yield path_spec
for sub_file_entry in sub_directories:
try:
for path_spec in self._ExtractPathSpecsFromDirectory(
sub_file_entry, depth=(depth + 1)):
yield path_spec
except (
IOError, dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
dfvfs_errors.PathSpecError) as exception:
logger.warning('{0!s}'.format(exception))
def _ExtractPathSpecsFromFile(self, file_entry):
"""Extracts path specification from a file.
Args:
file_entry (dfvfs.FileEntry): file entry that refers to the file.
Yields:
dfvfs.PathSpec: path specification of a file entry found in the file.
"""
produced_main_path_spec = False
for data_stream in file_entry.data_streams:
# Make a copy so we don't make the changes on a path specification
# directly. Otherwise already produced path specifications can be
# altered in the process.
path_spec = copy.deepcopy(file_entry.path_spec)
if data_stream.name:
setattr(path_spec, 'data_stream', data_stream.name)
yield path_spec
if not data_stream.name:
produced_main_path_spec = True
if not produced_main_path_spec:
yield file_entry.path_spec
def _ExtractPathSpecsFromFileSystem(
self, path_spec, find_specs=None, recurse_file_system=True,
resolver_context=None):
"""Extracts path specification from a file system within a specific source.
Args:
path_spec (dfvfs.PathSpec): path specification of the root of
the file system.
find_specs (Optional[list[dfvfs.FindSpec]]): find specifications.
recurse_file_system (Optional[bool]): True if extraction should
recurse into a file system.
resolver_context (Optional[dfvfs.Context]): resolver context.
Yields:
dfvfs.PathSpec: path specification of a file entry found in
the file system.
"""
try:
file_system = path_spec_resolver.Resolver.OpenFileSystem(
path_spec, resolver_context=resolver_context)
except (
dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
dfvfs_errors.PathSpecError) as exception:
logger.error(
'Unable to open file system with error: {0!s}'.format(exception))
return
try:
if find_specs:
searcher = file_system_searcher.FileSystemSearcher(
file_system, path_spec)
for extracted_path_spec in searcher.Find(find_specs=find_specs):
yield extracted_path_spec
elif recurse_file_system:
file_entry = file_system.GetFileEntryByPathSpec(path_spec)
if file_entry:
for extracted_path_spec in self._ExtractPathSpecsFromDirectory(
file_entry):
yield extracted_path_spec
else:
yield path_spec
except (
dfvfs_errors.AccessError, dfvfs_errors.BackEndError,
dfvfs_errors.PathSpecError) as exception:
logger.warning('{0!s}'.format(exception))
finally:
file_system.Close()
[docs] def ExtractPathSpecs(
self, path_specs, find_specs=None, recurse_file_system=True,
resolver_context=None):
"""Extracts path specification from a specific source.
Args:
path_specs (Optional[list[dfvfs.PathSpec]]): path specifications.
find_specs (Optional[list[dfvfs.FindSpec]]): find specifications.
recurse_file_system (Optional[bool]): True if extraction should
recurse into a file system.
resolver_context (Optional[dfvfs.Context]): resolver context.
Yields:
dfvfs.PathSpec: path specification of a file entry found in the source.
"""
for path_spec in path_specs:
for extracted_path_spec in self._ExtractPathSpecs(
path_spec, find_specs=find_specs,
recurse_file_system=recurse_file_system,
resolver_context=resolver_context):
yield extracted_path_spec