# -*- coding: utf-8 -*-
"""Parsers for Opera Browser history files."""

from __future__ import unicode_literals

import os

try:
  import urlparse
except ImportError:
  from urllib import parse as urlparse

# pylint: disable=wrong-import-position
from defusedxml import ElementTree
from dfdatetime import posix_time as dfdatetime_posix_time
from dfdatetime import semantic_time as dfdatetime_semantic_time
from dfdatetime import time_elements as dfdatetime_time_elements
from dfvfs.helpers import text_file

from plaso.containers import events
from plaso.containers import time_events
from plaso.lib import definitions
from plaso.lib import errors
from plaso.parsers import interface
from plaso.parsers import manager


class OperaTypedHistoryEventData(events.EventData):
  """Opera typed history entry data.

  Attributes:
    entry_selection (str): information about whether the URL was directly
        typed in or the result of the user choosing from the auto complete.
    entry_type (str): type of the typed history entry, either "selected" for
        an auto complete selection or "text" for a manually typed URL.
    url (str): typed URL or hostname.
  """

  DATA_TYPE = 'opera:history:typed_entry'

  def __init__(self):
    """Initializes event data."""
    super(OperaTypedHistoryEventData, self).__init__(data_type=self.DATA_TYPE)
    self.entry_selection = None
    self.entry_type = None
    self.url = None


class OperaGlobalHistoryEventData(events.EventData):
  """Opera global history entry data.

  Attributes:
    description (str): description of the entry, either "First and Only
        Visit" or "Last Visit".
    popularity_index (int): popularity index.
    title (str): title.
    url (str): URL.
  """

  DATA_TYPE = 'opera:history:entry'

  def __init__(self):
    """Initializes event data."""
    super(OperaGlobalHistoryEventData, self).__init__(data_type=self.DATA_TYPE)
    self.description = None
    self.popularity_index = None
    self.title = None
    self.url = None
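

# Illustrative sketch, not used by the parsers below: the containers above
# are plain event data attribute containers, so a parser populates them with
# simple attribute assignment. The sample values are hypothetical.
def _ExampleTypedHistoryEventData():
  """Returns an OperaTypedHistoryEventData filled with sample values."""
  event_data = OperaTypedHistoryEventData()
  event_data.url = 'example.com'
  event_data.entry_type = 'text'
  event_data.entry_selection = 'Manually typed.'
  return event_data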


class OperaTypedHistoryParser(interface.FileObjectParser):
  """Parses the Opera typed_history.xml file."""

  NAME = 'opera_typed_history'
  DESCRIPTION = 'Parser for Opera typed_history.xml files.'

  _HEADER_READ_SIZE = 128

  def ParseFileObject(self, parser_mediator, file_object):
    """Parses an Opera typed history file-like object.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    data = file_object.read(self._HEADER_READ_SIZE)

    if not data.startswith(b'<?xml'):
      raise errors.UnableToParseFile(
          'Not an Opera typed history file [not XML]')

    _, _, data = data.partition(b'\n')
    if not data.startswith(b'<typed_history'):
      raise errors.UnableToParseFile(
          'Not an Opera typed history file [wrong XML root key]')

    # For ElementTree to work we need to seek the file object back to
    # the beginning.
    file_object.seek(0, os.SEEK_SET)

    xml = ElementTree.parse(file_object)

    for history_item in xml.iterfind('typed_history_item'):
      event_data = OperaTypedHistoryEventData()
      event_data.entry_type = history_item.get('type', None)
      event_data.url = history_item.get('content', None)

      if event_data.entry_type == 'selected':
        event_data.entry_selection = 'Filled from autocomplete.'
      elif event_data.entry_type == 'text':
        event_data.entry_selection = 'Manually typed.'

      last_typed_time = history_item.get('last_typed', None)
      if last_typed_time is None:
        parser_mediator.ProduceExtractionWarning('missing last typed time.')
        continue

      date_time = dfdatetime_time_elements.TimeElements()

      try:
        date_time.CopyFromStringISO8601(last_typed_time)
      except ValueError as exception:
        parser_mediator.ProduceExtractionWarning(
            'unsupported last typed time: {0:s} with error: {1!s}.'.format(
                last_typed_time, exception))
        continue

      event = time_events.DateTimeValuesEvent(
          date_time, definitions.TIME_DESCRIPTION_LAST_VISITED)
      parser_mediator.ProduceEventWithEventData(event, event_data)
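

# A minimal, self-contained sketch of the typed_history.xml layout the parser
# above expects. The element and attribute names ('typed_history',
# 'typed_history_item', 'content', 'type', 'last_typed') mirror what
# ParseFileObject reads; the sample values themselves are hypothetical. The
# constant and helper below are illustrative only and are not used by the
# parser.
_EXAMPLE_TYPED_HISTORY_XML = b"""<?xml version="1.0" encoding="utf-8"?>
<typed_history>
  <typed_history_item content="example.com" type="text"
      last_typed="2019-07-18T13:35:22Z"/>
</typed_history>
"""


def _ExampleIterateTypedHistoryItems(xml_data=_EXAMPLE_TYPED_HISTORY_XML):
  """Yields (url, entry type, last typed time) tuples from sample XML data."""
  root = ElementTree.fromstring(xml_data)
  for history_item in root.iterfind('typed_history_item'):
    yield (
        history_item.get('content', None), history_item.get('type', None),
        history_item.get('last_typed', None))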


class OperaGlobalHistoryParser(interface.FileObjectParser):
  """Parses the Opera global_history.dat file."""

  NAME = 'opera_global'
  DESCRIPTION = 'Parser for Opera global_history.dat files.'

  _ENCODING = 'utf-8'
  _MAXIMUM_LINE_SIZE = 512

  _SUPPORTED_URL_SCHEMES = frozenset(['file', 'http', 'https', 'ftp'])

  def _IsValidUrl(self, url):
    """Checks if a URL is considered valid.

    Args:
      url (str): URL of the history entry.

    Returns:
      bool: True if the URL is valid.
    """
    parsed_url = urlparse.urlparse(url)
    return parsed_url.scheme in self._SUPPORTED_URL_SCHEMES

  def _ParseRecord(self, parser_mediator, text_file_object):
    """Parses an Opera global history record.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      text_file_object (dfvfs.TextFile): text file.

    Returns:
      bool: True if the record was successfully parsed.
    """
    try:
      title = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode title')
      return False

    if not title:
      return False

    try:
      url = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode url')
      return False

    try:
      timestamp = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode timestamp')
      return False

    try:
      popularity_index = text_file_object.readline()
    except UnicodeDecodeError:
      parser_mediator.ProduceExtractionWarning(
          'unable to read and decode popularity index')
      return False

    event_data = OperaGlobalHistoryEventData()

    event_data.url = url.strip()

    title = title.strip()
    if title != event_data.url:
      event_data.title = title

    popularity_index = popularity_index.strip()
    try:
      event_data.popularity_index = int(popularity_index, 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          'unable to convert popularity index: {0:s}'.format(
              popularity_index))

    # Guard against a popularity index that could not be converted, which
    # would otherwise make the comparison below fail.
    if (event_data.popularity_index is not None and
        event_data.popularity_index < 0):
      event_data.description = 'First and Only Visit'
    else:
      event_data.description = 'Last Visit'

    timestamp = timestamp.strip()
    try:
      timestamp = int(timestamp, 10)
    except ValueError:
      parser_mediator.ProduceExtractionWarning(
          'unable to convert timestamp: {0:s}'.format(timestamp))
      timestamp = None

    if timestamp is None:
      date_time = dfdatetime_semantic_time.SemanticTime('Invalid')
    else:
      date_time = dfdatetime_posix_time.PosixTime(timestamp=timestamp)

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_LAST_VISITED)
    parser_mediator.ProduceEventWithEventData(event, event_data)

    return True

  def _ParseAndValidateRecord(self, parser_mediator, text_file_object):
    """Parses and validates an Opera global history record.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      text_file_object (dfvfs.TextFile): text file.

    Returns:
      bool: True if the record was successfully parsed.
    """
    try:
      title = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      url = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      timestamp = text_file_object.readline(size=self._MAXIMUM_LINE_SIZE)
      popularity_index = text_file_object.readline(
          size=self._MAXIMUM_LINE_SIZE)
    except UnicodeDecodeError:
      return False

    if len(title) == self._MAXIMUM_LINE_SIZE and title[-1] != '\n':
      return False

    if len(url) == self._MAXIMUM_LINE_SIZE and url[-1] != '\n':
      return False

    if len(timestamp) == self._MAXIMUM_LINE_SIZE and timestamp[-1] != '\n':
      return False

    if (len(popularity_index) == self._MAXIMUM_LINE_SIZE and
        popularity_index[-1] != '\n'):
      return False

    title = title.strip()
    url = url.strip()
    timestamp = timestamp.strip()
    popularity_index = popularity_index.strip()

    if not title or not url or not timestamp or not popularity_index:
      return False

    event_data = OperaGlobalHistoryEventData()

    if not self._IsValidUrl(url):
      return False

    event_data.url = url
    if title != url:
      event_data.title = title

    try:
      event_data.popularity_index = int(popularity_index, 10)
      timestamp = int(timestamp, 10)
    except ValueError:
      return False

    if event_data.popularity_index < 0:
      event_data.description = 'First and Only Visit'
    else:
      event_data.description = 'Last Visit'

    date_time = dfdatetime_posix_time.PosixTime(timestamp=timestamp)
    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_LAST_VISITED)
    parser_mediator.ProduceEventWithEventData(event, event_data)

    return True

  def ParseFileObject(self, parser_mediator, file_object):
    """Parses an Opera global history file-like object.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      file_object (dfvfs.FileIO): file-like object.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    encoding = self._ENCODING or parser_mediator.codepage
    text_file_object = text_file.TextFile(file_object, encoding=encoding)

    if not self._ParseAndValidateRecord(parser_mediator, text_file_object):
      raise errors.UnableToParseFile(
          'Unable to parse as Opera global_history.dat.')

    while self._ParseRecord(parser_mediator, text_file_object):
      pass
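

# A minimal, self-contained sketch of the global_history.dat record layout
# the parser above expects: four text lines per entry, in the order title,
# URL, last visited time as a POSIX timestamp and popularity index. The
# sample record and helper below use hypothetical values, are illustrative
# only and are not used by the parser.
_EXAMPLE_GLOBAL_HISTORY_RECORD = (
    'Example Domain\n'
    'http://example.com/\n'
    '1563456922\n'
    '-1\n')


def _ExampleReadGlobalHistoryRecord(text_file_object):
  """Reads one (title, url, date_time, popularity_index) tuple.

  Mirrors the line-per-field logic of _ParseRecord, but works on any
  file-like object that yields text lines, for example
  io.StringIO(_EXAMPLE_GLOBAL_HISTORY_RECORD).
  """
  title = text_file_object.readline().strip()
  url = text_file_object.readline().strip()
  timestamp = int(text_file_object.readline().strip(), 10)
  popularity_index = int(text_file_object.readline().strip(), 10)
  date_time = dfdatetime_posix_time.PosixTime(timestamp=timestamp)
  return title, url, date_time, popularity_index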


manager.ParsersManager.RegisterParsers([
    OperaTypedHistoryParser, OperaGlobalHistoryParser])