# -*- coding: utf-8 -*-
"""Apache access log (access.log) parser.
Parser based on the two default apache formats, common and combined log format
defined in https://httpd.apache.org/docs/2.4/logs.html
"""
from __future__ import unicode_literals
import pyparsing
from dfdatetime import time_elements as dfdatetime_time_elements
from plaso.containers import events
from plaso.containers import time_events
from plaso.lib import errors
from plaso.lib import definitions
from plaso.lib import timelib
from plaso.parsers import manager
from plaso.parsers import text_parser
[docs]class ApacheAccessEventData(events.EventData):
"""Apache access event data.
Attributes:
http_request_referer (str): http request referer header information.
http_request (str): first line of http request.
http_request_user_agent (str): http request user agent header information.
http_response_bytes (int): http response bytes size without headers.
http_response_code (int): http response code from server.
ip_address (str): IPv4 or IPv6 addresses.
remote_name (str): remote logname (from identd, if supplied).
user_name (str): logged user name.
"""
DATA_TYPE = 'apache:access'
def __init__(self):
"""Initializes event data."""
super(ApacheAccessEventData, self).__init__(data_type=self.DATA_TYPE)
self.http_request = None
self.http_request_referer = None
self.http_request_user_agent = None
self.http_response_bytes = None
self.http_response_code = None
self.ip_address = None
self.remote_name = None
self.user_name = None
[docs]class ApacheAccessParser(text_parser.PyparsingSingleLineTextParser):
"""Apache access log file parser"""
NAME = 'apache_access'
DESCRIPTION = 'Apache access Parser'
MAX_LINE_LENGTH = 2048
# Date format [18/Sep/2011:19:18:28 -0400]
_DATE_TIME = pyparsing.Group(
pyparsing.Suppress('[') +
text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('day') +
pyparsing.Suppress('/') +
text_parser.PyparsingConstants.THREE_LETTERS.setResultsName('month') +
pyparsing.Suppress('/') +
text_parser.PyparsingConstants.FOUR_DIGITS.setResultsName('year') +
pyparsing.Suppress(':') +
text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('hours') +
pyparsing.Suppress(':') +
text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('minutes') +
pyparsing.Suppress(':') +
text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('seconds') +
pyparsing.Combine(
pyparsing.oneOf(['-', '+']) +
pyparsing.Word(
pyparsing.nums, exact=4)).setResultsName('time_offset') +
pyparsing.Suppress(']')).setResultsName('date_time')
_HTTP_REQUEST = (
pyparsing.Suppress('"') +
pyparsing.SkipTo('"').setResultsName('http_request') +
pyparsing.Suppress('"'))
_REMOTE_NAME = (
pyparsing.Word(pyparsing.alphanums) |
pyparsing.Literal('-')).setResultsName('remote_name')
_RESPONSE_BYTES = (
pyparsing.Literal('-') |
text_parser.PyparsingConstants.INTEGER).setResultsName('response_bytes')
_REFERER = (
pyparsing.Suppress('"') +
pyparsing.SkipTo('"').setResultsName('referer') +
pyparsing.Suppress('"'))
_USER_AGENT = (
pyparsing.Suppress('"') +
pyparsing.SkipTo('"').setResultsName('user_agent') +
pyparsing.Suppress('"'))
_USER_NAME = (
pyparsing.Word(pyparsing.alphanums) |
pyparsing.Literal('-')).setResultsName('user_name')
# Defined in https://httpd.apache.org/docs/2.4/logs.html
# format: "%h %l %u %t \"%r\" %>s %b"
_COMMON_LOG_FORMAT_LINE = (
text_parser.PyparsingConstants.IP_ADDRESS.setResultsName('ip_address') +
_REMOTE_NAME +
_USER_NAME +
_DATE_TIME +
_HTTP_REQUEST +
text_parser.PyparsingConstants.INTEGER.setResultsName('response_code') +
_RESPONSE_BYTES +
pyparsing.lineEnd())
# Defined in https://httpd.apache.org/docs/2.4/logs.html
# format: "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""
_COMBINED_LOG_FORMAT_LINE = (
text_parser.PyparsingConstants.IP_ADDRESS.setResultsName('ip_address') +
_REMOTE_NAME +
_USER_NAME +
_DATE_TIME +
_HTTP_REQUEST +
text_parser.PyparsingConstants.INTEGER.setResultsName('response_code') +
_RESPONSE_BYTES +
_REFERER +
_USER_AGENT +
pyparsing.lineEnd())
LINE_STRUCTURES = [
('combined_log_format', _COMBINED_LOG_FORMAT_LINE),
('common_log_format', _COMMON_LOG_FORMAT_LINE)]
_SUPPORTED_KEYS = frozenset([key for key, _ in LINE_STRUCTURES])
# TODO: migrate function after dfdatetime issue #47 is fixed.
def _GetISO8601String(self, structure):
"""Normalize date time parsed format to an ISO 8601 date time string.
The date and time values in Apache access log files are formatted as:
"[18/Sep/2011:19:18:28 -0400]".
Args:
structure (pyparsing.ParseResults): structure of tokens derived from a
line of a text file.
Returns:
str: ISO 8601 date time string.
Raises:
ValueError: if the structure cannot be converted into a date time string.
"""
month = self._GetValueFromStructure(structure, 'month')
try:
month = timelib.MONTH_DICT.get(month.lower(), 0)
except AttributeError as exception:
raise ValueError('unable to parse month with error: {0!s}.'.format(
exception))
time_offset = self._GetValueFromStructure(structure, 'time_offset')
try:
time_offset_hours = int(time_offset[1:3], 10)
time_offset_minutes = int(time_offset[3:5], 10)
except (IndexError, TypeError, ValueError) as exception:
raise ValueError(
'unable to parse time zone offset with error: {0!s}.'.format(
exception))
year = self._GetValueFromStructure(structure, 'year')
day_of_month = self._GetValueFromStructure(structure, 'day')
hours = self._GetValueFromStructure(structure, 'hours')
minutes = self._GetValueFromStructure(structure, 'minutes')
seconds = self._GetValueFromStructure(structure, 'seconds')
try:
date_time_string = (
'{0:04d}-{1:02d}-{2:02d}T{3:02d}:{4:02d}:{5:02d}.000000'
'{6:s}{7:02d}:{8:02d}').format(
year, month, day_of_month, hours, minutes, seconds,
time_offset[0], time_offset_hours, time_offset_minutes)
except ValueError as exception:
raise ValueError(
'unable to format date time string with error: {0!s}.'.format(
exception))
return date_time_string
[docs] def ParseRecord(self, parser_mediator, key, structure):
"""Parses a matching entry.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfvfs.
key (str): name of the parsed structure.
structure (pyparsing.ParseResults): elements parsed from the file.
Raises:
ParseError: when the structure type is unknown.
"""
if key not in self._SUPPORTED_KEYS:
raise errors.ParseError(
'Unable to parse record, unknown structure: {0:s}'.format(key))
date_time = dfdatetime_time_elements.TimeElements()
date_time_string = self._GetValueFromStructure(structure, 'date_time')
try:
iso_date_time = self._GetISO8601String(date_time_string)
date_time.CopyFromStringISO8601(iso_date_time)
except ValueError:
parser_mediator.ProduceExtractionWarning(
'invalid date time value: {0!s}'.format(date_time_string))
return
event = time_events.DateTimeValuesEvent(
date_time, definitions.TIME_DESCRIPTION_RECORDED)
event_data = ApacheAccessEventData()
event_data.ip_address = self._GetValueFromStructure(structure, 'ip_address')
event_data.remote_name = self._GetValueFromStructure(
structure, 'remote_name')
event_data.user_name = self._GetValueFromStructure(structure, 'user_name')
event_data.http_request = self._GetValueFromStructure(
structure, 'http_request')
event_data.http_response_code = self._GetValueFromStructure(
structure, 'response_code')
event_data.http_response_bytes = self._GetValueFromStructure(
structure, 'response_bytes')
if key == 'combined_log_format':
event_data.http_request_referer = self._GetValueFromStructure(
structure, 'referer')
event_data.http_request_user_agent = self._GetValueFromStructure(
structure, 'user_agent')
parser_mediator.ProduceEventWithEventData(event, event_data)
# pylint: disable=unused-argument
[docs] def VerifyStructure(self, parser_mediator, line):
"""Verifies that this is an apache access log file.
Args:
parser_mediator (ParserMediator): mediates interactions between parsers
and other components, such as storage and dfvfs.
line (str): line from the text file.
Returns:
bool: True if this is the correct parser, False otherwise.
"""
return max([parser.matches(line) for _, parser in self.LINE_STRUCTURES])
manager.ParsersManager.RegisterParser(ApacheAccessParser)