# -*- coding: utf-8 -*-
"""Apache access log (access.log) parser.
Parser for the two default Apache log formats, the Common Log Format and the
Combined Log Format, as defined in https://httpd.apache.org/docs/2.4/logs.html
"""
from __future__ import unicode_literals
import pyparsing
from dfdatetime import time_elements as dfdatetime_time_elements
from plaso.containers import events
from plaso.containers import time_events
from plaso.lib import errors
from plaso.lib import definitions
from plaso.lib import timelib
from plaso.parsers import manager
from plaso.parsers import text_parser
class ApacheAccessEventData(events.EventData):
  """Event data of a single Apache access log entry.

  Every attribute starts out as None and is expected to be filled in by the
  parser.

  Attributes:
    http_request (str): first line of http request.
    http_request_referer (str): http request referer header information.
    http_request_user_agent (str): http request user agent header information.
    http_response_bytes (int): http response bytes size without headers.
    http_response_code (int): http response code from server.
    ip_address (str): IPv4 or IPv6 addresses.
    remote_name (str): remote logname (from identd, if supplied).
    user_name (str): logged user name.
  """

  DATA_TYPE = 'apache:access'

  def __init__(self):
    """Initializes event data with all values unset."""
    super(ApacheAccessEventData, self).__init__(data_type=self.DATA_TYPE)
    # Values identifying who made the request.
    self.ip_address = None
    self.remote_name = None
    self.user_name = None
    # Values describing the request and the response of the server.
    self.http_request = None
    self.http_response_code = None
    self.http_response_bytes = None
    # Header values, only set for the combined log format.
    self.http_request_referer = None
    self.http_request_user_agent = None
class ApacheAccessParser(text_parser.PyparsingSingleLineTextParser):
  """Apache access log file parser.

  Supports the two default Apache log formats: the Common Log Format and the
  Combined Log Format, see https://httpd.apache.org/docs/2.4/logs.html
  """

  NAME = 'apache_access'
  DESCRIPTION = 'Apache access Parser'

  _PYPARSING_COMPONENTS = {
      'ip': text_parser.PyparsingConstants.IP_ADDRESS.setResultsName(
          'ip_address'),
      'remote_name': (pyparsing.Word(pyparsing.alphanums) |
                      pyparsing.Literal('-')).setResultsName('remote_name'),
      'user_name': (pyparsing.Word(pyparsing.alphanums) |
                    pyparsing.Literal('-')).setResultsName('user_name'),
      'http_request': pyparsing.SkipTo('"').setResultsName('http_request'),
      'response_code': text_parser.PyparsingConstants.INTEGER.setResultsName(
          'response_code'),
      # The "%b" directive logs "-" instead of 0 when no bytes were sent,
      # for example for a "304 Not Modified" response.
      'response_bytes': (
          pyparsing.Literal('-') |
          text_parser.PyparsingConstants.INTEGER).setResultsName(
              'response_bytes'),
      'referer': pyparsing.SkipTo('"').setResultsName('referer'),
      'user_agent': pyparsing.SkipTo('"').setResultsName('user_agent')
  }

  # date format [18/Sep/2011:19:18:28 -0400]
  _PYPARSING_COMPONENTS['date'] = (
      (pyparsing.Suppress('[') +
       text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('day') +
       pyparsing.Suppress('/') +
       text_parser.PyparsingConstants.THREE_LETTERS.setResultsName('month') +
       pyparsing.Suppress('/') +
       text_parser.PyparsingConstants.FOUR_DIGITS.setResultsName('year') +
       pyparsing.Suppress(':') +
       text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('hours') +
       pyparsing.Suppress(':') +
       text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('minutes') +
       pyparsing.Suppress(':') +
       text_parser.PyparsingConstants.TWO_DIGITS.setResultsName('seconds') +
       pyparsing.Combine(
           pyparsing.oneOf(['-', '+']) +
           pyparsing.Word(
               pyparsing.nums, exact=4)).setResultsName('time_offset') +
       pyparsing.Suppress(']')).setResultsName('date_time')
  )

  # Quote character that delimits the request, referer and user agent values.
  _SKIP_SEP = pyparsing.Suppress('"')

  # Defined in https://httpd.apache.org/docs/2.4/logs.html
  # format: "%h %l %u %t \"%r\" %>s %b"
  _COMMON_LOG_FORMAT = (
      _PYPARSING_COMPONENTS['ip'] +
      _PYPARSING_COMPONENTS['remote_name'] +
      _PYPARSING_COMPONENTS['user_name'] +
      _PYPARSING_COMPONENTS['date'] +
      _SKIP_SEP + _PYPARSING_COMPONENTS['http_request'] + _SKIP_SEP +
      _PYPARSING_COMPONENTS['response_code'] +
      _PYPARSING_COMPONENTS['response_bytes']
  )

  # Defined in https://httpd.apache.org/docs/2.4/logs.html
  # format: "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\""
  _COMBINED_LOG_FORMAT = (
      _COMMON_LOG_FORMAT +
      _SKIP_SEP + _PYPARSING_COMPONENTS['referer'] + _SKIP_SEP +
      _SKIP_SEP + _PYPARSING_COMPONENTS['user_agent'] + _SKIP_SEP
  )

  _COMMON_LOG_FORMAT_LINE = _COMMON_LOG_FORMAT + pyparsing.lineEnd
  _COMBINED_LOG_FORMAT_LINE = _COMBINED_LOG_FORMAT + pyparsing.lineEnd

  # The combined format is listed first since it extends the common format.
  LINE_STRUCTURES = [
      ('combined_log_format', _COMBINED_LOG_FORMAT_LINE),
      ('common_log_format', _COMMON_LOG_FORMAT_LINE)]

  _SUPPORTED_KEYS = frozenset([key for key, _ in LINE_STRUCTURES])

  # TODO: migrate function after dfdatetime issue #47 is fixed.
  def _GetISO8601String(self, structure):
    """Normalize date time parsed format to an ISO 8601 date time string.

    The date and time values in Apache access log files are formatted as:
    "[18/Sep/2011:19:18:28 -0400]".

    Args:
      structure (pyparsing.ParseResults): structure of tokens derived from a
          line of a text file.

    Returns:
      str: ISO 8601 date time string.

    Raises:
      ValueError: if the structure cannot be converted into a date time string.
    """
    time_offset = structure.time_offset

    month = timelib.MONTH_DICT.get(structure.month.lower(), 0)
    if not month:
      # Fail early instead of formatting month "00" into the string.
      raise ValueError('unsupported month: {0!s}.'.format(structure.month))

    try:
      # The time offset is formatted as a sign followed by "HHMM".
      time_offset_hours = int(time_offset[1:3], 10)
      time_offset_minutes = int(time_offset[3:5], 10)
    except (IndexError, TypeError, ValueError) as exception:
      raise ValueError(
          'unable to parse time zone offset with error: {0!s}.'.format(
              exception))

    try:
      date_time_string = (
          '{0:04d}-{1:02d}-{2:02d}T{3:02d}:{4:02d}:{5:02d}.000000'
          '{6:s}{7:02d}:{8:02d}').format(
              structure.year, month, structure.day, structure.hours,
              structure.minutes, structure.seconds, time_offset[0],
              time_offset_hours, time_offset_minutes)
    except (TypeError, ValueError) as exception:
      # A non-integer value raises ValueError, a None value raises TypeError.
      raise ValueError(
          'unable to format date time string with error: {0!s}.'.format(
              exception))

    return date_time_string

  def ParseRecord(self, parser_mediator, key, structure):
    """Parses a matching entry.

    Produces one event per log line. When the date and time of the line cannot
    be converted, an extraction error is produced instead and no event is
    generated.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      key (str): name of the parsed structure.
      structure (pyparsing.ParseResults): elements parsed from the file.

    Raises:
      ParseError: when the structure type is unknown.
    """
    if key not in self._SUPPORTED_KEYS:
      raise errors.ParseError(
          'Unable to parse record, unknown structure: {0:s}'.format(key))

    date_time = dfdatetime_time_elements.TimeElements()

    try:
      iso_date_time = self._GetISO8601String(structure.date_time)
      date_time.CopyFromStringISO8601(iso_date_time)
    except ValueError:
      parser_mediator.ProduceExtractionError(
          'invalid date time value: {0!s}'.format(structure.date_time))
      return

    event = time_events.DateTimeValuesEvent(
        date_time, definitions.TIME_DESCRIPTION_RECORDED)

    # "-" is the placeholder Apache writes when no bytes were sent.
    response_bytes = structure.response_bytes
    if response_bytes == '-':
      response_bytes = 0

    event_data = ApacheAccessEventData()
    event_data.ip_address = structure.ip_address
    event_data.remote_name = structure.remote_name
    event_data.user_name = structure.user_name
    event_data.http_request = structure.http_request
    event_data.http_response_code = structure.response_code
    event_data.http_response_bytes = response_bytes

    # Only the combined log format carries the referer and user agent.
    if key == 'combined_log_format':
      event_data.http_request_referer = structure.referer
      event_data.http_request_user_agent = structure.user_agent

    parser_mediator.ProduceEventWithEventData(event, event_data)

  # pylint: disable=unused-argument
  def VerifyStructure(self, parser_mediator, line):
    """Verifies that this is an apache access log file.

    Args:
      parser_mediator (ParserMediator): mediates interactions between parsers
          and other components, such as storage and dfvfs.
      line (str): line from the text file.

    Returns:
      bool: True if this is the correct parser, False otherwise.
    """
    # The line is accepted as soon as one of the line structures matches.
    return any(parser.matches(line) for _, parser in self.LINE_STRUCTURES)
# Register the parser with the parsers manager when this module is imported.
manager.ParsersManager.RegisterParser(ApacheAccessParser)