#!/usr/bin/env python3
# vim: wrapmargin=0 textwidth=0 smarttab expandtab tabstop=2 shiftwidth=2
"""Process Apache access.log lines to find highest rate of /upload/."""

import argparse
from collections import deque
import datetime
import fileinput
import re
from typing import Optional

import dateutil.parser


# Layout of the bracketed timestamp in an access.log line, e.g.
# "10/Oct/2023:13:55:36 -0700".  The UTC offset may carry either sign.
_APACHE_DATETIME_RE = re.compile(
  r'^(?P<d>[0-9]{2})/(?P<mon>[^/]{3})/(?P<YYYY>[0-9]{4})'
  r':(?P<time>[0-9]{2}:[0-9]{2}:[0-9]{2} [+-][0-9]{4})$'
)


def _parse_apache_timestamp(text: str) -> Optional[datetime.datetime]:
  """Return the datetime for an access.log timestamp, or None if unparseable."""
  m = _APACHE_DATETIME_RE.search(text)
  if m is None:
    return None
  # Rearranged to "YYYY-Mon-DD HH:MM:SS +zzzz" before handing to dateutil.
  rearranged = f'{m.group("YYYY")}-{m.group("mon")}-{m.group("d")} {m.group("time")}'
  try:
    return dateutil.parser.parse(rearranged)
  except (ValueError, OverflowError):
    return None


def process_log_file(
  input_file: str = '-',
  request_text: str = '/upload/',
  window_size: int = 1,
) -> None:
  """
  Process the indicated log file to determine peak rate of interesting lines.

  A sliding window of `window_size` seconds is moved over the timestamps of
  the matching lines; the largest number of lines seen inside one window,
  and the timestamp of the line that completed that window, are printed.

  :param input_file: Name of input file, `-` for stdin
  :param request_text: The text that denotes an interesting line.  It is
    inserted into a regular expression unescaped, so regex metacharacters
    in it are interpreted as such.
  :param window_size: Time, in seconds, for the window to assess
  :raises ValueError: if `window_size` is not positive
  """
  if window_size <= 0:
    # A non-positive window would empty the deque below and divide by zero.
    raise ValueError(f'window_size must be a positive number of seconds, got {window_size!r}')

  print(f'With:\n\tinput_file: "{input_file}"\n\trequest_text: "{request_text}"')

  apache_re = re.compile(
    r'^(?P<host>[.:0-9a-fA-F]{3,39}) - - \[(?P<datetime>[^\]]+)\] (?P<logtext>.*' + request_text + '.*)$'
  )
  print(f'Apache RE:\n{apache_re}\n')

  window_time_delta = datetime.timedelta(seconds=window_size)
  window_count = 0
  window_end_longest_count = None
  window_dts = deque()
  line_count = 0
  unparseable_count = 0

  with fileinput.FileInput(files=input_file) as f:
    for line in f:
      matches = apache_re.search(line)
      if not matches:
        continue
      line_count += 1

      this_dt = _parse_apache_timestamp(matches.group('datetime'))
      if this_dt is None:
        # The line is of interest but cannot be placed on the time axis.
        unparseable_count += 1
        continue

      ###############################################################
      # This code assumes that the apache log lines are in increasing
      # time sequence order.
      #
      # That's not necessarily true.  It has been observed that e.g.
      # a long line for 00:24:39 can occur in the middle of lines for
      # 00:24:40.
      #
      # Hopefully this doesn't happen too much.
      ###############################################################
      window_dts.append(this_dt)

      # Drop entries that have fallen out of the window ending at this_dt.
      # The entry just appended is always newer than the cut-off, so the
      # deque never becomes empty here.
      oldest_of_interest = this_dt - window_time_delta
      while window_dts[0] <= oldest_of_interest:
        window_dts.popleft()

      if len(window_dts) > window_count:
        window_count = len(window_dts)
        # The record-setting window ends at the line that completed it.
        window_end_longest_count = this_dt

  print(f'With window size : {window_size:>9}')
  print(f'Total line matching lines: {line_count:>9}')
  if unparseable_count:
    print(f'Lines with unparseable timestamp: {unparseable_count:>9}')
  print(f'Largest window count : {window_count:>9} ({window_count / window_size:>9}/s)')
  if window_end_longest_count is None:
    # Nothing matched, so there is no window to report.
    print('Busiest window ended at: n/a')
  else:
    print(f'Busiest window ended at: {window_end_longest_count.strftime("%d/%b/%Y:%H:%M:%S")}')


def main():
  """Parse the command line and run `process_log_file` with the given options."""
  parser = argparse.ArgumentParser(
    description='Process Apache web server access.log lines, counting the number of a specific request per a unit of time.',
  )
  parser.add_argument(
    '--window-length',
    metavar='<window size in seconds>',
    required=False,
    default=1,
    type=int,
    help='The time period in which the max rate will be.',
  )
  parser.add_argument(
    'input_file',
    metavar='<input file name>',
    help='Name of an Apache access.log file. You may use "-" for standard input.',
  )
  parser.add_argument(
    'request_text',
    metavar='<per-request text selector>',
    help='Text that appears in the log lines of interest. Defaults to "/upload/"',
    nargs='?',
    default='/upload/',
  )
  args = parser.parse_args()
  if args.window_length <= 0:
    parser.error('--window-length must be a positive integer')

  process_log_file(
    input_file=args.input_file,
    request_text=args.request_text,
    window_size=args.window_length,
  )


if __name__ == '__main__':
  main()