一个实时分析日志的python小脚本

做 Web 运维,总要关注相关域名的实时 2xx/s、4xx/s、5xx/s、响应时间、带宽等指标。之前的日志是每五分钟分割一次,简单地用 awk 统计就可以了;现在由于要把日志推送到 ELK,继续按五分钟分割会有问题,就改为一天分割一次。改成一天一分割后,显然再继续用 Shell 就不合适了,于是就用 Python 写了一个。

脚本主要运用了文件的seek和tell函数,原理如下:

1.加入crontab,每5分钟执行一次
2.只分析从上次读取日志文件的结束位置到这次读取文件时的末尾位置之间的日志,并输出结果

可以使用zabbix_sender把结果发送到zabbix server或者直接使用zabbix agent来读取这个文件取数据,配合zabbix出图、做报警,代码如下:

#!/usr/bin/env python
#coding: utf-8

from __future__ import division
import os

# Access log that is analysed incrementally.
LOG_FILE = '/data0/logs/nginx/xxxx-access_log'
# Stores the start/end byte offsets covered by the most recent run.
POSITION_FILE = '/tmp/position.log'
# Output file that receives the computed metrics.
STATUS_FILE = '/tmp/http_status'
# Interval between cron runs, in seconds; totals are divided by it to get
# per-second rates.
CRON_TIME = 300

def _write_position(start_position, end_position):
    """Persist the analysed byte range to POSITION_FILE as two "key: value" lines."""
    with open(POSITION_FILE, 'w') as fh:
        fh.write('start_position: %s\n' % start_position)
        fh.write('end_position: %s\n' % end_position)


def get_position():
    """Return the byte range of LOG_FILE that has not been analysed yet.

    The range covered by the previous run is kept in POSITION_FILE as two
    lines ("start_position: N" / "end_position: N").  Each call advances that
    window: the new start is the previous end and the new end is the current
    size of LOG_FILE.  The new window is written back before returning.

    Returns:
        A two-item list ``[start_position, end_position]`` of ints.

    Raises:
        SystemExit: with status 1 when there is nothing to analyse:
            * first run (POSITION_FILE missing) -- the current size is only
              recorded as a baseline;
            * POSITION_FILE is malformed -- it is deleted so that the next
              run starts over;
            * LOG_FILE has not grown since the previous run.
    """
    # First run: no saved offsets yet, so just record where the log ends now.
    if not os.path.exists(POSITION_FILE):
        _write_position(0, os.path.getsize(LOG_FILE))
        raise SystemExit(1)

    with open(POSITION_FILE) as fh:
        saved = fh.readlines()

    # Anything other than two "name: <int>" lines means the file is corrupt:
    # drop it and let the next run re-create the baseline.
    try:
        if len(saved) != 2:
            raise ValueError('expected 2 lines, got %d' % len(saved))
        _, last_end_position = [int(item.split(':')[1]) for item in saved]
    except (IndexError, ValueError):
        os.remove(POSITION_FILE)
        raise SystemExit(1)

    start_position = last_end_position
    end_position = os.path.getsize(LOG_FILE)

    # Offsets are compared as integers.  Comparing their string forms is
    # lexicographic ('99772400' > '100227572' is True) and misdetects
    # rotation as soon as the number of digits changes.
    if start_position > end_position:
        # The log was rotated: the saved offset lies beyond the new file.
        start_position = 0
    elif start_position == end_position:
        # The log has not grown since the previous run.
        raise SystemExit(1)

    _write_position(start_position, end_position)
    return [start_position, end_position]

def write_status(content):
    """Overwrite STATUS_FILE with ``content``.

    Args:
        content: The full text to store; any previous content is discarded.
    """
    # The context manager closes the handle even if the write fails.
    with open(STATUS_FILE, 'w') as fh:
        fh.write(content)

def handle_log(start_position,end_position):
    """Summarise the part of LOG_FILE between two byte offsets into STATUS_FILE.

    Every line that starts before ``end_position`` is read, split on single
    spaces and counted.  The totals are divided by CRON_TIME to obtain
    per-second rates (this relies on true division, i.e. the file's
    ``from __future__ import division`` under Python 2) and the result is
    handed to ``write_status``.

    Fields are taken by position: index 3 is the request time (an optional
    trailing "s" is stripped), index 10 the HTTP status and index 11 the
    number of bytes sent.  NOTE(review): these positions are inherited from
    the original script -- confirm them against the log format in use.

    Args:
        start_position: Byte offset at which reading starts.
        end_position: Byte offset at or after which no new line is started.
    """
    request_time_idx, status_idx, bytes_sent_idx = 3, 10, 11
    tracked = ('403', '404', '500', '502', '503', '504')
    counts = dict.fromkeys(tracked, 0)
    status_2xx = status_all = bandwidth = 0
    rt = 0.0

    # Binary mode keeps file positions in bytes, the same unit as the
    # offsets derived from os.path.getsize().
    with open(LOG_FILE, 'rb') as log:
        log.seek(start_position, 0)
        position = start_position
        while position < end_position:
            raw = log.readline()
            if not raw:
                # Hit EOF before end_position: the file shrank (e.g. it was
                # rotated) after the offsets were computed.
                break
            position += len(raw)

            fields = raw.decode('utf-8', 'replace').split(' ')
            if len(fields) <= bytes_sent_idx:
                # Blank, truncated or otherwise malformed line: skip it.
                continue

            status = fields[status_idx]
            status_all += 1
            try:
                rt += float(fields[request_time_idx].strip('s'))
                bandwidth += int(fields[bytes_sent_idx])
            except ValueError:
                # Non-numeric placeholder: the request still counts, it just
                # does not contribute to the time/bandwidth totals.
                pass

            if status in ('200', '206'):
                status_2xx += 1
            elif status in counts:
                counts[status] += 1

    # An empty window (or one with only malformed lines) must not divide by 0.
    average_rt = rt / status_all if status_all else 0.0

    report = [('status_2xx', status_2xx / CRON_TIME)]
    report.extend(('status_%s' % code, counts[code] / CRON_TIME) for code in tracked)
    report.append(('status_all', status_all / CRON_TIME))
    report.append(('rt', average_rt))
    report.append(('bandwidth', bandwidth / CRON_TIME))

    write_status(''.join('%s: %s\n' % item for item in report))

if __name__ == '__main__':
    # Work out the unread byte window, then summarise it into the status file.
    handle_log(*get_position())

看下分析的结果:

cat /tmp/http_status
status_2xx: 17.3333333333
status_403: 0.0
status_404: 1.0
status_500: 0.0
status_502: 0.0
status_503: 0.0
status_504: 0.0
status_all: 20.0
rt: 0.0782833333333
bandwidth: 204032.0

2017.2.22 更正,start_position、end_position 使用字符串比较会有问题,如下:

In [5]: '99772400' > '100227572'
Out[5]: True

In [6]: int('99772400') > int('100227572')
Out[6]: False

因此,更正为:

# Log rotation makes start_position > end_position: restart from offset 0.
#print start_position,end_position
if int(start_position) > int(end_position):
    start_position = 0
# The log has stopped growing since the last run: nothing to analyse.
elif int(start_position) == int(end_position):
    os._exit(1)