1+ import re
2+ import os
3+ import datetime
4+ from baidu_api import ip2province
5+ import pandas as pd
6+ import openpyxl
7+ from openpyxl import load_workbook
8+
# Named-group pattern for one nginx access-log line: client ip, timestamp,
# request line, status code, response size, referer and user agent.
# The pieces below are adjacent raw strings that join into a single pattern.
_ACCESS_LOG_PATTERN = (
    r'(?P<ip>.*?)- - \[(?P<time>.*?)\] '
    r'"(?P<request>.*?)" (?P<status>.*?) (?P<bytes>.*?) '
    r'"(?P<referer>.*?)" "(?P<ua>.*?)"'
)
obj = re.compile(_ACCESS_LOG_PATTERN)
def load_log(path):
    """Read the log file at ``path`` and split its lines into parsed and rejected.

    Every line is stripped and handed to ``parse``. Truthy results are
    collected as records; for any other result the stripped line text is kept
    in a separate list. A running line count is printed every 1000 lines.

    Args:
        path: Path of the log file, opened as UTF-8 text.

    Returns:
        A ``(records, rejected_lines)`` tuple of two lists.
    """
    parsed, rejected = [], []
    with open(path, mode="r", encoding="utf-8") as handle:
        for count, raw in enumerate(handle, start=1):
            text = raw.strip()
            record = parse(text)
            if not record:
                # Dirty data: keep the raw text so it can be inspected later.
                rejected.append(text)
            else:
                parsed.append(record)
            # Progress indicator for large files.
            if count % 1000 == 0:
                print(count, "行")
    return parsed, rejected
27+
def _device_from_ua(ua):
    """Map a User-Agent string to a coarse device label by substring match."""
    # Order matters: the first marker found in the string wins.
    markers = (
        ("Windows NT", "windows"),
        ("iPad", "ipad"),
        ("Android", "android"),
        ("Macintosh", "mac"),
        ("iPhone", "iphone"),
    )
    for marker, label in markers:
        if marker in ua:
            return label
    return "其他设备"


def parse(line):
    """Parse a single nginx access-log line into a dict.

    Args:
        line: One log line, matched against the module-level ``obj`` pattern.

    Returns:
        dict: On success, with keys ``ip``, ``province``, ``status``,
            ``time`` (naive ``datetime``), ``hour``, ``request`` (path without
            the query string), ``ua`` (device label) and ``referer``.
        False: When the ip field is empty or ``-``; the line is discarded.
        None: When the line does not match the pattern or any processing
            step raises; the problem is printed with a ``[parse]`` prefix.
    """
    try:
        result = obj.match(line)
        if result is None:
            # Report a non-matching line explicitly instead of relying on an
            # AttributeError from calling .group() on None.
            print("[parse]", line, "-->", "line does not match the log pattern")
            return None

        dic = {}

        # ip handling: discard the record when no ip was captured.
        ip = result.group("ip").strip()
        if ip in ("-", ""):
            return False
        # When several comma-separated ips are present, keep the first one.
        dic['ip'] = ip.split(",")[0].strip()
        # Province lookup is delegated to ip2province (imported from baidu_api).
        dic['province'] = ip2province(dic['ip'])

        # Status code is stored as the captured string.
        dic['status'] = result.group("status")

        # Time handling, e.g. "21/Dec/2019:21:45:31 +0800".
        # Drop whatever UTC offset follows the timestamp instead of only the
        # hard-coded " +0800", so logs from other time zones parse as well.
        # The result stays a naive datetime in the log's local time.
        time_text = result.group("time").split()[0]
        t = datetime.datetime.strptime(time_text, "%d/%b/%Y:%H:%M:%S")
        dic['time'] = t
        dic['hour'] = t.hour

        # Request handling: take the second token of the request line (the
        # URL) and cut everything from "?" on to remove query parameters.
        request = result.group("request")
        dic['request'] = request.split()[1].split("?")[0]

        # User-agent handling: reduce to a coarse device label.
        dic['ua'] = _device_from_ua(result.group("ua"))

        # Referer is stored unchanged.
        dic['referer'] = result.group("referer")

        return dic
    except Exception as e:
        # Deliberately broad: a malformed field or a failing ip2province call
        # must only reject this one line, not abort the whole load.
        print("[parse]", line, "-->", e)
        return None
81+
def _count_column(df, column):
    """Return a frame with columns ``[column, 'count']`` of value frequencies."""
    # rename_axis + reset_index(name=...) fixes both column names explicitly,
    # so the result does not depend on how the installed pandas version names
    # the value_counts() index and values.
    return df[column].value_counts().rename_axis(column).reset_index(name='count')


def analyse(lst, datafile):
    """Count records by province, hour and client, and store the results.

    Args:
        lst: List of record dicts; the ``province``, ``hour`` and ``ua`` keys
            are read.
        datafile: Path of the Excel file; each statistic is written to its
            own sheet through ``to_excel``.
    """
    df = pd.DataFrame(lst)

    # Counts per province, most frequent first.
    province_count_df = _count_column(df, 'province')

    # Counts per hour, ordered by hour rather than by frequency.
    hour_count_df = _count_column(df, 'hour').sort_values(by='hour')

    # Counts per client device, most frequent first.
    ua_count_df = _count_column(df, 'ua')

    # Persist each statistic in its own sheet.
    to_excel(province_count_df, datafile, sheet_name='省份')
    to_excel(hour_count_df, datafile, sheet_name='按时')
    to_excel(ua_count_df, datafile, sheet_name='客户端')
98+
def to_excel(dataframe, filepath, sheet_name):
    """Write ``dataframe`` to sheet ``sheet_name`` of the workbook ``filepath``.

    The data is written without index and without header row. When the file
    does not exist yet it is created with this single sheet. When it exists,
    the sheet is added to the existing workbook and the other sheets are
    kept; a sheet that already has this name is replaced.

    Args:
        dataframe: The ``pandas.DataFrame`` to store.
        filepath: Path of the ``.xlsx`` file.
        sheet_name: Name of the target sheet.
    """
    if os.path.exists(filepath):
        # Append mode lets pandas load the existing workbook itself; there is
        # no need to assign ExcelWriter.book by hand. The context manager
        # saves and closes the file even when writing raises.
        with pd.ExcelWriter(
            filepath,
            engine='openpyxl',
            mode='a',
            if_sheet_exists='replace',
        ) as writer:
            dataframe.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
    else:
        dataframe.to_excel(filepath, sheet_name=sheet_name, index=False, header=False)
108+
if __name__ == '__main__':
    # Parse the access log, then write the statistics workbook. The rejected
    # lines returned by load_log are not used here.
    records, _rejected = load_log("nginx_access.log")
    analyse(records, "data.xlsx")
0 commit comments