JustDoPython
diff --git a/‎taiyangxue/showdata/analyse/baidu_api.py‎
Lines changed: 45 additions & 0 deletions b/‎taiyangxue/showdata/analyse/baidu_api.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎taiyangxue/showdata/analyse/main.py‎
Lines changed: 111 additions & 0 deletions b/‎taiyangxue/showdata/analyse/main.py‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎taiyangxue/showdata/ip_cache.txt‎
Lines changed: 186 additions & 0 deletions b/‎taiyangxue/showdata/ip_cache.txt‎
Lines changed: 186 additions & 0 deletions
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# @Time : 2020/8/27 14:06
+# @Author : way
+# @Site : 
+# @Describe: 通过 ip 获取所在省份
+
+import sys
+import json
+import requests
+import os
+
+ak = "<换成你的ak>"  # Baidu AK; apply for your own at http://lbsyun.baidu.com/index.php?title=webapi/ip-api
+
+
+def _load_ip_cache(path):
+    """Read a tab-separated ``ip<TAB>province`` cache file into a dict.
+
+    Args:
+        path: Location of the cache file; it does not have to exist.
+
+    Returns:
+        A dict mapping each IP string to its province string. The dict is
+        empty when the file is missing. Lines that do not consist of exactly
+        two tab-separated fields (blank lines, truncated writes) are skipped
+        instead of aborting the load.
+    """
+    cache = {}
+    if not os.path.exists(path):
+        return cache
+    # Opened with the platform default encoding, the same way ip2province
+    # appends to this file.
+    with open(path, "r") as f:
+        for line in f:
+            fields = line.strip().split("\t")
+            if len(fields) != 2:
+                continue
+            ip, province = fields
+            cache[ip] = province
+    return cache
+
+
+# In-memory IP -> province map, seeded from the cache file at import time.
+ipCache = _load_ip_cache("ip_cache.txt")
+
+def ip2province(ip):
+    """Return the province for ``ip``, consulting the local cache first.
+
+    A cached answer is taken from ``ipCache``. On a miss the Baidu location
+    API is queried; a successful answer is stored in ``ipCache`` and appended
+    to ``ip_cache.txt``. Failed lookups are not cached.
+
+    Args:
+        ip: IP address string to look up.
+
+    Returns:
+        The province string, or "未知" when the lookup fails for any reason.
+    """
+    province = ipCache.get(ip)
+    if province is not None:
+        return province
+
+    try:
+        # Let requests build and escape the query string, and bound the wait
+        # so one stalled connection cannot hang the caller.
+        response = requests.get(
+            "https://api.map.baidu.com/location/ip",
+            params={"ak": ak, "ip": ip, "coor": "bd09ll"},
+            timeout=10,
+        )
+        # "address" is pipe-separated; the second field is used as the province.
+        province = json.loads(response.text)['address'].split('|')[1]
+    except Exception:
+        # Best effort: network errors and unexpected payloads all map to "未知".
+        return "未知"
+
+    ipCache[ip] = province
+    try:
+        with open("ip_cache.txt", "a") as f:
+            f.write(ip + "\t" + province + "\n")
+    except (OSError, UnicodeError):
+        # The lookup itself succeeded; failing to persist it only costs a
+        # repeated query in a later run.
+        pass
+    return province
+
+if __name__ == '__main__':
+    # Filter mode: read tab-separated rows from stdin whose first column is an
+    # IP and write "province<TAB>ip" rows to stdout.
+    for line in sys.stdin:
+        cols = line.replace('\n', '').split('\t')
+        # Debug trace goes to stderr so stdout carries only the result rows.
+        print(cols, file=sys.stderr)
+        cols = [ip2province(cols[0]), cols[0]]
+        sys.stdout.write('\t'.join(cols) + '\n')
@@ -0,0 +1,111 @@
+import re
+import os
+import datetime
+from baidu_api import ip2province
+import pandas as pd
+import openpyxl
+from openpyxl import load_workbook
+
+# Named groups captured from one access-log line:
+# ip, time, request, status, bytes, referer, ua.
+obj = re.compile(r'(?P<ip>.*?)- - \[(?P<time>.*?)\] "(?P<request>.*?)" (?P<status>.*?) (?P<bytes>.*?) "(?P<referer>.*?)" "(?P<ua>.*?)"')
+def load_log(path):
+    """Parse every line of the UTF-8 log file at ``path``.
+
+    Args:
+        path: Path of the access log to read.
+
+    Returns:
+        A ``(records, rejected)`` tuple: the dicts returned by ``parse`` for
+        the lines it accepted, and the stripped text of the lines for which
+        ``parse`` returned a falsy value.
+    """
+    parsed_rows = []
+    rejected_lines = []
+    with open(path, mode="r", encoding="utf-8") as log_file:
+        for line_no, raw in enumerate(log_file, start=1):
+            text = raw.strip()
+            record = parse(text)
+            if not record:
+                # Dirty data: keep the raw text for later inspection.
+                rejected_lines.append(text)
+            else:
+                parsed_rows.append(record)
+            # Progress report every 1000 lines.
+            if line_no % 1000 == 0:
+                print(line_no, "行")
+    return parsed_rows, rejected_lines
+
+def _device_from_ua(ua):
+    """Map a User-Agent string to a coarse device label."""
+    # Checked in order; the first marker found in the string wins.
+    markers = (
+        ("Windows NT", "windows"),
+        ("iPad", "ipad"),
+        ("Android", "android"),
+        ("Macintosh", "mac"),
+        ("iPhone", "iphone"),
+    )
+    for marker, label in markers:
+        if marker in ua:
+            return label
+    return "其他设备"
+
+
+def parse(line):
+    """Parse a single nginx access-log line into a dict.
+
+    Args:
+        line: One log line, already stripped of surrounding whitespace.
+
+    Returns:
+        A dict with the keys ``ip``, ``province``, ``status``, ``time``,
+        ``hour``, ``request``, ``ua`` and ``referer``; ``False`` when the line
+        carries no client IP; ``None`` when the line does not match the
+        pattern or a field cannot be parsed (the reason is printed).
+    """
+    try:
+        result = obj.match(line)
+        if result is None:
+            raise ValueError("line does not match the log pattern")
+
+        # Discard lines without a client IP.
+        raw_ip = result.group("ip").strip()
+        if raw_ip in ("-", ""):
+            return False
+        # When several comma-separated IPs are present, keep the first one.
+        ip = raw_ip.split(",")[0].strip()
+
+        # The time looks like "21/Dec/2019:21:45:31 +0800". Drop the UTC
+        # offset, whatever its value, and keep the wall-clock time as logged.
+        stamp = result.group("time").split(" ")[0]
+        t = datetime.datetime.strptime(stamp, "%d/%b/%Y:%H:%M:%S")
+
+        # The URL is the second token of the request; drop any query string
+        # after "?".
+        url = result.group("request").split()[1].split("?")[0]
+
+        return {
+            'ip': ip,
+            # Evaluated only after the fields above parsed cleanly, so
+            # malformed lines never trigger a province lookup.
+            'province': ip2province(ip),
+            'status': result.group("status"),
+            'time': t,
+            'hour': t.hour,
+            'request': url,
+            'ua': _device_from_ua(result.group("ua")),
+            'referer': result.group("referer"),
+        }
+    except Exception as e:
+        # Boundary for dirty data: report the line and let the caller file it
+        # as rejected.
+        print("[parse]",line, "-->", e)
+        return None
+
+def _count_by(df, column):
+    """Return a frame with each distinct value of ``column`` and its count."""
+    # rename_axis + reset_index(name=...) yields the columns [column, "count"]
+    # regardless of how the installed pandas names value_counts() output.
+    return df[column].value_counts().rename_axis(column).reset_index(name="count")
+
+
+def analyse(lst, datafile):
+    """Aggregate parsed log records and store the counts in an Excel workbook.
+
+    Three sheets are written through ``to_excel``: '省份' (count per
+    province), '按时' (count per hour, ordered by hour) and '客户端' (count
+    per client type).
+
+    Args:
+        lst: List of dicts carrying at least the keys ``province``, ``hour``
+            and ``ua``.
+        datafile: Path of the workbook to write.
+    """
+    df = pd.DataFrame(lst)
+    if df.empty:
+        # No parsed records: there is nothing to count or to write.
+        print("[analyse] no records to analyse")
+        return
+
+    # Count by province.
+    province_count_df = _count_by(df, 'province')
+
+    # Count by hour of day, in chronological order.
+    hour_count_df = _count_by(df, 'hour').sort_values(by='hour')
+
+    # Count by client type.
+    ua_count_df = _count_by(df, 'ua')
+
+    # Persist the results.
+    to_excel(province_count_df, datafile, sheet_name='省份')
+    to_excel(hour_count_df, datafile, sheet_name='按时')
+    to_excel(ua_count_df, datafile, sheet_name='客户端')
+    
+def to_excel(dataframe, filepath, sheet_name):
+    """Write ``dataframe`` to one sheet of a workbook, without index or header.
+
+    When ``filepath`` already exists the workbook is opened in append mode so
+    its other sheets are kept; a sheet already named ``sheet_name`` is
+    replaced. Otherwise a new workbook is created.
+
+    Args:
+        dataframe: The pandas DataFrame to store.
+        filepath: Path of the .xlsx workbook.
+        sheet_name: Name of the target sheet.
+    """
+    if os.path.exists(filepath):
+        # Append mode loads the existing workbook itself; the context manager
+        # saves and closes it even if writing fails.
+        with pd.ExcelWriter(filepath, engine='openpyxl', mode='a',
+                            if_sheet_exists='replace') as writer:
+            dataframe.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
+    else:
+        dataframe.to_excel(filepath, sheet_name=sheet_name, index=False, header=False)
+
+if __name__ == '__main__':
+    # Parse the access log, then write the aggregated counts to the workbook.
+    # The rejected lines are returned as well but not used here.
+    records, _rejected = load_log("nginx_access.log")
+    analyse(records, "data.xlsx")
@@ -0,0 +1,186 @@
+124.64.19.27	北京
+124.64.18.118	北京
+114.246.34.133	北京
+123.119.247.136	北京
+221.219.132.144	北京
+116.2.39.176	辽宁
+36.104.125.224	吉林
+124.64.16.10	北京
+124.64.19.17	北京
+124.64.19.198	北京
+61.181.218.54	天津
+124.64.17.231	北京
+218.69.54.122	天津
+124.64.17.228	北京
+221.192.179.8	河北
+211.94.246.253	天津
+211.94.239.98	天津
+114.242.249.80	北京
+114.242.250.129	北京
+101.91.60.81	江苏
+221.192.179.30	河北
+43.249.136.26	天津
+124.64.16.188	北京
+114.242.248.31	北京
+124.64.18.198	北京
+218.68.91.101	辽宁
+61.148.245.90	北京
+117.10.206.88	天津
+203.208.60.64	福建
+203.208.60.3	福建
+203.208.60.60	福建
+124.64.19.57	北京
+123.151.77.91	河北
+124.64.18.144	北京
+123.151.76.158	天津
+61.148.243.176	北京
+124.64.17.67	北京
+223.104.3.204	北京
+124.64.16.93	北京
+211.94.208.121	天津
+211.94.246.15	天津
+124.64.16.138	北京
+61.148.243.38	北京
+120.244.52.54	北京
+124.64.16.136	北京
+124.64.18.217	北京
+120.245.4.41	北京
+211.94.195.158	天津
+124.64.16.135	北京
+123.151.76.248	天津
+61.148.244.12	北京
+124.64.16.253	北京
+218.68.91.112	辽宁
+124.64.18.76	北京
+211.94.238.171	天津
+125.39.46.56	辽宁
+111.196.106.59	北京
+114.241.45.159	北京
+211.94.225.8	天津
+123.150.174.182	天津
+61.181.236.214	天津
+124.64.16.98	北京
+61.148.243.25	北京
+124.64.17.42	北京
+123.151.77.81	河北
+211.94.230.118	天津
+103.3.96.2	天津
+114.242.250.155	北京
+114.240.67.63	北京
+124.64.16.221	北京
+211.94.252.135	天津
+124.64.16.62	北京
+218.69.61.133	天津
+124.64.19.236	北京
+114.242.248.188	北京
+117.10.207.38	天津
+124.64.16.19	北京
+211.94.251.187	天津
+139.214.251.167	吉林
+123.151.77.70	河北
+111.30.142.186	河北
+111.30.142.227	河北
+223.104.236.216	辽宁
+111.30.142.78	河北
+223.104.3.186	北京
+180.97.118.219	江苏
+124.64.18.192	北京
+122.115.226.173	北京
+220.194.106.92	天津
+220.194.106.94	北京
+202.99.89.162	天津
+124.64.19.220	北京
+124.64.18.4	北京
+61.148.245.99	北京
+223.104.3.198	北京
+211.94.239.184	天津
+125.39.132.94	北京
+211.94.225.128	天津
+61.151.207.158	上海
+117.136.38.145	北京
+223.104.227.204	天津
+103.3.96.18	天津
+113.96.232.118	重庆
+223.104.3.11	北京
+211.94.239.180	天津
+139.214.246.116	吉林
+211.94.237.208	天津
+101.89.239.230	上海
+202.99.112.190	天津
+124.64.17.172	北京
+124.64.19.5	北京
+61.148.243.117	北京
+221.192.179.120	河北
+122.97.175.103	江苏
+202.99.113.50	天津
+218.67.234.74	天津
+58.218.133.250	江苏
+122.97.175.148	江苏
+112.224.67.28	山东
+61.151.207.252	江苏
+61.181.218.93	天津
+117.10.206.177	天津
+223.104.236.240	辽宁
+123.151.77.123	河北
+220.194.107.221	北京
+110.251.15.159	河北
+175.24.45.114	上海
+124.64.19.86	北京
+220.194.107.222	北京
+221.192.178.44	河北
+114.242.248.155	北京
+223.104.175.237	辽宁
+223.104.175.86	辽宁
+211.94.240.149	天津
+223.104.176.106	辽宁
+221.192.179.167	河北
+124.64.17.217	北京
+222.186.136.164	江苏
+124.64.18.245	北京
+211.94.208.8	天津
+223.104.176.23	辽宁
+122.97.175.145	江苏
+211.94.253.31	天津
+36.104.39.237	吉林
+221.192.179.96	河北
+218.69.52.34	天津
+211.94.254.22	天津
+223.104.103.17	河北
+61.148.243.204	北京
+124.64.17.54	北京
+139.214.251.83	吉林
+139.214.244.217	吉林
+124.64.19.162	北京
+117.136.54.52	天津
+220.181.108.101	广东
+220.181.108.171	广东
+111.206.221.22	北京
+111.206.221.45	北京
+111.206.221.11	北京
+111.206.198.26	北京
+111.206.221.43	北京
+111.206.198.101	北京
+111.206.221.108	北京
+117.10.206.50	天津
+103.3.96.166	天津
+61.181.219.241	天津
+124.64.19.43	北京
+103.3.97.8	天津
+61.148.243.124	北京
+124.64.19.186	北京
+221.192.179.34	河北
+36.104.122.38	吉林
+221.192.180.153	河北
+211.94.245.65	天津
+221.192.178.240	河北
+220.181.108.177	广东
+220.181.108.161	广东
+111.206.198.50	北京
+111.206.198.119	北京
+111.206.198.70	北京
+111.206.198.6	北京
+111.206.221.102	北京
+111.206.221.44	北京
+111.206.198.41	北京
+36.98.226.192	河北
+124.64.19.93	北京