-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
64 lines (54 loc) · 2.23 KB
/
analysis.py
File metadata and controls
64 lines (54 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
This script generates basic stats and a chart from all_responses.json.
The all_responses.json file should be generated by running the beam pipeline
in stack-overflow.py.
Usage: python3 analysis.py
"""
import re
import pandas as pd
import matplotlib.pyplot as plt
# Configure how pandas renders DataFrames when they are printed: no cap on the
# number of columns shown, and no wrapping of wide frames across several
# blocks of lines. NOTE(review): nothing in this script prints a DataFrame, so
# these settings presumably only matter for ad-hoc debugging - confirm.
pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', False)
def classify_error(row):
    """Map one failed-request record to a short error-category label.

    Args:
        row: Mapping-like record (a dict or a pandas row) that provides the
            keys "err_type" and "err_details", plus "status" when
            "err_type" is "HTTPError".

    Returns:
        "HTTP_<status>" for HTTPError records; otherwise one of
        "READ_TIMEOUT", "DNS_ERROR", "CONN_REFUSED", "CONN_RESET",
        "SSL_ERROR", "UNREACHABLE", "NO_ROUTE", "CONN_CLOSED" or
        "CONN_TIMEOUT", chosen by matching the text of "err_details".

    Raises:
        ValueError: If "err_details" is not a string or matches none of the
            known patterns. (ValueError is an Exception subclass, so callers
            catching Exception keep working.)
    """
    if row["err_type"] == "HTTPError":
        status = row["status"]
        # The status may arrive as a float (e.g. 404.0) when the column it
        # came from also holds missing values; render integral floats as
        # integers so the label is "HTTP_404" rather than "HTTP_404.0".
        if isinstance(status, float) and status.is_integer():
            status = int(status)
        return "HTTP_{}".format(status)

    details = row["err_details"]
    if not isinstance(details, str):
        # Missing details (None/NaN) cannot be searched; report them as
        # unclassified instead of failing with an unrelated TypeError.
        raise ValueError("Unclassified error {}".format(details))

    # Checked in order; the first substring found in the details wins.
    substring_labels = (
        ("Read timed out", "READ_TIMEOUT"),
        ("Name or service not known", "DNS_ERROR"),
        ("No address associated with hostname", "DNS_ERROR"),
        ("Temporary failure in name resolution", "DNS_ERROR"),
        ("Connection refused", "CONN_REFUSED"),
        ("Connection reset by peer", "CONN_RESET"),
        ("SSLError", "SSL_ERROR"),
        ("Network is unreachable", "UNREACHABLE"),
        ("No route to host", "NO_ROUTE"),
        ("Remote end closed connection without response", "CONN_CLOSED"),
    )
    for needle, label in substring_labels:
        if needle in details:
            return label

    # Connect timeouts embed the host name, so they need a pattern match.
    if re.search(r"Connection to [\w\.\-]+ timed out", details) is not None:
        return "CONN_TIMEOUT"

    raise ValueError("Unclassified error {}".format(details))
def main():
    """Print the error rate for all_responses.json and chart the error causes.

    Reads ./all_responses.json into a DataFrame, treats every row with a
    non-null "err_type" as an error, prints how many of the checked URLs
    failed, then shows a bar chart of the error categories produced by
    classify_error. The chart is skipped when there are no errors to plot.
    """
    df = pd.read_json("./all_responses.json")
    num_responses = len(df)

    # An empty input has no "err_type" column to filter on, so only select
    # the error rows when there is at least one response.
    errors = df[df["err_type"].notnull()] if num_responses else df
    num_errors = len(errors)

    # Avoid dividing by zero when the input file holds no responses.
    error_pct = 100 * num_errors / num_responses if num_responses else 0.0
    print("Out of {} URLs checked, {} returned errors. ({:.2f}%)".format(num_responses, num_errors,
                                                                         error_pct))

    if num_errors == 0:
        # Nothing to classify; plotting an empty result would fail.
        return

    error_types = errors.apply(classify_error, axis=1)

    # The ggplot look is selected through the matplotlib style sheet; the
    # `style` keyword of .plot() is a per-column line style, not a theme, so
    # it is not passed here.
    plt.style.use('ggplot')
    error_types.value_counts().plot(kind="bar", ylabel="count",
                                    title="Stack Overflow Broken URL Causes")
    # Leave room at the bottom of the figure for the category tick labels.
    plt.subplots_adjust(bottom=0.15)
    plt.show()
# Run the analysis only when this file is executed as a script
# (python3 analysis.py), not when it is imported as a module.
if __name__ == "__main__":
    main()