-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataPy.py
More file actions
193 lines (156 loc) · 6.37 KB
/
DataPy.py
File metadata and controls
193 lines (156 loc) · 6.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#
# Copyright (c) 2017 Jackson Gable
#
# This file is released under the MIT License.
#
from __future__ import division
from math import sqrt
from math import fabs
#
# This class performs 1-variable statistics calculations on the data set it is initialized with.
#
class OneVarStatsCalc:
def __init__(self, data):
data.sort()
self._data = data
# Calculates and returns the mean of the data
def mean(self, data=None):
if data is None:
data = self._data
total = 0
for x in data:
total += x
return total / len(data)
# Calculates and returns the maximum value in the data
def max(self):
return self._data[len(self._data) - 1]
# Calculates and returns the minimum value in the data
def min(self):
return self._data[0]
# Calculates and returns the median value in the data
def median(self, data=None):
if data is None: # if method should perform calculation on default data set
data = self._data
max_index = len(data) - 1
if len(data) % 2 == 1: # odd-length data set
med_index = max_index // 2
median = data[med_index]
else: # even-length data set, this is where it gets (marginally more) interesting
lower_med_index = max_index // 2
upper_med_index = lower_med_index + 1
median = self.mean([data[lower_med_index], data[upper_med_index]])
return median
# Calculates and returns the first quartile of the data
def q1(self):
max_index = len(self._data) - 1
if len(self._data) % 2 == 1: # data length is odd
stop = max_index // 2
else: # data length is even
stop = (max_index // 2) + 1
return self.median(self._data[0: stop])
# Calculates and returns the third quartile of the data
def q3(self):
max_index = len(self._data) - 1
if len(self._data) % 2 == 1:
start = max_index // 2 + 1
else:
start = max_index // 2 + 1
return self.median(self._data[start: max_index + 1])
# Calculates and returns the data's range
def range(self, data=None):
if data is None:
data = self._data
return data[len(data) - 1] - data[0]
# Calculates and returns the data's interquartile range
def iqr(self):
return self.q3() - self.q1()
# Calculates and returns the population standard deviation of the data
def pop_std_dev(self):
count_inverse = len(self._data) ** -1
squared_variance_sum = 0
mean = self.mean() # Prevents self.mean() from running during each for loop iteration
for val in self._data:
squared_variance_sum += (val - mean) ** 2
std_dev_square = count_inverse * squared_variance_sum
return sqrt(std_dev_square)
# Calculates and returns the sample standard deviation of the data
# TODO: Write test method for this
def std_dev(self):
count_inverse = (len(self._data) - 1) ** -1
squared_variance_sum = 0
mean = self.mean() # Prevents self.mean() from running during each for loop iteration
for val in self._data:
squared_variance_sum += (val - mean) ** 2
std_dev_square = count_inverse * squared_variance_sum
return sqrt(std_dev_square)
# Calculates and returns the upper fence of the data.
# Numbers in the data set higher than the value of the upper fence are considered outliers
def upper_fence(self):
return self.iqr() * 1.5 + self.q3()
# Calculates and returns the lower fence of the data.
# Numbers in the data set lower than the value of the lower fence are considered outliers
def lower_fence(self):
return self.q1() - self.iqr() * 1.5
# Calculates and returns the outliers in the data set
def outliers(self):
outliers = []
uf = self.upper_fence()
lf = self.lower_fence()
for x in self._data:
if x > uf or x < lf:
outliers.append(x)
return outliers
# This class accepts two lists of equal length, the first being the list of x-values
# and the second being the list of y-values.
class TwoVarStatsCalc:
def __init__(self, x_list, y_list):
if len(x_list) == len(y_list):
self._x_list = x_list
self._y_list = y_list
self._x_calc = OneVarStatsCalc(x_list)
self._y_calc = OneVarStatsCalc(y_list)
# This variable is also equal to len(self._y_list), since len(self._x_list) == len(self._y_list)
self._data_length = len(self._x_list)
else:
raise Exception("Lists must be of equal length.")
# Calculates and returns the correlation coefficient of the data
def r(self):
x_mean = self._x_calc.mean()
y_mean = self._y_calc.mean()
x_sd = self._x_calc.std_dev()
y_sd = self._y_calc.std_dev()
x_z_scores = []
y_z_scores = []
for x in self._x_list:
z = (x - x_mean) / x_sd
x_z_scores.append(z)
for y in self._y_list:
z = (y - y_mean) / y_sd
y_z_scores.append(z)
z_products = []
for index in range(0, self._data_length):
z = x_z_scores[index] * y_z_scores[index]
z_products.append(z)
z_sum = 0
for z in z_products:
z_sum += z
return z_sum / (self._data_length - 1)
# Calculates the coefficient of determination of the data
def r2(self):
return self.r() ** 2
# Calculates and returns the slope of the linear regression line
def linear_regression_slope(self):
return self.r() * self._y_calc.std_dev() / self._x_calc.std_dev()
# Calculates and returns the y-intercept of the linear regression line
def linear_regression_y_int(self):
return self._y_calc.mean() - self.linear_regression_slope() * self._x_calc.mean()
# Calculates the equation of the linear regression line and returns it as a string
def linear_regression_eq(self):
m = self.linear_regression_slope()
b = self.linear_regression_y_int()
if b < 0:
return "y = {}x - {}".format(m, fabs(b))
elif b > 0:
return "y = {}x + {}".format(m, b)
elif b == 0: # Very unlikely, just covering all the bases.
return "y = {}x".format(m)