-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcharities_toyscript.py
More file actions
162 lines (129 loc) · 5.08 KB
/
charities_toyscript.py
File metadata and controls
162 lines (129 loc) · 5.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#
# charities-toyscript
# Author: Emily Quinn Finney
# A very toy implementation of a content-based recommender system
#
# Current fixes:
# - write tests for every part of this code, probably using pytest
import functools
import json

import numpy as np
import pandas as pd

try:
    import psyco
    psyco.full()
except ImportError:
    pass
# Names exported by a star-import of this module. user_preference and
# test_decorator are defined in this file but are not listed here.
__all__ = ['Data', 'User', 'calculate_similarity', 'find_best_match']
def test_decorator(function):
"""
Testing a decorator that prints all args and kwargs for a function.
"""
def function_wrapper(*args, **kwargs):
for value in args, kwargs:
print(value)
return function(*args, **kwargs)
return function_wrapper
class Data:
    """
    Item feature table loaded from a JSON file or a whitespace-delimited
    text file and held in ``self.data`` as a pandas DataFrame.
    """

    def __init__(self, filename, config_file=None):
        """
        Loads the data set and optionally applies per-column converters.
        :param filename: path to the data. Names ending in ``.json`` are
            parsed as JSON and passed to ``pd.DataFrame``; anything else is
            read as whitespace-separated text whose first column is the
            index and whose ``#`` lines are comments.
        :param config_file: optional mapping of column name to a callable
            that receives the column and returns its replacement.
        """
        if str(filename).endswith('.json'):
            # json.load needs an open file object, not a path.
            with open(filename, encoding='utf-8') as handle:
                self.data = pd.DataFrame(json.load(handle))
        else:
            self.data = pd.read_csv(filename, sep=r'\s+', index_col=0,
                                    comment='#')
        # Store the configuration before configure() runs, since it reads it.
        self.configuration = config_file
        if config_file:
            self.configure()

    def configure(self):
        """
        Applies the converters in ``self.configuration`` to the matching
        columns of ``self.data``. Columns without a converter are left
        unchanged.
        """
        converters = self.configuration or {}
        for column in self.data:
            if column in converters:
                self.data[column] = converters[column](self.data[column])

    def normalize(self):
        """
        Min-max scales every int/float column of ``self.data`` in place.

        A column whose minimum equals its maximum is divided by 1 instead of
        0, so it becomes all zeros rather than NaN.
        :return: ``(self, min_list, max_list)`` where the lists hold the
            original minimum and maximum of each scaled column, in column
            order.
        """
        min_list = []
        max_list = []
        for column in self.data.select_dtypes(include=['int', 'float']):
            values = self.data[column]
            min_value = np.min(values)
            max_value = np.max(values)
            span = max_value - min_value
            self.data[column] = (values - min_value) / (span if span != 0 else 1)
            min_list.append(min_value)
            max_list.append(max_value)
        return self, min_list, max_list

    def standardize(self):
        """
        Centers and scales every int/float column of ``self.data`` in place
        using the column's ``np.mean`` and ``np.std``.

        A column whose standard deviation is zero is divided by 1 instead of
        0, so it becomes all zeros rather than NaN.
        :return: ``(self, avg_list, std_list)`` where the lists hold the
            original mean and standard deviation of each scaled column, in
            column order.
        """
        avg_list = []
        std_list = []
        for column in self.data.select_dtypes(include=['int', 'float']):
            values = self.data[column]
            avg_value = np.mean(values)
            std_value = np.std(values)
            self.data[column] = (values - avg_value) / (std_value if std_value != 0 else 1)
            avg_list.append(avg_value)
            std_list.append(std_value)
        return self, avg_list, std_list
class User:
    """
    A user's preference vector, stored in ``self.vector`` as a float numpy
    array, with helpers to rescale it using statistics from a data set.
    """

    def __init__(self, user_vec, config_user=None):
        """
        :param user_vec: sequence of numeric feature preferences.
        :param config_user: optional configuration object, stored on
            ``self.configuration``.
        """
        self.vector = np.array(user_vec, dtype=float)
        self.configuration = config_user
        if config_user:
            # NOTE(review): this class defines no configure() method, so an
            # unconditional call would raise AttributeError. Only call the
            # hook when something (e.g. a subclass) provides one.
            configure = getattr(self, 'configure', None)
            if callable(configure):
                configure()

    def _leading(self, values):
        """Returns the first ``len(self.vector)`` entries of ``values`` as floats."""
        size = len(self.vector)
        if len(values) < size:
            raise IndexError('expected at least %d values, got %d'
                             % (size, len(values)))
        return np.asarray(values, dtype=float)[:size]

    def _rescale(self, offsets, scales):
        """Applies ``(vector - offsets) / scales`` in place; zero scales act as 1."""
        safe_scales = np.where(scales == 0, 1.0, scales)
        self.vector[:] = (self.vector - offsets) / safe_scales
        return self

    def normalize(self, min_list, max_list):
        """
        Min-max scales the vector in place.
        :param min_list: per-feature minimums, at least as long as the vector.
        :param max_list: per-feature maximums, at least as long as the vector.
        :return: self. Features whose minimum equals their maximum are only
            shifted by the minimum instead of being divided by zero.
        :raises IndexError: if either list is shorter than the vector.
        """
        lows = self._leading(min_list)
        highs = self._leading(max_list)
        return self._rescale(lows, highs - lows)

    def standardize(self, avg_list, std_list):
        """
        Centers and scales the vector in place.
        :param avg_list: per-feature means, at least as long as the vector.
        :param std_list: per-feature standard deviations, at least as long
            as the vector.
        :return: self. Features whose standard deviation is zero are only
            centered instead of being divided by zero.
        :raises IndexError: if either list is shorter than the vector.
        """
        return self._rescale(self._leading(avg_list), self._leading(std_list))
def user_preference(charity_list, database):
    """
    Determines the user's preference vector, based on other charities the user prefers.
    :param charity_list: a non-empty list of charities the user likes, given
                         as row labels of ``database.data`` (list of strings).
                         (future: add weights)
    :param database: the data from which to obtain the user's preferred charity;
                     any object exposing a DataFrame as ``.data``.
    :return: user_preference, a float array holding the element-wise mean of
             the feature rows of the listed charities.
    :raises IndexError: if charity_list is empty.
    :raises KeyError: if a listed charity is not a row label of the data.
    """
    # A list (not a tuple) is required so .loc treats it as row labels.
    charities = list(charity_list)
    if not charities:
        raise IndexError('charity_list must name at least one charity')
    # Select the rows by label directly instead of transposing the whole
    # frame once per charity.
    rows = database.data.loc[charities]
    return rows.to_numpy(dtype=float).sum(axis=0) / len(charities)
def calculate_similarity(vec1, vec2):
    """
    Determines the cosine similarity between two vectors.

    Inputs are converted to float arrays before any arithmetic, so large
    integer features cannot overflow a fixed-width integer dtype.
    :param vec1: A vector of features in the data set (series).
    :param vec2: Another vector of features in the data set (series).
    :return: The cosine similarity, a float; 0.0 when either vector has
             zero length, where the ratio would otherwise be 0/0.
    >>> round(calculate_similarity([5, 78, 523456], [5, 78, 523456]), 6)
    1.0
    >>> round(calculate_similarity([1, 1, 1], [0, 1, 0]), 6)
    0.57735
    >>> round(calculate_similarity([5, 70, 523456], [5, 78, 523456]), 12)
    0.999999999883
    >>> calculate_similarity([0, 0, 0], [1, 2, 3])
    0.0
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    # Return a plain float so the repr is the same on every numpy version.
    return float(np.dot(vec1, vec2) / norm_product)
def find_best_match(user_vec, item_matrix):
    """
    Determines the row in the data set that best matches the user's
    preferences.
    :param user_vec: A vector of preferred user features (series).
    :param item_matrix: An object whose ``.data`` attribute is a data frame
                        containing item features, one item per row.
    :return: The index label of the row in the data frame that best matches
             user preferences.
    >>> find_best_match([0, 1, 0], Data('charities_toydata.txt'))
    'Cancer Institute'
    >>> find_best_match([0.5,0.5,1,0.5,0.5,0.5,0], Data('cat_food.txt'))
    'Feed the Feline Chicken'
    """
    # Convert once here rather than once per row inside the lambda.
    user_vec = np.asarray(user_vec)
    # A larger similarity score corresponds to a closer match to the user,
    # so the best item is the row with the maximum score.
    similarity_scores = item_matrix.data.apply(
        lambda row: calculate_similarity(user_vec, row), axis=1)
    return similarity_scores.idxmax()
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod
    testmod()