-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain_linearsvm.py
More file actions
103 lines (92 loc) · 3.79 KB
/
main_linearsvm.py
File metadata and controls
103 lines (92 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import sentence_polarity
from nltk.tokenize import sent_tokenize
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from Polarity_NLTK.dbhandler2 import DatabaseHandler
from sklearn.feature_extraction.text import TfidfTransformer
### LinearSVM without additional preprocessing, with and without the tf-idf transformer
# Training sentences: all sentences from the 'pos' category first, then all
# sentences from the 'neg' category.
positive_sentences = sent_tokenize(sentence_polarity.raw(categories='pos'))
negative_sentences = sent_tokenize(sentence_polarity.raw(categories='neg'))
train = positive_sentences + negative_sentences
# Number of positive sentences, i.e. the index at which the negative part of
# `train` starts.
l = len(positive_sentences)
# One label per training sentence, in the same order as `train`.
target = ['pos'] * l + ['neg'] * (len(train) - l)
# Candidate values for the classifier's `alpha` parameter tried during
# parameter tuning: two fixed small values followed by the powers of two
# 2**-15 ... 2**0.
# Every candidate is listed exactly once: appending the two fixed values on
# each loop iteration would put the same value into the grid repeatedly and
# make the grid search refit identical settings.
alpha = [0.00001, 0.000001]
alpha.extend(2 ** exponent for exponent in range(-15, 1))
#get parameters tuned for classifier only (approach referred to in the report)
# params = {'clf__alpha': (alpha),
# 'clf__penalty': ('l2', 'elasticnet'),
# 'clf__loss': ('hinge', 'squared_hinge'),
# 'clf__n_iter': (10, 50, 80),
# 'clf__fit_intercept': (True, False),
# 'clf__l1_ratio': (0.05, 0.1, 0.15, 0.2, 0.25),
# 'clf__power_t': (0.1, 0.25, 0.5, 0.75, 1)
# }
#pipeline to be processed by GridSearchCV with classifier and CountVectorizer only
# pipeline = Pipeline([
# ('vect', CountVectorizer()),
# ('clf', SGDClassifier()),
# ])
# Search space for tuning the classifier together with the count vectorizer and
# the tf-idf transformer. Each key has the form '<pipeline step>__<parameter>'.
params = {}

# 'vect' step (CountVectorizer)
params['vect__max_df'] = (0.5, 0.75, 1.0)
params['vect__max_features'] = (None, 5000, 10000, 50000)
params['vect__ngram_range'] = ((1, 1), (1, 2))

# 'tfidf' step (TfidfTransformer)
params['tfidf__use_idf'] = (True, False)
params['tfidf__norm'] = ('l1', 'l2')

# 'clf' step (SGDClassifier)
params['clf__alpha'] = alpha
params['clf__penalty'] = ('l2', 'elasticnet')
params['clf__loss'] = ('hinge', 'squared_hinge')
# NOTE(review): newer scikit-learn releases use `max_iter` instead of `n_iter`
# for SGDClassifier -- confirm this key against the installed version.
params['clf__n_iter'] = (10, 50, 80)
params['clf__fit_intercept'] = (True, False)
params['clf__l1_ratio'] = (0.05, 0.1, 0.15, 0.2, 0.25)
params['clf__power_t'] = (0.1, 0.25, 0.5, 0.75, 1)
# Estimator chain to be tuned by GridSearchCV: count vectorizer, then tf-idf
# transformer, then the SGD classifier. The step names are the prefixes used
# in the parameter grid keys.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]
pipeline = Pipeline(pipeline_steps)
# Parameter tuning: search the grid in `params` over the pipeline, fitting on
# the labelled training sentences.
gridsearch = GridSearchCV(estimator=pipeline, param_grid=params)
gridsearch.fit(train, target)

# Report the settings of the best pipeline found.
best_settings = gridsearch.best_estimator_.get_params()
print(best_settings)

# Score on the same data that was used for fitting.
train_score = gridsearch.score(train, target)
print(train_score)
# Fetch the delta: articles in NewsArticlesBOW that have no row in
# NewsArticlesLinearSVM_B yet, i.e. that did not get a sentiment so far.
handler = DatabaseHandler()
pending_articles_query = """Select n.source_uri as 'source_uri', n.bow as 'bow' from NewsArticlesBOW n WHERE n.source_uri NOT IN (Select s.source_uri FROM NewsArticlesLinearSVM_B s);
"""
result = handler.execute(pending_articles_query)

# Predict a sentiment for every fetched article and persist it, one record per
# article, into NewsArticlesLinearSVM_B.
for row in result:
    article_text = str(row['bow'])
    print(article_text)
    # Prediction for this single article.
    sent = gridsearch.predict([article_text])
    uri = str(row["source_uri"])
    # Stored encoding: 'pos' -> 1, 'neg' -> 0; any other prediction is passed
    # through unchanged.
    if sent == 'pos':
        sent = 1
    elif sent == 'neg':
        sent = 0
    print(sent, uri)
    record = {'source_uri': uri, 'sentiment': sent}
    handler.persistDict('NewsArticlesLinearSVM_B', [record])