-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathembedding.py
More file actions
22 lines (18 loc) · 786 Bytes
/
embedding.py
File metadata and controls
22 lines (18 loc) · 786 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from os import listdir
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def tfid(text_vector):
    """Build a dense TF-IDF matrix for a corpus of pre-tokenized documents.

    Each element of ``text_vector`` is joined with single spaces into one
    string; a fresh ``TfidfVectorizer`` with default settings is then fitted
    on those strings and used to vectorize the same strings.

    Args:
        text_vector: Iterable of documents, each an iterable of string
            tokens (anything ``' '.join`` accepts).

    Returns:
        The dense array obtained by calling ``.toarray()`` on the TF-IDF
        result, one row per input document.
    """
    # TfidfVectorizer consumes raw strings, so glue each token list back
    # together; tqdm only adds a progress bar labelled "Vectorizing...".
    documents = [" ".join(tweet) for tweet in tqdm(text_vector, "Vectorizing...")]
    # fit_transform learns the vocabulary/idf weights and vectorizes in a
    # single pass instead of analyzing the whole corpus twice with a
    # separate fit() followed by transform() on identical data.
    return TfidfVectorizer().fit_transform(documents).toarray()
def tfid_test(train_vectors, test_vectors):
    """Vectorize a test corpus with TF-IDF weights learned from a train corpus.

    Both arguments are iterables of documents, each document being an
    iterable of string tokens. Tokens are joined with single spaces, a
    default ``TfidfVectorizer`` is fitted on the joined ``train_vectors``
    only, and the joined ``test_vectors`` are transformed with that fitted
    vectorizer.

    Args:
        train_vectors: Tokenized documents used to fit the vectorizer.
        test_vectors: Tokenized documents to transform.

    Returns:
        The dense array obtained by calling ``.toarray()`` on the transformed
        test documents.
    """
    # Fit on the training text only, so the test documents do not influence
    # the learned vocabulary or idf weights.
    fitted = TfidfVectorizer().fit(list(map(" ".join, train_vectors)))
    test_documents = list(map(" ".join, test_vectors))
    return fitted.transform(test_documents).toarray()