-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipeline.py
More file actions
48 lines (31 loc) · 1.42 KB
/
pipeline.py
File metadata and controls
48 lines (31 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from sklearn.model_selection import train_test_split
import pickle
import sys
sys.path.append("Job Tag Classifier Tools")
from DataCollection import data_collection, add_new_data
from FeatureCreation import feature_creation
from FeatureProcessing import feature_processing
def DataLoader(data_file, test_size):
    """Load, featurize, and split the job-tag dataset.

    Runs the full preprocessing pipeline — collection, text-feature
    creation, and numeric feature processing — then splits the result
    into train/test sets.

    Args:
        data_file: Path/identifier passed to data_collection.
        test_size: Fraction of samples held out for testing
            (forwarded to sklearn's train_test_split).

    Returns:
        Tuple (X_train, X_test, Y_train, Y_test).
    """
    print("Starting Data Collection")
    df = data_collection(data_file)  # collect the data
    print("Starting Feature Creation")
    df = feature_creation(df)  # create some text features
    print("Starting Feature Processing")
    x, y = feature_processing(df)  # convert the text into numbers for processing
    # Fixed typo in status message ("Compelete" -> "Complete").
    print("Data Loading Complete")
    # random_state=42 keeps the split reproducible across runs.
    X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=test_size, random_state=42)  # validation
    return X_train, X_test, Y_train, Y_test
def BatchData(data_file, sql_add_new_data_string):
    """Load a batch of new (to-be-predicted) records and persist them.

    Reads the incoming data via data_collection, then writes it back
    into the new-data table using the provided SQL insert string.

    Args:
        data_file: Path/identifier passed to data_collection.
        sql_add_new_data_string: SQL statement used by add_new_data.

    Returns:
        Whatever add_new_data reports for the update.
    """
    # Collect the incoming records we are about to predict on.
    incoming = data_collection(data_file)
    # Persist them into the new-data table and report the outcome.
    return add_new_data(incoming, sql_add_new_data_string)
def tag_decoder(list_of_indices, threshold):
    """Translate prediction scores back into tag names.

    Loads the saved target tokenizer and returns the class name for
    every position in the first row of `list_of_indices` whose score
    exceeds `threshold`.

    Args:
        list_of_indices: 2-D score array; only row 0 is decoded.
        threshold: Minimum score for a tag to be included.

    Returns:
        List of tag names as strings.
    """
    with open("Models/Tokenizers/target_tokens.pkl", 'rb') as handle:
        tokenizer = pickle.load(handle)
    # Keep every class whose score clears the threshold.
    return [
        str(tokenizer.classes_[idx])
        for idx, score in enumerate(list_of_indices[0])
        if score > threshold
    ]