-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_model.py
More file actions
91 lines (74 loc) · 3.31 KB
/
test_model.py
File metadata and controls
91 lines (74 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
test_model.py
-----------------------------------
Load the trained model and generate
predicted prices for new test data.
"""
import pandas as pd
import numpy as np
import joblib
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data that clean_text needs: the English stop-word list and
# the WordNet corpus that backs WordNetLemmatizer.
for _nltk_resource in ("stopwords", "wordnet"):
    nltk.download(_nltk_resource)
# Anything that is not a lowercase ASCII letter, a digit or whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")

# Lazily filled cache so the stop-word corpus is read and the lemmatizer is
# built once per process instead of once per cleaned string.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalize free text before it is embedded.

    Steps, in order: strip emoji, lowercase, replace every character that is
    not ``a-z``, ``0-9`` or whitespace with a space, drop English stop words,
    and lemmatize the remaining tokens.

    Args:
        text: The raw value to clean. Anything that is not a ``str``
            (e.g. ``None`` or a number) is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_resources()

    text = emoji.replace_emoji(text, "")
    text = _NON_ALNUM_RE.sub(" ", text.lower())
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )
# Canonical unit label paired with every spelling that collapses onto it.
_UNIT_GROUPS = (
    ("oz", ("ounce", "ounces", "oz", "fl oz", "fl ounce", "fluid ounce", "fl", "fl. oz")),
    ("lb", ("pound", "pounds", "lb", "lbs")),
    ("g", ("gram", "grams", "g", "milligram", "mg")),
    ("kg", ("kilogram", "kilograms", "kg")),
    ("l", ("liter", "litre", "ltr", "millilitre", "milliliter", "ml")),
    ("count", ("count", "ct", "pack", "packet", "bag", "box", "case", "unit", "piece", "bottle", "ea")),
    ("sq_ft", ("sq ft", "foot", "feet")),
    ("unknown", ("none", "na", "unknown", "", "n/a")),
)

# Flattened spelling -> canonical label lookup built from the groups above.
_UNIT_LOOKUP = {
    spelling: canonical
    for canonical, spellings in _UNIT_GROUPS
    for spelling in spellings
}


def normalize_unit(unit):
    """Map a raw unit string onto a canonical unit label.

    The value is stripped of surrounding whitespace and lowercased before the
    lookup. Recognized spellings collapse to one of ``oz``, ``lb``, ``g``,
    ``kg``, ``l``, ``count``, ``sq_ft`` or ``unknown``; an unrecognized
    string is returned in its stripped, lowercased form.

    Args:
        unit: The raw unit value. Non-string values are treated as missing.

    Returns:
        The canonical label, the cleaned input when no mapping exists, or
        ``"unknown"`` when ``unit`` is not a string.
    """
    if isinstance(unit, str):
        cleaned = unit.strip().lower()
        return _UNIT_LOOKUP.get(cleaned, cleaned)
    return "unknown"
# Restore the training artifacts at import time. The pickle holds a 7-tuple:
# the price model, the text embedder, the categorical encoder, the numeric
# scaler, and the text / categorical / numeric column lists that predict_new
# uses to rebuild the feature matrix.
#
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load an artifact that comes from a trusted source.
print("📦 Loading trained model and transformers...")
model, embedder, enc, scaler, text_cols, cat_cols, num_cols = joblib.load(
    "final_transformer_lightgbm.pkl"
)
def predict_new(
    df_test_path,
    output_path="predicted_prices_transformer.csv",
    device="cuda",
    batch_size=128,
):
    """Predict prices for the rows of a CSV file and write them to a CSV.

    The feature matrix is the horizontal concatenation of, in this order:
    embeddings of the cleaned, space-joined ``text_cols``; the encoded
    ``cat_cols``; and the scaled ``num_cols``. The function relies on the
    module-level ``model``, ``embedder``, ``enc``, ``scaler`` and column
    lists loaded from the model artifact.

    Args:
        df_test_path: CSV file to score (anything ``pandas.read_csv`` accepts).
        output_path: Where the ``sample_id`` / ``price`` CSV is written.
        device: Device forwarded to ``embedder.encode``. Defaults to
            ``"cuda"``, the previously hard-coded value. Presumably ``"cpu"``
            works on hosts without a GPU; confirm against the embedder's API.
        batch_size: Batch size forwarded to ``embedder.encode``.

    Returns:
        The ``pandas.DataFrame`` with ``sample_id`` and ``price`` columns
        that was written to ``output_path``.
    """
    print(f"Reading test data: {df_test_path}")
    df_test = pd.read_csv(df_test_path).fillna("")

    # Normalize the unit column, falling back to the raw quantity unit and
    # finally to a constant when neither column is present.
    if "normalized_unit" in df_test.columns:
        df_test["normalized_unit"] = df_test["normalized_unit"].apply(normalize_unit)
    elif "quantity_unit" in df_test.columns:
        df_test["normalized_unit"] = df_test["quantity_unit"].apply(normalize_unit)
    else:
        df_test["normalized_unit"] = "unknown"

    # Text columns missing from the test file are treated as empty text.
    for col in text_cols:
        if col not in df_test.columns:
            df_test[col] = ""

    df_test["combined_text"] = df_test[text_cols].astype(str).agg(" ".join, axis=1)
    df_test["combined_text"] = df_test["combined_text"].apply(clean_text)

    X_cat_test = enc.transform(df_test[cat_cols]).astype(np.float32)
    # The blanket fillna("") above turns missing numerics into "", so map
    # those back to 0 before scaling.
    X_num_test = scaler.transform(
        df_test[num_cols].replace("", 0).fillna(0)
    ).astype(np.float32)

    print(f"⚙️ Generating text embeddings (device={device})...")
    text_emb_test = embedder.encode(
        df_test["combined_text"].tolist(),
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=device,
    ).astype(np.float32)

    X_test_all = np.hstack([text_emb_test, X_cat_test, X_num_test])
    preds = model.predict(X_test_all)

    # Fall back to positional ids when the input carries no sample_id column.
    if "sample_id" not in df_test.columns:
        df_test["sample_id"] = np.arange(len(df_test))

    submission = pd.DataFrame({"sample_id": df_test["sample_id"], "price": preds})
    submission.to_csv(output_path, index=False)
    print(f"✅ Predictions saved → {output_path}")
    print(submission.head())
    return submission
# Script entry point: score the default test file and write the predictions
# to predict_new's default output path.
if __name__ == "__main__":
    predict_new(df_test_path="test_dataset.csv")