diff --git a/README.md b/README.md index 58055a0..54c8809 100644 --- a/README.md +++ b/README.md @@ -1 +1,35 @@ -# -deepshekhardas.github.io \ No newline at end of file +# MovieLens Hybrid Recommender (Streamlit) + +A hybrid movie recommendation system combining Collaborative Filtering (SVD) with Content-Based similarity (genres + tags), presented with a modern Streamlit UI. + +## Features +- Hybrid scoring: SVD CF + TF-IDF (genres + tags) +- Enter an existing user ID or pick liked/disliked movies +- Top-N recommendations with posters, genres, and predicted ratings +- Optional analytics: top genres, popular movies +- Optional TMDB poster lookup via `TMDB_API_KEY`; falls back to placeholder + +## Setup +1. Python 3.10+ recommended +2. Install dependencies: +```bash +pip install -r requirements.txt +``` +3. Download MovieLens (ml-latest-small): +```bash +python scripts/download_movielens.py --out_dir data +``` +4. Run the app: +```bash +streamlit run app.py +``` + +## Environment +- Optional: export `TMDB_API_KEY` for posters +```bash +export TMDB_API_KEY=your_key_here +``` + +## Notes +- The SVD model is (re)trained on startup for demo simplicity (small dataset). When you adjust liked/disliked movies, the system may re-fit quickly to include your new ratings. +- This is a demo-oriented implementation prioritizing clarity and presentation. \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000..fc5132d --- /dev/null +++ b/app.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import List + +import joblib +import numpy as np +import pandas as pd +import streamlit as st + +from src.data_loader import load_movielens, build_popularity +from src.content_model import fit_content_model +from src.cf_model import train_svd +from src.hybrid import hybrid_recommend +from src.posters import get_poster_url + +ARTIFACTS = Path("artifacts") +DATA_DIR = Path("data/ml-latest-small") + +st.set_page_config(page_title="Hybrid Movie Recommender", page_icon="🎬", layout="wide") + +# Minimal CSS for card-style layout +CARD_CSS = """ + +""" + +st.markdown(CARD_CSS, unsafe_allow_html=True) + +@st.cache_resource(show_spinner=False) +def get_data(): + mld = load_movielens(DATA_DIR) + pop = build_popularity(mld) + return mld, pop + +@st.cache_resource(show_spinner=False) +def get_models(mld): + vectorizer, tfidf, sim = fit_content_model(mld.movies, mld.tags) + cf = train_svd(mld.ratings) + return (vectorizer, tfidf, sim), cf + + +def ensure_dataset(): + if not DATA_DIR.exists(): + st.warning("Dataset not found. Click the button to download MovieLens ml-latest-small.") + if st.button("Download Dataset"): + from scripts.download_movielens import download_and_extract, ML_SMALL_URL + download_and_extract(ML_SMALL_URL, Path("data")) + st.experimental_rerun() + + +def render_cards(df: pd.DataFrame): + cols = st.columns(5) + for i, (_, row) in enumerate(df.iterrows()): + col = cols[i % 5] + with col: + with st.container(border=False): + st.markdown('
', unsafe_allow_html=True) + poster = get_poster_url(row["clean_title"], None if pd.isna(row["year"]) else int(row["year"])) + st.image(poster, use_column_width=True) + st.markdown(f"
{row['clean_title']}
", unsafe_allow_html=True) + st.markdown(f"
{', '.join(row['genres_list'])}
", unsafe_allow_html=True) + st.markdown(f"
Hybrid: {row['hybrid_score']:.3f} | CF: {row['cf_score']:.3f} | Content: {row['content_score']:.3f}
", unsafe_allow_html=True) + st.markdown('
', unsafe_allow_html=True) + + +def main(): + st.title("🎬 Hybrid Movie Recommender") + st.caption("MovieLens ml-latest-small | SVD + TF-IDF (genres + tags)") + + ensure_dataset() + if not DATA_DIR.exists(): + st.stop() + + with st.spinner("Loading data and training models..."): + mld, pop = get_data() + (content_vec, tfidf, sim), cf_model = get_models(mld) + + st.sidebar.header("Input") + mode = st.sidebar.radio("Mode", ["Existing User ID", "Pick Likes/Dislikes"]) + top_n = st.sidebar.slider("Top N", 5, 20, 10) + alpha_cf = st.sidebar.slider("CF weight", 0.0, 1.0, 0.6, 0.05) + alpha_content = 1.0 - alpha_cf + + all_movie_ids = mld.movies["movieId"].tolist() + + liked_ids: List[int] = [] + disliked_ids: List[int] = [] + user_id = None + existing_user_ids = sorted(mld.ratings["userId"].unique().tolist()) + + if mode == "Existing User ID": + user_id = st.sidebar.selectbox("User ID", existing_user_ids, index=0) + user_rated = mld.ratings[mld.ratings["userId"] == user_id]["movieId"].tolist() + st.sidebar.caption(f"User has rated {len(user_rated)} movies") + exclude = user_rated + else: + search_titles = mld.movies.sort_values("clean_title")["clean_title"].tolist() + likes = st.sidebar.multiselect("Liked movies", search_titles[:5000]) + dislikes = st.sidebar.multiselect("Disliked movies", search_titles[:5000]) + # Map back to ids + liked_ids = mld.movies[mld.movies["clean_title"].isin(likes)]["movieId"].tolist() + disliked_ids = mld.movies[mld.movies["clean_title"].isin(dislikes)]["movieId"].tolist() + exclude = liked_ids + disliked_ids + + if st.sidebar.button("Recommend"): + with st.spinner("Scoring recommendations..."): + recs = hybrid_recommend( + movies=mld.movies, + cf_model=cf_model, + content_sim_matrix=sim, + all_movie_ids=all_movie_ids, + user_id=user_id, + liked_movie_ids=liked_ids, + disliked_movie_ids=disliked_ids, + alpha_cf=alpha_cf, + alpha_content=alpha_content, + exclude_rated=exclude, + top_n=top_n, + ) + + st.subheader("Top Recommendations") + render_cards(recs) + + st.divider() + st.subheader("Analytics") + col1, col2, col3 = st.columns([1,1,1]) + with col1: + st.metric("Ratings count", len(mld.ratings)) + st.metric("Users", mld.ratings["userId"].nunique()) + st.metric("Movies", len(mld.movies)) + with col2: + st.write("Model RMSE (cf):", f"{cf_model.rmse:.4f}") + with col3: + top_pop = ( + mld.movies.merge(pop, on="movieId").sort_values(["pop_count", "pop_mean"], ascending=False).head(10) + ) + st.write("Most Rated Movies:") + st.dataframe(top_pop[["clean_title","genres","pop_count","pop_mean"]], hide_index=True, use_container_width=True) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/movielens_hybrid_recommender.zip b/movielens_hybrid_recommender.zip new file mode 100644 index 0000000..bc4422b Binary files /dev/null and b/movielens_hybrid_recommender.zip differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0c35f64 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +pandas==2.2.2 +numpy==1.26.4 +scikit-learn==1.5.2 +scikit-surprise==1.1.4 +streamlit==1.37.1 +requests==2.32.3 +plotly==5.23.0 +matplotlib==3.9.2 +scipy==1.13.1 +joblib==1.4.2 \ No newline at end of file diff --git a/scripts/download_movielens.py b/scripts/download_movielens.py new file mode 100644 index 0000000..166e760 --- /dev/null +++ b/scripts/download_movielens.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import argparse +import io +import os +import sys +import zipfile +from pathlib import Path + +import requests + +ML_SMALL_URL = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip" + + +def download_and_extract(url: str, out_dir: Path) -> Path: + out_dir.mkdir(parents=True, exist_ok=True) + zip_path = out_dir / "ml-latest-small.zip" + print(f"Downloading {url} -> {zip_path} ...") + r = requests.get(url, timeout=60) + r.raise_for_status() + zip_path.write_bytes(r.content) + + print("Extracting...") + with zipfile.ZipFile(io.BytesIO(r.content)) as zf: + zf.extractall(out_dir) + + dataset_dir = out_dir / "ml-latest-small" + if not dataset_dir.exists(): + print("Extraction failed: directory not found", file=sys.stderr) + sys.exit(1) + print(f"Dataset available at: {dataset_dir}") + return dataset_dir + + +def main(): + parser = argparse.ArgumentParser(description="Download MovieLens ml-latest-small") + parser.add_argument("--out_dir", type=str, default="data", help="Output directory") + args = parser.parse_args() + + out = Path(args.out_dir) + download_and_extract(ML_SMALL_URL, out) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/smoke_test.py b/scripts/smoke_test.py new file mode 100644 index 0000000..0e07860 --- /dev/null +++ b/scripts/smoke_test.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +from pathlib import Path + +from src.data_loader import load_movielens +from src.content_model import fit_content_model +from src.cf_model import train_svd +from src.hybrid import hybrid_recommend + +DATA_DIR = Path("data/ml-latest-small") + +def main(): + print("Loading data...") + mld = load_movielens(DATA_DIR) + print(f"Movies: {len(mld.movies)}, Ratings: {len(mld.ratings)}, Tags: {len(mld.tags)}") + print("Training content model...") + _, _, sim = fit_content_model(mld.movies, mld.tags) + print("Training SVD model...") + cf = train_svd(mld.ratings) + print(f"CF RMSE: {cf.rmse:.4f}") + + user_id = int(mld.ratings['userId'].sample(1, random_state=42).iloc[0]) + print(f"Generating recommendations for user {user_id}...") + all_movie_ids = mld.movies['movieId'].tolist() + user_rated = mld.ratings[mld.ratings['userId'] == user_id]['movieId'].tolist() + recs = hybrid_recommend( + movies=mld.movies, + cf_model=cf, + content_sim_matrix=sim, + all_movie_ids=all_movie_ids, + user_id=user_id, + exclude_rated=user_rated, + top_n=5, + ) + for i, row in recs.iterrows(): + print(f"- {row['clean_title']} ({row['year']}) -> {row['hybrid_score']:.3f}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/cf_model.py b/src/cf_model.py new file mode 100644 index 0000000..6c1445d --- /dev/null +++ b/src/cf_model.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from typing import Iterable, List, Tuple + +import numpy as np +import pandas as pd +from surprise import Dataset, Reader, SVD +from surprise.model_selection import train_test_split +from surprise.accuracy import rmse + + +class SVDModel: + def __init__(self, model: SVD, trainset): + self.model = model + self.trainset = trainset + self.rmse = None + + +def train_svd(ratings: pd.DataFrame, n_factors: int = 100, n_epochs: int = 20, random_state: int = 42) -> SVDModel: + reader = Reader(rating_scale=(ratings["rating"].min(), ratings["rating"].max())) + dataset = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader) + trainset, testset = train_test_split(dataset, test_size=0.2, random_state=random_state) + + algo = SVD(n_factors=n_factors, n_epochs=n_epochs, biased=True, random_state=random_state, verbose=False) + algo.fit(trainset) + preds = algo.test(testset) + err = rmse(preds, verbose=False) + wrapper = SVDModel(algo, trainset) + wrapper.rmse = err + return wrapper + + +def predict_for_user(model: SVDModel, user_id: int, candidate_movie_ids: Iterable[int]) -> pd.Series: + algo = model.model + # surprise expects raw ids; it will map internally via trainset + preds = [algo.predict(uid=user_id, iid=mid, r_ui=None, verbose=False) for mid in candidate_movie_ids] + ests = {int(p.iid): p.est for p in preds} + return pd.Series(ests) \ No newline at end of file diff --git a/src/content_model.py b/src/content_model.py new file mode 100644 index 0000000..2c2369a --- /dev/null +++ b/src/content_model.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from typing import Tuple, List + +import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity + + +def _build_text_corpus(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]: + # Aggregate tags per movie + tags_agg = ( + tags.groupby("movieId")["tag"].apply(lambda s: " ".join(map(str, s.tolist()))) + if len(tags) > 0 + else pd.Series(dtype=str) + ) + + movies = movies.copy() + movies["tags_text"] = movies["movieId"].map(tags_agg).fillna("") + movies["genres_text"] = movies["genres"].fillna("").str.replace("|", " ") + movies["text"] = (movies["clean_title"].fillna("") + " " + movies["genres_text"] + " " + movies["tags_text"]).str.lower() + return movies, movies["text"].tolist() + + +def fit_content_model(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[TfidfVectorizer, np.ndarray, np.ndarray]: + movies_text_df, corpus = _build_text_corpus(movies, tags) + vectorizer = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1,2)) + tfidf = vectorizer.fit_transform(corpus) + # Item-item cosine similarity + sim = cosine_similarity(tfidf) + return vectorizer, tfidf, sim + + +def content_similarity_scores( + liked_movie_ids: List[int], + disliked_movie_ids: List[int], + sim_matrix: np.ndarray, + all_movie_ids: List[int], + alpha_like: float = 1.0, + alpha_dislike: float = 1.0, +) -> pd.Series: + """Compute content similarity scores from liked and disliked movie sets.""" + id_to_index = {mid: idx for idx, mid in enumerate(all_movie_ids)} + scores = np.zeros(len(all_movie_ids), dtype=float) + + if liked_movie_ids: + idxs = [id_to_index[m] for m in liked_movie_ids if m in id_to_index] + if idxs: + scores += alpha_like * np.mean(sim_matrix[idxs, :], axis=0) + + if disliked_movie_ids: + idxs = [id_to_index[m] for m in disliked_movie_ids if m in id_to_index] + if idxs: + scores -= alpha_dislike * np.mean(sim_matrix[idxs, :], axis=0) + + return pd.Series(scores, index=all_movie_ids) \ No newline at end of file diff --git a/src/data_loader.py b/src/data_loader.py new file mode 100644 index 0000000..3734673 --- /dev/null +++ b/src/data_loader.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Tuple + +import pandas as pd + + +@dataclass +class MovieLensData: + movies: pd.DataFrame + ratings: pd.DataFrame + tags: pd.DataFrame + + +def load_movielens(dataset_dir: str | Path) -> MovieLensData: + dataset_path = Path(dataset_dir) + movies_path = dataset_path / "movies.csv" + ratings_path = dataset_path / "ratings.csv" + tags_path = dataset_path / "tags.csv" + + if not movies_path.exists() or not ratings_path.exists(): + raise FileNotFoundError( + f"Could not find movies.csv/ratings.csv in {dataset_path}. Run downloader or check path." + ) + + movies = pd.read_csv(movies_path) + ratings = pd.read_csv(ratings_path) + tags = pd.read_csv(tags_path) if tags_path.exists() else pd.DataFrame(columns=["userId","movieId","tag","timestamp"]) + + # Basic cleanup + movies["year"] = movies["title"].str.extract(r"\((\d{4})\)").astype("Int64") + movies["clean_title"] = movies["title"].str.replace(r"\s*\(\d{4}\)$", "", regex=True) + movies["genres_list"] = movies["genres"].fillna("").apply(lambda g: [x for x in g.split("|") if x != "(no genres listed)"]) + + return MovieLensData(movies=movies, ratings=ratings, tags=tags) + + +def build_popularity(mld: MovieLensData) -> pd.DataFrame: + pop = ( + mld.ratings.groupby("movieId")["rating"] + .agg(pop_count="count", pop_mean="mean") + .reset_index() + ) + return pop \ No newline at end of file diff --git a/src/hybrid.py b/src/hybrid.py new file mode 100644 index 0000000..2fcb98d --- /dev/null +++ b/src/hybrid.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd + +from .content_model import content_similarity_scores +from .cf_model import SVDModel, predict_for_user + + +def _minmax(series: pd.Series) -> pd.Series: + if series.max() == series.min(): + return pd.Series(0.5, index=series.index) + return (series - series.min()) / (series.max() - series.min()) + + +def hybrid_recommend( + movies: pd.DataFrame, + cf_model: Optional[SVDModel], + content_sim_matrix, + all_movie_ids: List[int], + user_id: Optional[int] = None, + liked_movie_ids: Optional[List[int]] = None, + disliked_movie_ids: Optional[List[int]] = None, + alpha_cf: float = 0.6, + alpha_content: float = 0.4, + exclude_rated: Optional[List[int]] = None, + top_n: int = 10, +) -> pd.DataFrame: + liked_movie_ids = liked_movie_ids or [] + disliked_movie_ids = disliked_movie_ids or [] + exclude_rated = set(exclude_rated or []) + + # CF scores (if model/user provided) + cf_scores = pd.Series(0.0, index=all_movie_ids) + if cf_model is not None and (user_id is not None or liked_movie_ids): + cf_scores = predict_for_user(cf_model, user_id if user_id is not None else -1, all_movie_ids) + cf_scores = cf_scores.reindex(all_movie_ids).fillna(cf_scores.mean()) + + # Content scores + content_scores = content_similarity_scores(liked_movie_ids, disliked_movie_ids, content_sim_matrix, all_movie_ids) + + # Normalize + cf_norm = _minmax(cf_scores) + content_norm = _minmax(content_scores) + + hybrid = alpha_cf * cf_norm + alpha_content * content_norm + + # Exclude rated/selected movies + for mid in liked_movie_ids + list(disliked_movie_ids) + list(exclude_rated): + if mid in hybrid.index: + hybrid.loc[mid] = -1.0 + + top_ids = hybrid.sort_values(ascending=False).head(top_n).index.tolist() + res = movies[movies["movieId"].isin(top_ids)].copy() + res["hybrid_score"] = res["movieId"].map(hybrid) + res["cf_score"] = res["movieId"].map(cf_scores) + res["content_score"] = res["movieId"].map(content_scores) + return res.sort_values("hybrid_score", ascending=False) \ No newline at end of file diff --git a/src/posters.py b/src/posters.py new file mode 100644 index 0000000..408d1fc --- /dev/null +++ b/src/posters.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import os +from typing import Optional + +import requests + +TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie" +TMDB_IMG_BASE = "https://image.tmdb.org/t/p/w342" +PLACEHOLDER = "https://via.placeholder.com/342x513?text=No+Poster" + + +def get_poster_url(title: str, year: Optional[int] = None, api_key: Optional[str] = None) -> str: + api_key = api_key or os.getenv("TMDB_API_KEY") + if not api_key: + return PLACEHOLDER + try: + params = {"api_key": api_key, "query": title} + if year: + params["year"] = int(year) + r = requests.get(TMDB_SEARCH_URL, params=params, timeout=10) + r.raise_for_status() + data = r.json() + results = data.get("results", []) + if not results: + return PLACEHOLDER + poster_path = results[0].get("poster_path") + if not poster_path: + return PLACEHOLDER + return f"{TMDB_IMG_BASE}{poster_path}" + except Exception: + return PLACEHOLDER \ No newline at end of file