Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,35 @@
# -deepshekhardas.github.io
# MovieLens Hybrid Recommender (Streamlit)

A hybrid movie recommendation system combining Collaborative Filtering (SVD) with Content-Based similarity (genres + tags), presented with a modern Streamlit UI.

## Features
- Hybrid scoring: SVD CF + TF-IDF (titles + genres + tags)
- Enter an existing user ID or pick liked/disliked movies
- Top-N recommendations with posters, genres, and predicted ratings
- Optional analytics: top genres, popular movies
- Optional TMDB poster lookup via `TMDB_API_KEY`; falls back to placeholder

## Setup
1. Python 3.10+ recommended
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Download MovieLens (ml-latest-small):
```bash
python scripts/download_movielens.py --out_dir data
```
4. Run the app:
```bash
streamlit run app.py
```

## Environment
- Optional: export `TMDB_API_KEY` for posters
```bash
export TMDB_API_KEY=your_key_here
```

## Notes
- The SVD model is (re)trained on startup for demo simplicity (small dataset). When you adjust liked/disliked movies, the system may re-fit quickly to include your new ratings.
- This is a demo-oriented implementation prioritizing clarity and presentation.
159 changes: 159 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from __future__ import annotations

import os
from pathlib import Path
from typing import List

import joblib
import numpy as np
import pandas as pd
import streamlit as st

from src.data_loader import load_movielens, build_popularity
from src.content_model import fit_content_model
from src.cf_model import train_svd
from src.hybrid import hybrid_recommend
from src.posters import get_poster_url

# Presumably the directory for persisted model artifacts.
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
ARTIFACTS = Path("artifacts")
# Directory the app checks for, and loads, the MovieLens ml-latest-small data from.
DATA_DIR = Path("data/ml-latest-small")

# Page-level Streamlit configuration: browser title, icon and wide layout.
st.set_page_config(page_title="Hybrid Movie Recommender", page_icon="🎬", layout="wide")

# Minimal CSS for card-style layout
# The class names below (card, poster, title, meta, score) are the ones
# render_cards emits in its HTML snippets, except "poster", which no visible code uses.
CARD_CSS = """
<style>
.card {
border-radius: 12px;
background: #ffffff10;
padding: 12px;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
height: 100%;
border: 1px solid rgba(255,255,255,0.15);
}
.poster {
width: 100%;
border-radius: 8px;
}
.title {font-weight: 700; margin-top: 8px;}
.meta {opacity: 0.85; font-size: 0.9rem;}
.score {font-size: 0.9rem; margin-top: 6px;}
</style>
"""

# Inject the stylesheet once at import time; unsafe_allow_html is required
# because the payload is a raw <style> element rather than Markdown.
st.markdown(CARD_CSS, unsafe_allow_html=True)

@st.cache_resource(show_spinner=False)
def get_data():
    """Load the MovieLens dataset and its popularity table.

    Returns:
        A ``(dataset, popularity)`` pair: the object returned by
        ``load_movielens(DATA_DIR)`` and the result of ``build_popularity``
        computed from it.  The pair is cached by ``st.cache_resource``.
    """
    dataset = load_movielens(DATA_DIR)
    return dataset, build_popularity(dataset)

@st.cache_resource(show_spinner=False)
def get_models(mld):
    """Fit the content model and the SVD collaborative-filtering model.

    Args:
        mld: Loaded dataset exposing ``movies``, ``tags`` and ``ratings``.

    Returns:
        ``((vectorizer, tfidf, sim), cf)`` — the three content-model
        artifacts from ``fit_content_model`` followed by the wrapper
        returned by ``train_svd``.
    """
    # The content model is fitted first, then the CF model.
    content_parts = fit_content_model(mld.movies, mld.tags)
    fitted_vectorizer, tfidf_matrix, similarity = content_parts
    svd_wrapper = train_svd(mld.ratings)
    return (fitted_vectorizer, tfidf_matrix, similarity), svd_wrapper


def ensure_dataset():
    """Offer a one-click dataset download when ``DATA_DIR`` is missing.

    When the dataset directory does not exist, a warning and a
    "Download Dataset" button are shown.  Pressing the button downloads and
    extracts the archive into ``data/`` and then restarts the script run so
    the freshly extracted data is picked up.  Nothing happens when the
    dataset is already present.
    """
    if DATA_DIR.exists():
        return

    st.warning("Dataset not found. Click the button to download MovieLens ml-latest-small.")
    if st.button("Download Dataset"):
        # Imported lazily so the downloader is only loaded when actually used.
        from scripts.download_movielens import download_and_extract, ML_SMALL_URL

        download_and_extract(ML_SMALL_URL, Path("data"))
        # st.experimental_rerun() is deprecated in favour of st.rerun() and is
        # no longer shipped by newer Streamlit releases.
        st.rerun()


def render_cards(df: pd.DataFrame):
    """Render recommendation rows as poster cards in a five-column grid.

    Args:
        df: One row per recommended movie.  The columns read here are
            ``clean_title``, ``year``, ``genres_list``, ``hybrid_score``,
            ``cf_score`` and ``content_score``.

    Title and genre text comes from the dataset and is interpolated into
    markup rendered with ``unsafe_allow_html=True``, so it is HTML-escaped
    first; otherwise a ``<`` or ``&`` in a title would be parsed as markup.
    """
    import html  # local import: the escape helper is only needed here

    n_columns = 5
    cols = st.columns(n_columns)
    for i, (_, row) in enumerate(df.iterrows()):
        # Cards wrap around the columns left to right.
        with cols[i % n_columns]:
            with st.container(border=False):
                st.markdown('<div class="card">', unsafe_allow_html=True)
                # A missing year is passed to the poster lookup as None.
                year = None if pd.isna(row["year"]) else int(row["year"])
                poster = get_poster_url(row["clean_title"], year)
                st.image(poster, use_column_width=True)
                title = html.escape(str(row["clean_title"]))
                genres = html.escape(", ".join(row["genres_list"]))
                st.markdown(f"<div class=title>{title}</div>", unsafe_allow_html=True)
                st.markdown(f"<div class=meta>{genres}</div>", unsafe_allow_html=True)
                st.markdown(
                    f"<div class=score>Hybrid: {row['hybrid_score']:.3f} | "
                    f"CF: {row['cf_score']:.3f} | "
                    f"Content: {row['content_score']:.3f}</div>",
                    unsafe_allow_html=True,
                )
                st.markdown('</div>', unsafe_allow_html=True)


# Streamlit entry point: makes sure the dataset exists, loads data and models,
# collects the user's input from the sidebar, renders hybrid recommendations
# on demand and shows a small analytics section.
def main():
st.title("🎬 Hybrid Movie Recommender")
st.caption("MovieLens ml-latest-small | SVD + TF-IDF (genres + tags)")

# Offer the download when the dataset is missing and halt this script run
# until the directory exists.
ensure_dataset()
if not DATA_DIR.exists():
st.stop()

# Both loaders are decorated with st.cache_resource.
with st.spinner("Loading data and training models..."):
mld, pop = get_data()
(content_vec, tfidf, sim), cf_model = get_models(mld)

st.sidebar.header("Input")
mode = st.sidebar.radio("Mode", ["Existing User ID", "Pick Likes/Dislikes"])
top_n = st.sidebar.slider("Top N", 5, 20, 10)
alpha_cf = st.sidebar.slider("CF weight", 0.0, 1.0, 0.6, 0.05)
# The content weight is the complement of the CF weight, so the two sum to 1.
alpha_content = 1.0 - alpha_cf

all_movie_ids = mld.movies["movieId"].tolist()

# Defaults for the inputs that the selected mode does not fill in.
liked_ids: List[int] = []
disliked_ids: List[int] = []
user_id = None
existing_user_ids = sorted(mld.ratings["userId"].unique().tolist())

if mode == "Existing User ID":
user_id = st.sidebar.selectbox("User ID", existing_user_ids, index=0)
user_rated = mld.ratings[mld.ratings["userId"] == user_id]["movieId"].tolist()
st.sidebar.caption(f"User has rated {len(user_rated)} movies")
# Movies the user already rated are excluded from the recommendations.
exclude = user_rated
else:
search_titles = mld.movies.sort_values("clean_title")["clean_title"].tolist()
# Only the first 5000 titles (in sorted order) are offered in the pickers.
likes = st.sidebar.multiselect("Liked movies", search_titles[:5000])
dislikes = st.sidebar.multiselect("Disliked movies", search_titles[:5000])
# Map back to ids
# isin() selects every movie whose clean_title matches a picked title, so
# movies sharing a title all contribute their ids.
liked_ids = mld.movies[mld.movies["clean_title"].isin(likes)]["movieId"].tolist()
disliked_ids = mld.movies[mld.movies["clean_title"].isin(dislikes)]["movieId"].tolist()
exclude = liked_ids + disliked_ids

# Scoring only runs when the button is pressed.
if st.sidebar.button("Recommend"):
with st.spinner("Scoring recommendations..."):
recs = hybrid_recommend(
movies=mld.movies,
cf_model=cf_model,
content_sim_matrix=sim,
all_movie_ids=all_movie_ids,
user_id=user_id,
liked_movie_ids=liked_ids,
disliked_movie_ids=disliked_ids,
alpha_cf=alpha_cf,
alpha_content=alpha_content,
exclude_rated=exclude,
top_n=top_n,
)

st.subheader("Top Recommendations")
render_cards(recs)

# Analytics: dataset sizes, the CF model's RMSE and the ten most-rated movies.
st.divider()
st.subheader("Analytics")
col1, col2, col3 = st.columns([1,1,1])
with col1:
st.metric("Ratings count", len(mld.ratings))
st.metric("Users", mld.ratings["userId"].nunique())
st.metric("Movies", len(mld.movies))
with col2:
st.write("Model RMSE (cf):", f"{cf_model.rmse:.4f}")
with col3:
# Sort by rating count first, then by mean rating, both descending.
top_pop = (
mld.movies.merge(pop, on="movieId").sort_values(["pop_count", "pop_mean"], ascending=False).head(10)
)
st.write("Most Rated Movies:")
st.dataframe(top_pop[["clean_title","genres","pop_count","pop_mean"]], hide_index=True, use_container_width=True)


if __name__ == "__main__":
main()
Binary file added movielens_hybrid_recommender.zip
Binary file not shown.
10 changes: 10 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
pandas==2.2.2
numpy==1.26.4
scikit-learn==1.5.2
scikit-surprise==1.1.4
streamlit==1.37.1
requests==2.32.3
plotly==5.23.0
matplotlib==3.9.2
scipy==1.13.1
joblib==1.4.2
44 changes: 44 additions & 0 deletions scripts/download_movielens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
import argparse
import io
import os
import sys
import zipfile
from pathlib import Path

import requests

ML_SMALL_URL = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"


def download_and_extract(url: str, out_dir: Path) -> Path:
    """Download the MovieLens archive from *url* and unpack it into *out_dir*.

    The archive is saved as ``ml-latest-small.zip`` inside *out_dir* (created
    if needed) and extracted from the in-memory response body.

    Returns:
        The path ``out_dir / "ml-latest-small"``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        SystemExit: with code 1 when the expected directory is missing after
            extraction.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    archive_path = out_dir / "ml-latest-small.zip"
    print(f"Downloading {url} -> {archive_path} ...")
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    payload = response.content
    archive_path.write_bytes(payload)

    print("Extracting...")
    # Extraction reads the bytes already in memory rather than the saved file.
    with zipfile.ZipFile(io.BytesIO(payload)) as archive:
        archive.extractall(out_dir)

    extracted = out_dir / "ml-latest-small"
    if extracted.exists():
        print(f"Dataset available at: {extracted}")
        return extracted

    print("Extraction failed: directory not found", file=sys.stderr)
    sys.exit(1)


def main():
    """Command-line entry point: parse ``--out_dir`` and fetch the dataset."""
    cli = argparse.ArgumentParser(description="Download MovieLens ml-latest-small")
    cli.add_argument("--out_dir", type=str, default="data", help="Output directory")
    options = cli.parse_args()

    download_and_extract(ML_SMALL_URL, Path(options.out_dir))


if __name__ == "__main__":
    main()
38 changes: 38 additions & 0 deletions scripts/smoke_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
from pathlib import Path

from src.data_loader import load_movielens
from src.content_model import fit_content_model
from src.cf_model import train_svd
from src.hybrid import hybrid_recommend

DATA_DIR = Path("data/ml-latest-small")

def main():
    """Run an end-to-end smoke test of loading, training and recommending.

    Loads the dataset from ``DATA_DIR``, fits the content and SVD models,
    picks one user with a fixed random seed and prints that user's top five
    hybrid recommendations.
    """
    print("Loading data...")
    data = load_movielens(DATA_DIR)
    print(f"Movies: {len(data.movies)}, Ratings: {len(data.ratings)}, Tags: {len(data.tags)}")

    print("Training content model...")
    similarity = fit_content_model(data.movies, data.tags)[2]

    print("Training SVD model...")
    svd = train_svd(data.ratings)
    print(f"CF RMSE: {svd.rmse:.4f}")

    # Fixed seed so the same user is chosen on every run.
    sampled = data.ratings['userId'].sample(1, random_state=42)
    user_id = int(sampled.iloc[0])
    print(f"Generating recommendations for user {user_id}...")

    candidate_ids = data.movies['movieId'].tolist()
    rated_by_user = data.ratings['userId'] == user_id
    already_rated = data.ratings[rated_by_user]['movieId'].tolist()
    recommendations = hybrid_recommend(
        movies=data.movies,
        cf_model=svd,
        content_sim_matrix=similarity,
        all_movie_ids=candidate_ids,
        user_id=user_id,
        exclude_rated=already_rated,
        top_n=5,
    )
    for _, rec in recommendations.iterrows():
        print(f"- {rec['clean_title']} ({rec['year']}) -> {rec['hybrid_score']:.3f}")


if __name__ == "__main__":
    main()
Empty file added src/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions src/cf_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from __future__ import annotations

from typing import Iterable, List, Tuple

import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse


class SVDModel:
    """Bundle a Surprise SVD algorithm with its trainset and an RMSE value."""

    def __init__(self, model: SVD, trainset):
        self.model, self.trainset = model, trainset
        # Left unset here; callers assign the evaluation error afterwards.
        self.rmse = None


def train_svd(ratings: pd.DataFrame, n_factors: int = 100, n_epochs: int = 20, random_state: int = 42) -> SVDModel:
    """Fit a biased SVD model on an 80/20 split and record its test RMSE.

    Args:
        ratings: Frame with ``userId``, ``movieId`` and ``rating`` columns.
        n_factors: Number of latent factors passed to ``SVD``.
        n_epochs: Number of training epochs passed to ``SVD``.
        random_state: Seed used for both the split and the algorithm.

    Returns:
        An ``SVDModel`` wrapping the fitted algorithm and the training
        portion of the split, with ``rmse`` set to the error on the held-out
        20 %.
    """
    # The rating scale is taken from the observed minimum and maximum.
    observed = ratings["rating"]
    reader = Reader(rating_scale=(observed.min(), observed.max()))
    data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)
    train_part, holdout = train_test_split(data, test_size=0.2, random_state=random_state)

    svd = SVD(
        n_factors=n_factors,
        n_epochs=n_epochs,
        biased=True,
        random_state=random_state,
        verbose=False,
    )
    svd.fit(train_part)
    holdout_error = rmse(svd.test(holdout), verbose=False)

    fitted = SVDModel(svd, train_part)
    fitted.rmse = holdout_error
    return fitted


def predict_for_user(model: SVDModel, user_id: int, candidate_movie_ids: Iterable[int]) -> pd.Series:
    """Estimate *user_id*'s rating for each candidate movie.

    Args:
        model: Wrapper whose ``model`` attribute exposes ``predict``.
        user_id: Raw user id handed to the algorithm.
        candidate_movie_ids: Raw movie ids to score.

    Returns:
        A Series of estimated ratings indexed by integer movie id.
    """
    svd = model.model
    estimates = {}
    for movie_id in candidate_movie_ids:
        # surprise expects raw ids; it will map internally via trainset
        prediction = svd.predict(uid=user_id, iid=movie_id, r_ui=None, verbose=False)
        estimates[int(prediction.iid)] = prediction.est
    return pd.Series(estimates)
57 changes: 57 additions & 0 deletions src/content_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from typing import Tuple, List

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _build_text_corpus(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
# Aggregate tags per movie
tags_agg = (
tags.groupby("movieId")["tag"].apply(lambda s: " ".join(map(str, s.tolist())))
if len(tags) > 0
else pd.Series(dtype=str)
)

movies = movies.copy()
movies["tags_text"] = movies["movieId"].map(tags_agg).fillna("")
movies["genres_text"] = movies["genres"].fillna("").str.replace("|", " ")
movies["text"] = (movies["clean_title"].fillna("") + " " + movies["genres_text"] + " " + movies["tags_text"]).str.lower()
return movies, movies["text"].tolist()


def fit_content_model(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[TfidfVectorizer, np.ndarray, np.ndarray]:
    """Fit a TF-IDF model over the movie text corpus and derive item similarity.

    Args:
        movies: Movie frame passed on to ``_build_text_corpus``.
        tags: Tag frame passed on to ``_build_text_corpus``.

    Returns:
        ``(vectorizer, tfidf, sim)``: the fitted vectorizer, the result of
        ``fit_transform`` on the corpus, and the pairwise cosine similarity
        of that result with itself.
        NOTE(review): the annotation says ``np.ndarray`` for ``tfidf``, but
        ``fit_transform`` presumably returns a sparse matrix — confirm.
    """
    # Only the document list is needed here; the enriched frame is discarded.
    _, documents = _build_text_corpus(movies, tags)
    tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))
    doc_term = tfidf_vectorizer.fit_transform(documents)
    # Item-item cosine similarity
    item_similarity = cosine_similarity(doc_term)
    return tfidf_vectorizer, doc_term, item_similarity


def content_similarity_scores(
    liked_movie_ids: List[int],
    disliked_movie_ids: List[int],
    sim_matrix: np.ndarray,
    all_movie_ids: List[int],
    alpha_like: float = 1.0,
    alpha_dislike: float = 1.0,
) -> pd.Series:
    """Score every movie by its similarity to liked and disliked movies.

    The score is ``alpha_like`` times the mean similarity row of the liked
    movies minus ``alpha_dislike`` times the mean similarity row of the
    disliked movies.  Ids that do not appear in *all_movie_ids* are ignored;
    a side with no usable ids contributes nothing.

    Args:
        liked_movie_ids: Ids of movies the user liked.
        disliked_movie_ids: Ids of movies the user disliked.
        sim_matrix: Similarity matrix indexed by position in *all_movie_ids*.
        all_movie_ids: Movie ids, in the row/column order of *sim_matrix*.
        alpha_like: Weight of the liked-movie term.
        alpha_dislike: Weight of the disliked-movie term.

    Returns:
        A float Series indexed by *all_movie_ids*.
    """
    position_of = dict(zip(all_movie_ids, range(len(all_movie_ids))))

    def _mean_profile(movie_ids):
        # Average similarity row over the known ids, or None if none are known.
        rows = [position_of[m] for m in movie_ids if m in position_of]
        if not rows:
            return None
        return np.mean(sim_matrix[rows, :], axis=0)

    totals = np.zeros(len(all_movie_ids), dtype=float)

    liked_profile = _mean_profile(liked_movie_ids) if liked_movie_ids else None
    if liked_profile is not None:
        totals += alpha_like * liked_profile

    disliked_profile = _mean_profile(disliked_movie_ids) if disliked_movie_ids else None
    if disliked_profile is not None:
        totals -= alpha_dislike * disliked_profile

    return pd.Series(totals, index=all_movie_ids)
Loading