diff --git a/README.md b/README.md
index 58055a0..54c8809 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,35 @@
-# -deepshekhardas.github.io
\ No newline at end of file
+# MovieLens Hybrid Recommender (Streamlit)
+
+A hybrid movie recommendation system combining Collaborative Filtering (SVD) with Content-Based similarity (genres + tags), presented with a modern Streamlit UI.
+
+## Features
+- Hybrid scoring: SVD CF + TF-IDF (genres + tags)
+- Enter an existing user ID or pick liked/disliked movies
+- Top-N recommendations with posters, genres, and predicted ratings
+- Optional analytics: top genres, popular movies
+- Optional TMDB poster lookup via `TMDB_API_KEY`; falls back to placeholder
+
+## Setup
+1. Python 3.10+ recommended
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+3. Download MovieLens (ml-latest-small):
+```bash
+python scripts/download_movielens.py --out_dir data
+```
+4. Run the app:
+```bash
+streamlit run app.py
+```
+
+## Environment
+- Optional: export `TMDB_API_KEY` for posters
+```bash
+export TMDB_API_KEY=your_key_here
+```
+
+## Notes
+- The SVD model is trained once on startup and cached for the session (demo simplicity, small dataset). Picking liked/disliked movies does not retrain it: those selections drive the content-based score, while the CF score is requested for a placeholder user id.
+- This is a demo-oriented implementation prioritizing clarity and presentation.
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..fc5132d
--- /dev/null
+++ b/app.py
@@ -0,0 +1,159 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import List
+
+import joblib
+import numpy as np
+import pandas as pd
+import streamlit as st
+
+from src.data_loader import load_movielens, build_popularity
+from src.content_model import fit_content_model
+from src.cf_model import train_svd
+from src.hybrid import hybrid_recommend
+from src.posters import get_poster_url
+
+# Directory name for saved artifacts; it is not referenced again in the visible part of this file.
+ARTIFACTS = Path("artifacts")
+# Location where the extracted MovieLens ml-latest-small files are expected.
+DATA_DIR = Path("data/ml-latest-small")
+
+# Page-level Streamlit settings: browser title, icon and wide layout.
+st.set_page_config(page_title="Hybrid Movie Recommender", page_icon="🎬", layout="wide")
+
+# Minimal CSS for card-style layout
+# NOTE(review): the string below holds only blank lines here; the style rules may have been lost — confirm.
+CARD_CSS = """
+
+"""
+
+# Inject the CSS string into the page as raw HTML.
+st.markdown(CARD_CSS, unsafe_allow_html=True)
+
+@st.cache_resource(show_spinner=False)
+def get_data():
+    """Load the MovieLens tables together with their popularity summary.
+
+    The function is wrapped in ``st.cache_resource`` so the result is reused
+    instead of being rebuilt on every call.
+
+    Returns:
+        A ``(dataset, popularity)`` tuple: the value of
+        ``load_movielens(DATA_DIR)`` and the value of ``build_popularity``
+        applied to it.
+    """
+    dataset = load_movielens(DATA_DIR)
+    return dataset, build_popularity(dataset)
+
+@st.cache_resource(show_spinner=False)
+def get_models(mld):
+    """Fit the content model and the SVD model for the loaded dataset.
+
+    Args:
+        mld: Object exposing ``movies``, ``tags`` and ``ratings`` attributes.
+
+    Returns:
+        ``((vectorizer, tfidf, sim), cf)`` — the three values produced by
+        ``fit_content_model`` and the value produced by ``train_svd``.
+    """
+    text_vectorizer, tfidf_matrix, similarity = fit_content_model(mld.movies, mld.tags)
+    content_bundle = (text_vectorizer, tfidf_matrix, similarity)
+    return content_bundle, train_svd(mld.ratings)
+
+
+def ensure_dataset():
+    """Offer a one-click dataset download when ``DATA_DIR`` is missing.
+
+    Does nothing when the dataset directory already exists. Otherwise a
+    warning and a download button are shown; pressing the button downloads
+    and extracts the archive into ``data`` and then reruns the script.
+    """
+    if DATA_DIR.exists():
+        return
+
+    st.warning("Dataset not found. Click the button to download MovieLens ml-latest-small.")
+    if st.button("Download Dataset"):
+        # Imported lazily: the downloader is only needed on this path.
+        from scripts.download_movielens import download_and_extract, ML_SMALL_URL
+
+        download_and_extract(ML_SMALL_URL, Path("data"))
+        # st.rerun() is the supported replacement for the deprecated
+        # st.experimental_rerun().
+        st.rerun()
+
+
+def render_cards(df: pd.DataFrame):
+ """Render one card per row of *df*: poster, title, genres and the three scores."""
+ # Five columns; cards are dealt into them round-robin via i % 5.
+ cols = st.columns(5)
+ for i, (_, row) in enumerate(df.iterrows()):
+ col = cols[i % 5]
+ with col:
+ with st.container(border=False):
+ # NOTE(review): the markdown string literals in this function span lines and look
+ # like their HTML markup was stripped — restore the original markup before use.
+ st.markdown('
', unsafe_allow_html=True)
+ # Poster lookup by title; a missing (NA) year is passed as None.
+ poster = get_poster_url(row["clean_title"], None if pd.isna(row["year"]) else int(row["year"]))
+ st.image(poster, use_column_width=True)
+ st.markdown(f"
{row['clean_title']}
", unsafe_allow_html=True)
+ st.markdown(f"
{', '.join(row['genres_list'])}
", unsafe_allow_html=True)
+ st.markdown(f"
Hybrid: {row['hybrid_score']:.3f} | CF: {row['cf_score']:.3f} | Content: {row['content_score']:.3f}
", unsafe_allow_html=True)
+ st.markdown('
', unsafe_allow_html=True)
+
+
+def main():
+ """Build the Streamlit page: inputs in the sidebar, recommendations and analytics in the body."""
+ st.title("🎬 Hybrid Movie Recommender")
+ st.caption("MovieLens ml-latest-small | SVD + TF-IDF (genres + tags)")
+
+ # Offer the download, then halt this run while the dataset directory is still missing.
+ ensure_dataset()
+ if not DATA_DIR.exists():
+ st.stop()
+
+ with st.spinner("Loading data and training models..."):
+ mld, pop = get_data()
+ (content_vec, tfidf, sim), cf_model = get_models(mld)
+
+ st.sidebar.header("Input")
+ mode = st.sidebar.radio("Mode", ["Existing User ID", "Pick Likes/Dislikes"])
+ top_n = st.sidebar.slider("Top N", 5, 20, 10)
+ alpha_cf = st.sidebar.slider("CF weight", 0.0, 1.0, 0.6, 0.05)
+ # The content weight is the complement of the CF weight, so the two sum to 1.
+ alpha_content = 1.0 - alpha_cf
+
+ all_movie_ids = mld.movies["movieId"].tolist()
+
+ liked_ids: List[int] = []
+ disliked_ids: List[int] = []
+ user_id = None
+ existing_user_ids = sorted(mld.ratings["userId"].unique().tolist())
+
+ if mode == "Existing User ID":
+ user_id = st.sidebar.selectbox("User ID", existing_user_ids, index=0)
+ user_rated = mld.ratings[mld.ratings["userId"] == user_id]["movieId"].tolist()
+ st.sidebar.caption(f"User has rated {len(user_rated)} movies")
+ # Movies this user already rated are handed to the recommender as the exclusion list.
+ exclude = user_rated
+ else:
+ search_titles = mld.movies.sort_values("clean_title")["clean_title"].tolist()
+ # Only the first 5000 titles of the sorted list are offered in each picker.
+ likes = st.sidebar.multiselect("Liked movies", search_titles[:5000])
+ dislikes = st.sidebar.multiselect("Disliked movies", search_titles[:5000])
+ # Map back to ids
+ # isin() matches on title text, so every movie sharing a chosen title is selected.
+ liked_ids = mld.movies[mld.movies["clean_title"].isin(likes)]["movieId"].tolist()
+ disliked_ids = mld.movies[mld.movies["clean_title"].isin(dislikes)]["movieId"].tolist()
+ exclude = liked_ids + disliked_ids
+
+ if st.sidebar.button("Recommend"):
+ with st.spinner("Scoring recommendations..."):
+ recs = hybrid_recommend(
+ movies=mld.movies,
+ cf_model=cf_model,
+ content_sim_matrix=sim,
+ all_movie_ids=all_movie_ids,
+ user_id=user_id,
+ liked_movie_ids=liked_ids,
+ disliked_movie_ids=disliked_ids,
+ alpha_cf=alpha_cf,
+ alpha_content=alpha_content,
+ exclude_rated=exclude,
+ top_n=top_n,
+ )
+
+ st.subheader("Top Recommendations")
+ render_cards(recs)
+
+ st.divider()
+ st.subheader("Analytics")
+ col1, col2, col3 = st.columns([1,1,1])
+ with col1:
+ st.metric("Ratings count", len(mld.ratings))
+ st.metric("Users", mld.ratings["userId"].nunique())
+ st.metric("Movies", len(mld.movies))
+ with col2:
+ st.write("Model RMSE (cf):", f"{cf_model.rmse:.4f}")
+ with col3:
+ # Ten movies with the most ratings; ties are ordered by mean rating, both descending.
+ top_pop = (
+ mld.movies.merge(pop, on="movieId").sort_values(["pop_count", "pop_mean"], ascending=False).head(10)
+ )
+ st.write("Most Rated Movies:")
+ st.dataframe(top_pop[["clean_title","genres","pop_count","pop_mean"]], hide_index=True, use_container_width=True)
+
+
+# Run the page when the file is executed as a script.
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/movielens_hybrid_recommender.zip b/movielens_hybrid_recommender.zip
new file mode 100644
index 0000000..bc4422b
Binary files /dev/null and b/movielens_hybrid_recommender.zip differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0c35f64
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+pandas==2.2.2
+numpy==1.26.4
+scikit-learn==1.5.2
+scikit-surprise==1.1.4
+streamlit==1.37.1
+requests==2.32.3
+plotly==5.23.0
+matplotlib==3.9.2
+scipy==1.13.1
+joblib==1.4.2
\ No newline at end of file
diff --git a/scripts/download_movielens.py b/scripts/download_movielens.py
new file mode 100644
index 0000000..166e760
--- /dev/null
+++ b/scripts/download_movielens.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+import argparse
+import io
+import os
+import sys
+import zipfile
+from pathlib import Path
+
+import requests
+
+ML_SMALL_URL = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
+
+
+def download_and_extract(url: str, out_dir: Path) -> Path:
+    """Download the MovieLens zip archive and unpack it under *out_dir*.
+
+    The archive is saved as ``out_dir / "ml-latest-small.zip"`` and then
+    extracted into *out_dir*.
+
+    Args:
+        url: Address of the zip archive.
+        out_dir: Directory to download into; created if it does not exist.
+
+    Returns:
+        Path of the extracted ``ml-latest-small`` directory.
+
+    Raises:
+        requests.HTTPError: If the server responds with an error status.
+        FileNotFoundError: If the archive did not produce the expected
+            ``ml-latest-small`` directory.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+    zip_path = out_dir / "ml-latest-small.zip"
+    print(f"Downloading {url} -> {zip_path} ...")
+    response = requests.get(url, timeout=60)
+    response.raise_for_status()
+    zip_path.write_bytes(response.content)
+
+    print("Extracting...")
+    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
+        zf.extractall(out_dir)
+
+    dataset_dir = out_dir / "ml-latest-small"
+    if not dataset_dir.exists():
+        # Raise rather than call sys.exit(): this function is importable, and
+        # terminating the interpreter is the caller's decision. An uncaught
+        # exception still ends a command-line run with a non-zero status.
+        raise FileNotFoundError(f"Extraction failed: directory not found: {dataset_dir}")
+    print(f"Dataset available at: {dataset_dir}")
+    return dataset_dir
+
+
+def main():
+    """Read ``--out_dir`` from the command line and download the dataset there."""
+    cli = argparse.ArgumentParser(description="Download MovieLens ml-latest-small")
+    cli.add_argument("--out_dir", type=str, default="data", help="Output directory")
+    options = cli.parse_args()
+    download_and_extract(ML_SMALL_URL, Path(options.out_dir))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/smoke_test.py b/scripts/smoke_test.py
new file mode 100644
index 0000000..0e07860
--- /dev/null
+++ b/scripts/smoke_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+from pathlib import Path
+
+from src.data_loader import load_movielens
+from src.content_model import fit_content_model
+from src.cf_model import train_svd
+from src.hybrid import hybrid_recommend
+
+DATA_DIR = Path("data/ml-latest-small")
+
+def main():
+    """Run the pipeline end to end: load, fit both models, print five recommendations."""
+    print("Loading data...")
+    data = load_movielens(DATA_DIR)
+    print(f"Movies: {len(data.movies)}, Ratings: {len(data.ratings)}, Tags: {len(data.tags)}")
+
+    print("Training content model...")
+    similarity = fit_content_model(data.movies, data.tags)[2]
+
+    print("Training SVD model...")
+    cf = train_svd(data.ratings)
+    print(f"CF RMSE: {cf.rmse:.4f}")
+
+    # Fixed random_state so the same user is picked on every run.
+    user_id = int(data.ratings["userId"].sample(1, random_state=42).iloc[0])
+    print(f"Generating recommendations for user {user_id}...")
+    already_rated = data.ratings.loc[data.ratings["userId"] == user_id, "movieId"].tolist()
+    recs = hybrid_recommend(
+        movies=data.movies,
+        cf_model=cf,
+        content_sim_matrix=similarity,
+        all_movie_ids=data.movies["movieId"].tolist(),
+        user_id=user_id,
+        exclude_rated=already_rated,
+        top_n=5,
+    )
+    for _, row in recs.iterrows():
+        print(f"- {row['clean_title']} ({row['year']}) -> {row['hybrid_score']:.3f}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/cf_model.py b/src/cf_model.py
new file mode 100644
index 0000000..6c1445d
--- /dev/null
+++ b/src/cf_model.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+from typing import Iterable, List, Tuple
+
+import numpy as np
+import pandas as pd
+from surprise import Dataset, Reader, SVD
+from surprise.model_selection import train_test_split
+from surprise.accuracy import rmse
+
+
+class SVDModel:
+    """Holder for an SVD model, the trainset given alongside it, and an RMSE value."""
+
+    def __init__(self, model: SVD, trainset):
+        """Store *model* and *trainset*; ``rmse`` starts as ``None``."""
+        # rmse is not a constructor argument; it is assigned on the instance afterwards.
+        self.rmse = None
+        self.trainset = trainset
+        self.model = model
+
+
+def train_svd(ratings: pd.DataFrame, n_factors: int = 100, n_epochs: int = 20, random_state: int = 42) -> SVDModel:
+    """Fit a biased SVD on an 80/20 split of *ratings* and record the hold-out RMSE.
+
+    Args:
+        ratings: Frame with ``userId``, ``movieId`` and ``rating`` columns.
+        n_factors: Passed to ``SVD`` as ``n_factors``.
+        n_epochs: Passed to ``SVD`` as ``n_epochs``.
+        random_state: Seed used for both the split and the algorithm.
+
+    Returns:
+        An ``SVDModel`` wrapping the fitted algorithm and the training split,
+        with ``rmse`` set to the error measured on the 20% hold-out.
+    """
+    # The rating scale is taken from the data itself.
+    scale = (ratings["rating"].min(), ratings["rating"].max())
+    data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], Reader(rating_scale=scale))
+    train_part, holdout = train_test_split(data, test_size=0.2, random_state=random_state)
+
+    svd = SVD(n_factors=n_factors, n_epochs=n_epochs, biased=True, random_state=random_state, verbose=False)
+    svd.fit(train_part)
+    holdout_rmse = rmse(svd.test(holdout), verbose=False)
+
+    fitted = SVDModel(svd, train_part)
+    fitted.rmse = holdout_rmse
+    return fitted
+
+
+def predict_for_user(model: SVDModel, user_id: int, candidate_movie_ids: Iterable[int]) -> pd.Series:
+    """Estimate *user_id*'s rating for each candidate movie.
+
+    Args:
+        model: Wrapper whose ``model`` attribute provides ``predict``.
+        user_id: Raw user id handed to ``predict`` as ``uid``.
+        candidate_movie_ids: Raw movie ids to score.
+
+    Returns:
+        Series of estimates keyed by the integer movie id reported on each
+        prediction.
+    """
+    svd = model.model
+    estimates = {}
+    # Raw ids are passed straight through to predict().
+    for movie_id in candidate_movie_ids:
+        prediction = svd.predict(uid=user_id, iid=movie_id, r_ui=None, verbose=False)
+        estimates[int(prediction.iid)] = prediction.est
+    return pd.Series(estimates)
\ No newline at end of file
diff --git a/src/content_model.py b/src/content_model.py
new file mode 100644
index 0000000..2c2369a
--- /dev/null
+++ b/src/content_model.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from typing import Tuple, List
+
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+
+def _build_text_corpus(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
+    """Build one lower-cased text document per movie from title, genres and tags.
+
+    Args:
+        movies: Frame with ``movieId``, ``clean_title`` and ``genres`` columns.
+        tags: Frame with ``movieId`` and ``tag`` columns; may be empty.
+
+    Returns:
+        ``(frame, corpus)`` where ``frame`` is a copy of *movies* with the added
+        columns ``tags_text``, ``genres_text`` and ``text``, and ``corpus`` is
+        the ``text`` column as a list in the same row order. *movies* itself is
+        not modified.
+    """
+    # Aggregate tags per movie into one space-separated string.
+    if len(tags) > 0:
+        tags_agg = tags.groupby("movieId")["tag"].apply(lambda s: " ".join(map(str, s.tolist())))
+    else:
+        tags_agg = pd.Series(dtype=str)
+
+    movies = movies.copy()
+    movies["tags_text"] = movies["movieId"].map(tags_agg).fillna("")
+    # "|" is a regex metacharacter; regex=False pins a literal replacement
+    # regardless of the pandas default for ``regex``.
+    movies["genres_text"] = movies["genres"].fillna("").str.replace("|", " ", regex=False)
+    movies["text"] = (
+        movies["clean_title"].fillna("") + " " + movies["genres_text"] + " " + movies["tags_text"]
+    ).str.lower()
+    return movies, movies["text"].tolist()
+
+
+def fit_content_model(movies: pd.DataFrame, tags: pd.DataFrame) -> Tuple[TfidfVectorizer, np.ndarray, np.ndarray]:
+    """Vectorise the per-movie text with TF-IDF and compute item-item cosine similarity.
+
+    Args:
+        movies: Movie frame passed on to ``_build_text_corpus``.
+        tags: Tag frame passed on to ``_build_text_corpus``.
+
+    Returns:
+        ``(vectorizer, tfidf, sim)``: the fitted vectorizer, the result of
+        ``fit_transform`` on the corpus, and ``cosine_similarity`` of that
+        result with itself.
+    """
+    _, documents = _build_text_corpus(movies, tags)
+    # English stop words removed, at most 5000 features, unigrams and bigrams.
+    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))
+    tfidf = vectorizer.fit_transform(documents)
+    return vectorizer, tfidf, cosine_similarity(tfidf)
+
+
+def content_similarity_scores(
+    liked_movie_ids: List[int],
+    disliked_movie_ids: List[int],
+    sim_matrix: np.ndarray,
+    all_movie_ids: List[int],
+    alpha_like: float = 1.0,
+    alpha_dislike: float = 1.0,
+) -> pd.Series:
+    """Score every movie by its mean similarity to liked minus disliked movies.
+
+    Args:
+        liked_movie_ids: Ids whose mean similarity row is added.
+        disliked_movie_ids: Ids whose mean similarity row is subtracted.
+        sim_matrix: Similarity matrix indexed by position.
+            NOTE(review): assumes row/column order follows ``all_movie_ids`` — confirm at the caller.
+        all_movie_ids: Ids defining both the position lookup and the result index.
+        alpha_like: Multiplier for the liked contribution.
+        alpha_dislike: Multiplier for the disliked contribution.
+
+    Returns:
+        Series of scores indexed by ``all_movie_ids``; ids that are not in
+        ``all_movie_ids`` are ignored, and the scores stay zero when nothing
+        usable is supplied.
+    """
+    position_of = {movie_id: pos for pos, movie_id in enumerate(all_movie_ids)}
+
+    def _mean_rows(movie_ids):
+        # Mean similarity row over the known ids, or None when none are known.
+        rows = [position_of[m] for m in movie_ids if m in position_of]
+        return np.mean(sim_matrix[rows, :], axis=0) if rows else None
+
+    scores = np.zeros(len(all_movie_ids), dtype=float)
+
+    liked_profile = _mean_rows(liked_movie_ids) if liked_movie_ids else None
+    if liked_profile is not None:
+        scores += alpha_like * liked_profile
+
+    disliked_profile = _mean_rows(disliked_movie_ids) if disliked_movie_ids else None
+    if disliked_profile is not None:
+        scores -= alpha_dislike * disliked_profile
+
+    return pd.Series(scores, index=all_movie_ids)
\ No newline at end of file
diff --git a/src/data_loader.py b/src/data_loader.py
new file mode 100644
index 0000000..3734673
--- /dev/null
+++ b/src/data_loader.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Tuple
+
+import pandas as pd
+
+
+@dataclass
+class MovieLensData:
+ """The three MovieLens tables handled together as one value."""
+ # Movie table.
+ movies: pd.DataFrame
+ # Ratings table.
+ ratings: pd.DataFrame
+ # Tags table.
+ tags: pd.DataFrame
+
+
+def load_movielens(dataset_dir: str | Path) -> MovieLensData:
+    """Read the MovieLens CSV files from *dataset_dir* and derive helper columns.
+
+    Added to the movie table:
+      * ``year`` — the four-digit year in parentheses at the end of the title
+        (nullable integer, missing when the title has no such suffix);
+      * ``clean_title`` — the title without that trailing year suffix;
+      * ``genres_list`` — the ``|``-separated genres as a list, without the
+        ``(no genres listed)`` marker.
+
+    Args:
+        dataset_dir: Directory containing ``movies.csv`` and ``ratings.csv``;
+            ``tags.csv`` is optional.
+
+    Returns:
+        ``MovieLensData`` with the movie, rating and tag tables. When
+        ``tags.csv`` is absent the tag table is empty but keeps its columns.
+
+    Raises:
+        FileNotFoundError: If ``movies.csv`` or ``ratings.csv`` is missing.
+    """
+    dataset_path = Path(dataset_dir)
+    movies_path = dataset_path / "movies.csv"
+    ratings_path = dataset_path / "ratings.csv"
+    tags_path = dataset_path / "tags.csv"
+
+    if not (movies_path.exists() and ratings_path.exists()):
+        raise FileNotFoundError(
+            f"Could not find movies.csv/ratings.csv in {dataset_path}. Run downloader or check path."
+        )
+
+    movies = pd.read_csv(movies_path)
+    ratings = pd.read_csv(ratings_path)
+    if tags_path.exists():
+        tags = pd.read_csv(tags_path)
+    else:
+        tags = pd.DataFrame(columns=["userId", "movieId", "tag", "timestamp"])
+
+    # Basic cleanup.
+    # Both patterns target the same trailing "(YYYY)" suffix and tolerate
+    # trailing whitespace, so year and clean_title always agree with each other.
+    # expand=False yields a Series rather than a one-column DataFrame.
+    movies["year"] = movies["title"].str.extract(r"\((\d{4})\)\s*$", expand=False).astype("Int64")
+    movies["clean_title"] = movies["title"].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
+    movies["genres_list"] = movies["genres"].fillna("").apply(
+        lambda g: [x for x in g.split("|") if x != "(no genres listed)"]
+    )
+
+    return MovieLensData(movies=movies, ratings=ratings, tags=tags)
+
+
+def build_popularity(mld: MovieLensData) -> pd.DataFrame:
+    """Summarise ratings per movie.
+
+    Args:
+        mld: Object whose ``ratings`` frame has ``movieId`` and ``rating`` columns.
+
+    Returns:
+        Frame with one row per ``movieId`` and the columns ``pop_count``
+        (number of ratings) and ``pop_mean`` (mean rating).
+    """
+    ratings_by_movie = mld.ratings.groupby("movieId")["rating"]
+    stats = ratings_by_movie.agg(pop_count="count", pop_mean="mean")
+    return stats.reset_index()
\ No newline at end of file
diff --git a/src/hybrid.py b/src/hybrid.py
new file mode 100644
index 0000000..2fcb98d
--- /dev/null
+++ b/src/hybrid.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from .content_model import content_similarity_scores
+from .cf_model import SVDModel, predict_for_user
+
+
+def _minmax(series: pd.Series) -> pd.Series:
+    """Rescale *series* linearly to the range 0..1.
+
+    A series whose minimum equals its maximum cannot be rescaled; every entry
+    is then set to 0.5. The index is preserved in both cases.
+    """
+    lowest, highest = series.min(), series.max()
+    if highest == lowest:
+        return pd.Series(0.5, index=series.index)
+    return (series - lowest) / (highest - lowest)
+
+
+def hybrid_recommend(
+    movies: pd.DataFrame,
+    cf_model: Optional[SVDModel],
+    content_sim_matrix,
+    all_movie_ids: List[int],
+    user_id: Optional[int] = None,
+    liked_movie_ids: Optional[List[int]] = None,
+    disliked_movie_ids: Optional[List[int]] = None,
+    alpha_cf: float = 0.6,
+    alpha_content: float = 0.4,
+    exclude_rated: Optional[List[int]] = None,
+    top_n: int = 10,
+) -> pd.DataFrame:
+    """Rank movies by a weighted blend of collaborative and content scores.
+
+    Both score vectors are min-max normalised before blending:
+    ``hybrid = alpha_cf * cf + alpha_content * content``.
+
+    Args:
+        movies: Movie table with a ``movieId`` column; matching rows are returned.
+        cf_model: Collaborative model, or ``None`` to skip the CF part.
+        content_sim_matrix: Item-item similarity matrix.
+            NOTE(review): assumed to be ordered like ``all_movie_ids`` — confirm at the caller.
+        all_movie_ids: Candidate movie ids; also the index of every score series.
+        user_id: User to predict for; when ``None`` but liked movies are given,
+            the placeholder id ``-1`` is used for the CF prediction.
+        liked_movie_ids: Movies that raise the content score of similar movies.
+        disliked_movie_ids: Movies that lower the content score of similar movies.
+        alpha_cf: Weight of the normalised CF score.
+        alpha_content: Weight of the normalised content score.
+        exclude_rated: Additional movie ids that must not be recommended.
+        top_n: Maximum number of rows to return.
+
+    Returns:
+        Copy of the matching *movies* rows with ``hybrid_score``, ``cf_score``
+        and ``content_score`` columns, sorted by ``hybrid_score`` descending.
+        Liked, disliked and excluded movies never appear, so fewer than
+        ``top_n`` rows are returned when not enough candidates remain.
+    """
+    liked_movie_ids = liked_movie_ids or []
+    disliked_movie_ids = disliked_movie_ids or []
+    excluded = set(exclude_rated or []) | set(liked_movie_ids) | set(disliked_movie_ids)
+
+    # CF scores (if model/user provided); zeros otherwise.
+    cf_scores = pd.Series(0.0, index=all_movie_ids)
+    if cf_model is not None and (user_id is not None or liked_movie_ids):
+        cf_scores = predict_for_user(cf_model, user_id if user_id is not None else -1, all_movie_ids)
+        # Any candidate without a prediction gets the mean predicted score.
+        cf_scores = cf_scores.reindex(all_movie_ids).fillna(cf_scores.mean())
+
+    # Content scores
+    content_scores = content_similarity_scores(
+        liked_movie_ids, disliked_movie_ids, content_sim_matrix, all_movie_ids
+    )
+
+    # Normalize both to a common scale before weighting.
+    hybrid = alpha_cf * _minmax(cf_scores) + alpha_content * _minmax(content_scores)
+
+    # Remove rated/selected movies outright. Merely lowering their score would
+    # let them resurface whenever top_n exceeds the number of other candidates.
+    hybrid = hybrid.drop(index=list(excluded), errors="ignore")
+
+    top_ids = hybrid.sort_values(ascending=False).head(top_n).index.tolist()
+    res = movies[movies["movieId"].isin(top_ids)].copy()
+    res["hybrid_score"] = res["movieId"].map(hybrid)
+    res["cf_score"] = res["movieId"].map(cf_scores)
+    res["content_score"] = res["movieId"].map(content_scores)
+    return res.sort_values("hybrid_score", ascending=False)
\ No newline at end of file
diff --git a/src/posters.py b/src/posters.py
new file mode 100644
index 0000000..408d1fc
--- /dev/null
+++ b/src/posters.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import os
+from typing import Optional
+
+import requests
+
+TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie"
+TMDB_IMG_BASE = "https://image.tmdb.org/t/p/w342"
+PLACEHOLDER = "https://via.placeholder.com/342x513?text=No+Poster"
+
+
+def get_poster_url(title: str, year: Optional[int] = None, api_key: Optional[str] = None) -> str:
+    """Return a poster image URL for *title*, or the placeholder URL.
+
+    Args:
+        title: Movie title used as the search query.
+        year: Optional release year added to the search when truthy.
+        api_key: TMDB API key; falls back to the ``TMDB_API_KEY`` environment
+            variable when not given.
+
+    Returns:
+        ``TMDB_IMG_BASE`` joined with the first search result's ``poster_path``.
+        ``PLACEHOLDER`` is returned when no key is available, the search has no
+        results, the first result has no poster, or any exception occurs.
+    """
+    key = api_key or os.getenv("TMDB_API_KEY")
+    if not key:
+        return PLACEHOLDER
+
+    # Best-effort lookup: any failure below falls back to the placeholder.
+    try:
+        query = {"api_key": key, "query": title}
+        if year:
+            query["year"] = int(year)
+        response = requests.get(TMDB_SEARCH_URL, params=query, timeout=10)
+        response.raise_for_status()
+        hits = response.json().get("results", [])
+        poster_path = hits[0].get("poster_path") if hits else None
+        return f"{TMDB_IMG_BASE}{poster_path}" if poster_path else PLACEHOLDER
+    except Exception:
+        return PLACEHOLDER
\ No newline at end of file