diff --git a/issue-2-ai-skill-evaluator/.gitignore b/issue-2-ai-skill-evaluator/.gitignore
new file mode 100644
index 0000000..4e1161d
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/.gitignore
@@ -0,0 +1,10 @@
+__pycache__/
+*.py[cod]
+.ipynb_checkpoints/
+.DS_Store
+.env
+.venv/
+venv/
+artifacts/
+outputs/
+*.log
diff --git a/issue-2-ai-skill-evaluator/README.md b/issue-2-ai-skill-evaluator/README.md
new file mode 100644
index 0000000..64e4bcf
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/README.md
@@ -0,0 +1,44 @@
+# AI Model for Evaluating 21st Century Skills (Issue #2)
+
+## Project Overview
+This module develops a cost-efficient, fine-tuned open-source Vision Language Model (VLM)
+to evaluate student-submitted artifacts (drawings, written responses) against rubric-based
+frameworks measuring 21st-century skills: creativity, critical thinking, problem-solving, and agency.
+
+**Target cost:** < ₹0.10 per evaluation  
+**Replaces:** Gemini-based evaluation pipeline  
+**Approach:** Supervised fine-tuning of open-source VLMs (LLaMA-based) using PyTorch
+
+## Project Structure
+```text
+data/          -> Rubric schemas and labeled artifact datasets
+notebooks/     -> EDA, benchmarking, and training experiments
+src/           -> Core source code (data utils, evaluator, fine-tuning pipeline)
+```
+
+## Getting Started
+```bash
+pip install -r requirements.txt
+```
+
+## Rubric Framework
+Skills assessed:
+- **Creativity** - originality, expression, divergent thinking
+- **Critical Thinking** - analysis, evaluation, logical reasoning
+- **Problem Solving** - approach, method, outcome quality
+- **Agency** - self-direction, initiative, reflection
+
+## Model Candidates Under Evaluation
+| Model | Params | Multimodal | License |
+|-------|--------|-----------|---------|
+| LLaVA-1.5 | 7B/13B | ✅ | Apache 2.0 |
+| InternVL2 | 2B/8B | ✅ | MIT |
+| Qwen2-VL | 2B/7B | ✅ | Apache 2.0 |
+| PaliGemma | 3B | ✅ | Gemma License |
+
+## Milestones
+- [ ] Dataset preparation and schema design
+- [ ] Model benchmarking (zero-shot performance)
+- [ ] Fine-tuning pipeline setup
+- [ ] Cost-efficiency analysis
+- [ ] Benchmark against Gemini and human evaluators
diff --git a/issue-2-ai-skill-evaluator/data/README.md b/issue-2-ai-skill-evaluator/data/README.md
new file mode 100644
index 0000000..562ded2
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/data/README.md
@@ -0,0 +1,20 @@
+# Data Directory
+
+This directory stores rubric definitions and labeled datasets used for benchmarking and
+fine-tuning rubric-based artifact evaluators.
+
+## Expected Inputs
+- `*.json` rubric files defining the scoring schema for a skill
+- Artifact files such as `.png`, `.jpg`, `.jpeg`, `.txt`, or `.md`
+- Label tables in `.csv` format linking artifact identifiers to rubric scores
+
+## Suggested Rubric Schema
+Each rubric JSON file should include:
+- `rubric_version`
+- `skill`
+- `description`
+- `levels` with `score`, `label`, and `descriptor`
+- `artifact_types`
+- `evaluation_modalities`
+
+See [sample_rubric.json](sample_rubric.json) for a starter example.
diff --git a/issue-2-ai-skill-evaluator/data/sample_rubric.json b/issue-2-ai-skill-evaluator/data/sample_rubric.json
new file mode 100644
index 0000000..0558aed
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/data/sample_rubric.json
@@ -0,0 +1,29 @@
+{
+  "rubric_version": "1.0",
+  "skill": "creativity",
+  "description": "Assesses originality and creative expression in student artifacts",
+  "levels": [
+    {
+      "score": 1,
+      "label": "Beginning",
+      "descriptor": "Work shows minimal originality; heavily relies on given prompts or examples"
+    },
+    {
+      "score": 2,
+      "label": "Developing",
+      "descriptor": "Shows some original elements but largely conventional in approach"
+    },
+    {
+      "score": 3,
+      "label": "Proficient",
+      "descriptor": "Demonstrates clear original thinking with creative connections"
+    },
+    {
+      "score": 4,
+      "label": "Exemplary",
+      "descriptor": "Highly original work showing inventive, divergent thinking throughout"
+    }
+  ],
+  "artifact_types": ["drawing", "written_response", "prototype"],
+  "evaluation_modalities": ["vision", "text", "multimodal"]
+}
diff --git a/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb b/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb
new file mode 100644
index 0000000..40e274d
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb
@@ -0,0 +1,169 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Model Benchmarking for Skill Evaluation\n",
+        "\n",
+        "This notebook is a starter for benchmarking multimodal models on rubric-based artifact evaluation.\n",
+        "For this project, benchmarking means comparing candidate VLMs on:\n",
+        "- zero-shot rubric alignment,\n",
+        "- inference latency,\n",
+        "- estimated cost per evaluation,\n",
+        "- practicality for the INR 0.10 target.\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Setup\n",
+        "\n",
+        "Pick a model checkpoint that fits local hardware. The example below shows a zero-shot path for LLaVA-style checkpoints using `transformers`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from pathlib import Path\n",
+        "import time\n",
+        "\n",
+        "import pandas as pd\n",
+        "from PIL import Image\n",
+        "\n",
+        "# Uncomment when running with a supported checkpoint locally.\n",
+        "# import torch\n",
+        "# from transformers import AutoProcessor, LlavaForConditionalGeneration\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Load a Test Artifact\n",
+        "\n",
+        "Point this to a sample student artifact image before running model inference."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "artifact_path = Path(\"../data/sample_artifact.png\")\n",
+        "\n",
+        "if artifact_path.exists():\n",
+        "    test_image = Image.open(artifact_path).convert(\"RGB\")\n",
+        "    test_image\n",
+        "else:\n",
+        "    print(f\"Add a sample image at {artifact_path} to run inference.\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Zero-Shot Inference Template\n",
+        "\n",
+        "Use this block to test a candidate model such as LLaVA-1.5 or InternVL2 on a rubric prompt."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "MODEL_NAME = \"llava-hf/llava-1.5-7b-hf\"\n",
+        "PROMPT = \"Evaluate this student artifact for creativity using a 1-4 rubric.\\n\"\n",
+        "PROMPT += \"Return a score, confidence from 0 to 1, and a short justification.\"\n",
+        "\n",
+        "# Example template for local benchmarking.\n",
+        "# processor = AutoProcessor.from_pretrained(MODEL_NAME)\n",
+        "# model = LlavaForConditionalGeneration.from_pretrained(\n",
+        "#     MODEL_NAME,\n",
+        "#     torch_dtype=torch.float16,\n",
+        "#     low_cpu_mem_usage=True,\n",
+        "# )\n",
+        "#\n",
+        "# inputs = processor(text=PROMPT, images=test_image, return_tensors=\"pt\")\n",
+        "# start = time.perf_counter()\n",
+        "# output = model.generate(**inputs, max_new_tokens=128)\n",
+        "# inference_seconds = time.perf_counter() - start\n",
+        "# decoded = processor.batch_decode(output, skip_special_tokens=True)[0]\n",
+        "# print(decoded)\n",
+        "# print(f\"Inference time: {inference_seconds:.2f}s\")\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Benchmark Comparison Table\n",
+        "\n",
+        "Fill in observed latency and token estimates as you benchmark each model."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "benchmark_df = pd.DataFrame(\n",
+        "    [\n",
+        "        {\"model\": \"LLaVA-1.5 7B\", \"params_b\": 7, \"inference_time_s\": None, \"avg_tokens_per_eval\": 350, \"cost_per_1m_tokens_usd\": 0.20},\n",
+        "        {\"model\": \"InternVL2 2B\", \"params_b\": 2, \"inference_time_s\": None, \"avg_tokens_per_eval\": 320, \"cost_per_1m_tokens_usd\": 0.12},\n",
+        "        {\"model\": \"Qwen2-VL 2B\", \"params_b\": 2, \"inference_time_s\": None, \"avg_tokens_per_eval\": 340, \"cost_per_1m_tokens_usd\": 0.15},\n",
+        "    ]\n",
+        ")\n",
+        "benchmark_df\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Cost Calculation\n",
+        "\n",
+        "Estimate per-evaluation cost using token pricing assumptions and a USD-to-INR exchange rate."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "exchange_rate = 83.0  # INR per USD\n",
+        "\n",
+        "benchmark_df[\"estimated_cost_inr\"] = (\n",
+        "    benchmark_df[\"cost_per_1m_tokens_usd\"]\n",
+        "    * benchmark_df[\"avg_tokens_per_eval\"]\n",
+        "    / 1_000_000\n",
+        "    * exchange_rate\n",
+        ")\n",
+        "\n",
+        "benchmark_df[[\"model\", \"estimated_cost_inr\"]]\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.11"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/issue-2-ai-skill-evaluator/requirements.txt b/issue-2-ai-skill-evaluator/requirements.txt
new file mode 100644
index 0000000..b4864ba
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/requirements.txt
@@ -0,0 +1,13 @@
+torch>=2.0.0
+transformers>=4.40.0
+pillow>=10.0.0
+pandas>=2.0.0
+numpy>=1.24.0
+jupyter>=1.0.0
+matplotlib>=3.7.0
+seaborn>=0.12.0
+tqdm>=4.65.0
+datasets>=2.14.0
+accelerate>=0.24.0
+peft>=0.6.0
+bitsandbytes>=0.41.0
diff --git a/issue-2-ai-skill-evaluator/src/__init__.py b/issue-2-ai-skill-evaluator/src/__init__.py
new file mode 100644
index 0000000..be942b8
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/src/__init__.py
@@ -0,0 +1 @@
+"""Utilities for the AI skill evaluator project."""
diff --git a/issue-2-ai-skill-evaluator/src/data_utils.py b/issue-2-ai-skill-evaluator/src/data_utils.py
new file mode 100644
index 0000000..4dd040f
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/src/data_utils.py
@@ -0,0 +1,127 @@
+"""Data loading and preprocessing helpers for rubric-based evaluation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+from PIL import Image
+
+
+VALID_ARTIFACT_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".gif", ".txt", ".md"}  # matched against suffix.lower()
+REQUIRED_RUBRIC_KEYS = {  # top-level keys validate_rubric() requires in a rubric dict
+    "rubric_version",
+    "skill",
+    "description",
+    "levels",
+    "artifact_types",
+    "evaluation_modalities",
+}
+
+
+def load_artifact(path: str | Path) -> dict[str, Any]:
+    """
+    Load an artifact from disk and return a structured payload.
+
+    Text files are read as UTF-8; images are decoded to RGB with Pillow.
+    Raises FileNotFoundError (missing path) or ValueError (bad suffix).
+    """
+    artifact_path = Path(path)
+    if not artifact_path.exists():
+        raise FileNotFoundError(f"Artifact not found: {artifact_path}")
+    suffix = artifact_path.suffix.lower()
+    if suffix not in VALID_ARTIFACT_SUFFIXES:
+        raise ValueError(f"Unsupported artifact type: {suffix}")
+
+    if suffix in {".txt", ".md"}:
+        return {
+            "artifact_id": artifact_path.stem,
+            "type": "text",
+            "content": artifact_path.read_text(encoding="utf-8"),
+            "path": str(artifact_path),
+        }
+    with Image.open(artifact_path) as source:  # release the file handle promptly
+        image = source.convert("RGB")  # convert() returns a fully loaded copy
+    return {
+        "artifact_id": artifact_path.stem,
+        "type": "image",
+        "content": image,
+        "path": str(artifact_path),
+        "size": image.size,
+    }
+
+
+def validate_rubric(rubric_dict: dict[str, Any]) -> bool:
+    """
+    Check a rubric dictionary against the scaffold schema.
+
+    Returns True when valid; raises ValueError describing the first problem.
+    """
+    absent = sorted(REQUIRED_RUBRIC_KEYS - set(rubric_dict))
+    if absent:
+        raise ValueError(f"Rubric is missing required keys: {absent}")
+
+    levels = rubric_dict["levels"]
+    if not (isinstance(levels, list) and levels):
+        raise ValueError("Rubric levels must be a non-empty list.")
+    required, found = {1, 2, 3, 4}, set()
+    for entry in levels:
+        lacking = next((k for k in ("score", "label", "descriptor") if k not in entry), None)
+        if lacking is not None:
+            raise ValueError(f"Rubric level is missing '{lacking}'.")
+        found.add(entry["score"])
+    if found != required:
+        raise ValueError(
+            f"Rubric scores must cover {sorted(required)}; received {sorted(found)}."
+        )
+    return True
+
+
+def prepare_dataset(artifacts_dir: str | Path, labels_csv: str | Path) -> pd.DataFrame:
+    """
+    Join a labels CSV with the artifact files found under a directory.
+
+    The labels CSV must include the columns artifact_id, skill and score.
+    Artifact files are discovered recursively and matched to labels by file
+    stem; the inner join drops labels without a file and files without a label.
+
+    Returns the joined rows sorted by skill then artifact_id, with the added
+    columns artifact_path and artifact_type ("text" or "image").
+
+    Raises FileNotFoundError if either input path is missing, and ValueError
+    if required columns are absent, no supported files exist, or nothing matches.
+    """
+    artifacts_root = Path(artifacts_dir)
+    labels_path = Path(labels_csv)
+    if not artifacts_root.exists():
+        raise FileNotFoundError(f"Artifacts directory not found: {artifacts_root}")
+    if not labels_path.exists():
+        raise FileNotFoundError(f"Labels CSV not found: {labels_path}")
+
+    # File stems are always strings, so read artifact_id as text: inferred integer
+    # ids (e.g. "001" -> 1) would otherwise make the merge below fail on dtype.
+    labels_df = pd.read_csv(labels_path, dtype={"artifact_id": str})
+    missing_columns = {"artifact_id", "skill", "score"}.difference(labels_df.columns)
+    if missing_columns:
+        raise ValueError(
+            f"Labels CSV is missing required columns: {sorted(missing_columns)}"
+        )
+
+    artifact_records = [
+        {
+            "artifact_id": file_path.stem,
+            "artifact_path": str(file_path),
+            "artifact_type": "text" if file_path.suffix.lower() in {".txt", ".md"} else "image",
+        }
+        for file_path in artifacts_root.rglob("*")
+        if file_path.is_file() and file_path.suffix.lower() in VALID_ARTIFACT_SUFFIXES
+    ]
+    if not artifact_records:
+        raise ValueError("No supported artifact files were found in the provided directory.")
+
+    dataset_df = labels_df.merge(pd.DataFrame(artifact_records), on="artifact_id", how="inner")
+    if dataset_df.empty:
+        raise ValueError("No labeled artifacts matched files in the artifacts directory.")
+
+    return dataset_df.sort_values(["skill", "artifact_id"]).reset_index(drop=True)
diff --git a/issue-2-ai-skill-evaluator/src/evaluator.py b/issue-2-ai-skill-evaluator/src/evaluator.py
new file mode 100644
index 0000000..ceb7cdf
--- /dev/null
+++ b/issue-2-ai-skill-evaluator/src/evaluator.py
@@ -0,0 +1,60 @@
+"""
+Base evaluator class for rubric-based artifact assessment.
+Designed to be extended by model-specific implementations.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+import json
+
+
+@dataclass
+class EvaluationResult:  # value returned by BaseEvaluator.evaluate()
+    artifact_id: str
+    skill: str
+    score: int  # 1-4 rubric scale
+    confidence: float  # build_prompt() asks the model for a 0-1 value
+    justification: str
+    model_used: str
+    cost_inr: Optional[float] = None  # presumably Indian rupees (per name); defaults to None
+
+
+class BaseEvaluator(ABC):
+    """
+    Shared interface for rubric-driven skill evaluators.
+    Subclass it once per model backend (Gemini baseline, LLaVA, InternVL, etc.)
+    """
+
+    def __init__(self, model_name: str, rubric_path: str):
+        self.model_name = model_name
+        with open(rubric_path, "r", encoding="utf-8") as rubric_file:
+            self.rubric = json.load(rubric_file)
+
+    @abstractmethod
+    def evaluate(self, artifact_path: str, skill: str) -> EvaluationResult:
+        """
+        Score one student artifact against the rubric for the requested skill.
+
+        Args:
+            artifact_path: Location of the image/text artifact
+            skill: One of ['creativity', 'critical_thinking', 'problem_solving', 'agency']
+
+        Returns:
+            EvaluationResult carrying score, confidence, and justification
+        """
+        raise NotImplementedError
+
+    def get_rubric_descriptors(self, skill: str) -> dict:
+        """Map each rubric score to its descriptor text."""
+        # NOTE: `skill` is not read; descriptors come from the single loaded rubric.
+        levels = self.rubric["levels"]
+        return {entry["score"]: entry["descriptor"] for entry in levels}
+
+    def build_prompt(self, skill: str) -> str:
+        """Assemble the structured evaluation prompt from the rubric."""
+        levels = self.get_rubric_descriptors(skill).items()
+        header = f"Evaluate the student artifact for the skill: {skill.upper()}\n\n"
+        body = "".join(f"  Score {score}: {desc}\n" for score, desc in levels)
+        footer = "\nProvide: (1) score (1-4), (2) confidence (0-1), (3) brief justification."
+        return header + "Scoring rubric:\n" + body + footer