diff --git a/issue-2-ai-skill-evaluator/.gitignore b/issue-2-ai-skill-evaluator/.gitignore new file mode 100644 index 0000000..4e1161d --- /dev/null +++ b/issue-2-ai-skill-evaluator/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.py[cod] +.ipynb_checkpoints/ +.DS_Store +.env +.venv/ +venv/ +artifacts/ +outputs/ +*.log diff --git a/issue-2-ai-skill-evaluator/README.md b/issue-2-ai-skill-evaluator/README.md new file mode 100644 index 0000000..64e4bcf --- /dev/null +++ b/issue-2-ai-skill-evaluator/README.md @@ -0,0 +1,44 @@ +# AI Model for Evaluating 21st Century Skills (Issue #2) + +## Project Overview +This module develops a cost-efficient, fine-tuned open-source Vision Language Model (VLM) +to evaluate student-submitted artifacts (drawings, written responses) against rubric-based +frameworks measuring 21st-century skills: creativity, critical thinking, problem-solving, and agency. + +**Target cost:** < ₹0.10 per evaluation +**Replaces:** Gemini-based evaluation pipeline +**Approach:** Supervised fine-tuning of open-source VLMs (LLaMA-based) using PyTorch + +## Project Structure +```text +data/ -> Rubric schemas and labeled artifact datasets +notebooks/ -> EDA, benchmarking, and training experiments +src/ -> Core source code (data utils, evaluator, fine-tuning pipeline) +``` + +## Getting Started +```bash +pip install -r requirements.txt +``` + +## Rubric Framework +Skills assessed: +- **Creativity** - originality, expression, divergent thinking +- **Critical Thinking** - analysis, evaluation, logical reasoning +- **Problem Solving** - approach, method, outcome quality +- **Agency** - self-direction, initiative, reflection + +## Model Candidates Under Evaluation +| Model | Params | Multimodal | License | +|-------|--------|-----------|---------| +| LLaVA-1.5 | 7B/13B | ✅ | Apache 2.0 | +| InternVL2 | 2B/8B | ✅ | MIT | +| Qwen2-VL | 2B/7B | ✅ | Apache 2.0 | +| PaliGemma | 3B | ✅ | Gemma License | + +## Milestones +- [ ] Dataset preparation and schema design +- [ ] Model 
benchmarking (zero-shot performance) +- [ ] Fine-tuning pipeline setup +- [ ] Cost-efficiency analysis +- [ ] Benchmark against Gemini and human evaluators diff --git a/issue-2-ai-skill-evaluator/data/README.md b/issue-2-ai-skill-evaluator/data/README.md new file mode 100644 index 0000000..562ded2 --- /dev/null +++ b/issue-2-ai-skill-evaluator/data/README.md @@ -0,0 +1,20 @@ +# Data Directory + +This directory stores rubric definitions and labeled datasets used for benchmarking and +fine-tuning rubric-based artifact evaluators. + +## Expected Inputs +- `*.json` rubric files defining the scoring schema for a skill +- Artifact files such as `.png`, `.jpg`, `.jpeg`, `.txt`, or `.md` +- Label tables in `.csv` format linking artifact identifiers to rubric scores + +## Suggested Rubric Schema +Each rubric JSON file should include: +- `rubric_version` +- `skill` +- `description` +- `levels` with `score`, `label`, and `descriptor` +- `artifact_types` +- `evaluation_modalities` + +See [sample_rubric.json](sample_rubric.json) for a starter example. 
diff --git a/issue-2-ai-skill-evaluator/data/sample_rubric.json b/issue-2-ai-skill-evaluator/data/sample_rubric.json new file mode 100644 index 0000000..0558aed --- /dev/null +++ b/issue-2-ai-skill-evaluator/data/sample_rubric.json @@ -0,0 +1,29 @@ +{ + "rubric_version": "1.0", + "skill": "creativity", + "description": "Assesses originality and creative expression in student artifacts", + "levels": [ + { + "score": 1, + "label": "Beginning", + "descriptor": "Work shows minimal originality; heavily relies on given prompts or examples" + }, + { + "score": 2, + "label": "Developing", + "descriptor": "Shows some original elements but largely conventional in approach" + }, + { + "score": 3, + "label": "Proficient", + "descriptor": "Demonstrates clear original thinking with creative connections" + }, + { + "score": 4, + "label": "Exemplary", + "descriptor": "Highly original work showing inventive, divergent thinking throughout" + } + ], + "artifact_types": ["drawing", "written_response", "prototype"], + "evaluation_modalities": ["vision", "text", "multimodal"] +} diff --git a/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb b/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb new file mode 100644 index 0000000..40e274d --- /dev/null +++ b/issue-2-ai-skill-evaluator/notebooks/01_model_benchmarking.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Benchmarking for Skill Evaluation\n", + "\n", + "This notebook is a starter for benchmarking multimodal models on rubric-based artifact evaluation.\n", + "For this project, benchmarking means comparing candidate VLMs on:\n", + "- zero-shot rubric alignment,\n", + "- inference latency,\n", + "- estimated cost per evaluation,\n", + "- practicality for the INR 0.10 target.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Pick a model checkpoint that fits local hardware. 
The example below shows a zero-shot path for LLaVA-style checkpoints using `transformers`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import time\n", + "\n", + "import pandas as pd\n", + "from PIL import Image\n", + "\n", + "# Uncomment when running with a supported checkpoint locally.\n", + "# import torch\n", + "# from transformers import AutoProcessor, LlavaForConditionalGeneration\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load a Test Artifact\n", + "\n", + "Point this to a sample student artifact image before running model inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "artifact_path = Path(\"../data/sample_artifact.png\")\n", + "\n", + "if artifact_path.exists():\n", + " test_image = Image.open(artifact_path).convert(\"RGB\")\n", + " test_image\n", + "else:\n", + " print(f\"Add a sample image at {artifact_path} to run inference.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Zero-Shot Inference Template\n", + "\n", + "Use this block to test a candidate model such as LLaVA-1.5 or InternVL2 on a rubric prompt." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_NAME = \"llava-hf/llava-1.5-7b-hf\"\n", + "PROMPT = \"Evaluate this student artifact for creativity using a 1-4 rubric.\\n\"\n", + "PROMPT += \"Return a score, confidence from 0 to 1, and a short justification.\"\n", + "\n", + "# Example template for local benchmarking.\n", + "# processor = AutoProcessor.from_pretrained(MODEL_NAME)\n", + "# model = LlavaForConditionalGeneration.from_pretrained(\n", + "# MODEL_NAME,\n", + "# torch_dtype=torch.float16,\n", + "# low_cpu_mem_usage=True,\n", + "# )\n", + "#\n", + "# inputs = processor(text=PROMPT, images=test_image, return_tensors=\"pt\")\n", + "# start = time.perf_counter()\n", + "# output = model.generate(**inputs, max_new_tokens=128)\n", + "# inference_seconds = time.perf_counter() - start\n", + "# decoded = processor.batch_decode(output, skip_special_tokens=True)[0]\n", + "# print(decoded)\n", + "# print(f\"Inference time: {inference_seconds:.2f}s\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benchmark Comparison Table\n", + "\n", + "Fill in observed latency and token estimates as you benchmark each model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "benchmark_df = pd.DataFrame(\n", + " [\n", + " {\"model\": \"LLaVA-1.5 7B\", \"params_b\": 7, \"inference_time_s\": None, \"avg_tokens_per_eval\": 350, \"cost_per_1m_tokens_usd\": 0.20},\n", + " {\"model\": \"InternVL2 2B\", \"params_b\": 2, \"inference_time_s\": None, \"avg_tokens_per_eval\": 320, \"cost_per_1m_tokens_usd\": 0.12},\n", + " {\"model\": \"Qwen2-VL 2B\", \"params_b\": 2, \"inference_time_s\": None, \"avg_tokens_per_eval\": 340, \"cost_per_1m_tokens_usd\": 0.15},\n", + " ]\n", + ")\n", + "benchmark_df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cost Calculation\n", + "\n", + "Estimate per-evaluation cost using token pricing assumptions and a USD-to-INR exchange rate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exchange_rate = 83.0 # INR per USD\n", + "\n", + "benchmark_df[\"estimated_cost_inr\"] = (\n", + " benchmark_df[\"cost_per_1m_tokens_usd\"]\n", + " * benchmark_df[\"avg_tokens_per_eval\"]\n", + " / 1_000_000\n", + " * exchange_rate\n", + ")\n", + "\n", + "benchmark_df[[\"model\", \"estimated_cost_inr\"]]\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/issue-2-ai-skill-evaluator/requirements.txt b/issue-2-ai-skill-evaluator/requirements.txt new file mode 100644 index 0000000..b4864ba --- /dev/null +++ b/issue-2-ai-skill-evaluator/requirements.txt @@ -0,0 +1,13 @@ +torch>=2.0.0 +transformers>=4.40.0 +pillow>=10.0.0 +pandas>=2.0.0 +numpy>=1.24.0 +jupyter>=1.0.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +tqdm>=4.65.0 +datasets>=2.14.0 +accelerate>=0.24.0 +peft>=0.6.0 +bitsandbytes>=0.41.0 diff --git 
VALID_ARTIFACT_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".gif", ".txt", ".md"}
# Suffixes read as UTF-8 text; every other valid suffix is handled as an image.
_TEXT_ARTIFACT_SUFFIXES = frozenset({".txt", ".md"})
REQUIRED_RUBRIC_KEYS = {
    "rubric_version",
    "skill",
    "description",
    "levels",
    "artifact_types",
    "evaluation_modalities",
}
# Keys every entry of rubric["levels"] must carry.
_REQUIRED_LEVEL_KEYS = ("score", "label", "descriptor")
# Scores a rubric must define, each exactly once.
_EXPECTED_RUBRIC_SCORES = frozenset({1, 2, 3, 4})
# Columns the labels CSV must provide.
_REQUIRED_LABEL_COLUMNS = frozenset({"artifact_id", "skill", "score"})


def _artifact_type(suffix: str) -> str:
    """Map a lower-cased file suffix to its artifact type label ("text" or "image")."""
    return "text" if suffix in _TEXT_ARTIFACT_SUFFIXES else "image"


def load_artifact(path: str | Path) -> dict[str, Any]:
    """
    Load an artifact from disk and return a structured payload.

    Text artifacts (``.txt``, ``.md``) are read as UTF-8. Every other supported
    suffix is opened with Pillow and converted to RGB.

    Args:
        path: Location of the artifact file.

    Returns:
        A dict with ``artifact_id`` (the file stem), ``type`` ("text" or
        "image"), ``content`` (the text or the RGB image) and ``path``.
        Image payloads additionally carry ``size`` (the image's ``size``).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the suffix is not in ``VALID_ARTIFACT_SUFFIXES``.
    """
    artifact_path = Path(path)
    if not artifact_path.exists():
        raise FileNotFoundError(f"Artifact not found: {artifact_path}")

    suffix = artifact_path.suffix.lower()
    if suffix not in VALID_ARTIFACT_SUFFIXES:
        raise ValueError(f"Unsupported artifact type: {suffix}")

    payload: dict[str, Any] = {
        "artifact_id": artifact_path.stem,
        "type": _artifact_type(suffix),
        "path": str(artifact_path),
    }

    if suffix in _TEXT_ARTIFACT_SUFFIXES:
        payload["content"] = artifact_path.read_text(encoding="utf-8")
        return payload

    # convert() returns an independent, fully loaded image, so the handle opened
    # by Image.open() can be released as soon as the conversion is done.
    with Image.open(artifact_path) as source_image:
        image = source_image.convert("RGB")
    payload["content"] = image
    payload["size"] = image.size
    return payload


def validate_rubric(rubric_dict: dict[str, Any]) -> bool:
    """
    Validate a rubric dictionary against the expected scaffold schema.

    Checks that every key in ``REQUIRED_RUBRIC_KEYS`` is present, that
    ``levels`` is a non-empty list of dicts each holding ``score``, ``label``
    and ``descriptor``, and that the scores are exactly 1, 2, 3 and 4 with no
    repeats.

    Args:
        rubric_dict: Parsed rubric, e.g. the result of ``json.load``.

    Returns:
        ``True`` when the rubric is valid.

    Raises:
        ValueError: On any schema violation.
    """
    if not isinstance(rubric_dict, dict):
        raise ValueError("Rubric must be a dictionary.")

    missing_keys = REQUIRED_RUBRIC_KEYS.difference(rubric_dict)
    if missing_keys:
        raise ValueError(f"Rubric is missing required keys: {sorted(missing_keys)}")

    levels = rubric_dict["levels"]
    if not isinstance(levels, list) or not levels:
        raise ValueError("Rubric levels must be a non-empty list.")

    seen_scores = set()
    for index, level in enumerate(levels):
        if not isinstance(level, dict):
            raise ValueError(f"Rubric level at index {index} must be a dictionary.")
        for key in _REQUIRED_LEVEL_KEYS:
            if key not in level:
                raise ValueError(f"Rubric level is missing '{key}'.")

        score = level["score"]
        # bool is a subclass of int (True == 1), so JSON `true` would otherwise
        # pass as score 1; non-numeric scores would break the set/sort below.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            raise ValueError(f"Rubric score must be a number; received {score!r}.")
        # A repeated score would make one descriptor silently shadow another.
        if score in seen_scores:
            raise ValueError(f"Rubric defines score {score!r} more than once.")
        seen_scores.add(score)

    if seen_scores != _EXPECTED_RUBRIC_SCORES:
        raise ValueError(
            f"Rubric scores must cover {sorted(_EXPECTED_RUBRIC_SCORES)}; received {sorted(seen_scores)}."
        )

    return True


def prepare_dataset(artifacts_dir: str | Path, labels_csv: str | Path) -> pd.DataFrame:
    """
    Convert raw artifacts and labels into a training-ready tabular format.

    The labels CSV is expected to include:
    - artifact_id
    - skill
    - score

    Artifact files are discovered recursively under ``artifacts_dir`` and
    joined to the labels on ``artifact_id`` (the file stem) with an inner
    merge, so unlabeled files and labels without a file are dropped.

    Args:
        artifacts_dir: Directory searched recursively for supported artifacts.
        labels_csv: CSV file holding one label row per artifact and skill.

    Returns:
        The label columns plus ``artifact_path`` and ``artifact_type``, sorted
        by ``skill`` then ``artifact_id`` with a fresh index.

    Raises:
        FileNotFoundError: If either input path does not exist.
        ValueError: If required columns are missing, no supported artifact
            files are found, or no label matches a file.
    """
    artifacts_root = Path(artifacts_dir)
    labels_path = Path(labels_csv)

    if not artifacts_root.exists():
        raise FileNotFoundError(f"Artifacts directory not found: {artifacts_root}")
    if not labels_path.exists():
        raise FileNotFoundError(f"Labels CSV not found: {labels_path}")

    # Read only the header first so a missing column is reported before the
    # full parse.
    header_df = pd.read_csv(labels_path, nrows=0)
    missing_columns = _REQUIRED_LABEL_COLUMNS.difference(header_df.columns)
    if missing_columns:
        raise ValueError(
            f"Labels CSV is missing required columns: {sorted(missing_columns)}"
        )

    # Keep artifact_id as text: type inference would turn "001" into the
    # integer 1, which can never match the string file stem "001".
    labels_df = pd.read_csv(labels_path, dtype={"artifact_id": str})

    artifact_records = [
        {
            "artifact_id": file_path.stem,
            "artifact_path": str(file_path),
            "artifact_type": _artifact_type(file_path.suffix.lower()),
        }
        for file_path in artifacts_root.rglob("*")
        if file_path.is_file() and file_path.suffix.lower() in VALID_ARTIFACT_SUFFIXES
    ]

    artifacts_df = pd.DataFrame(artifact_records)
    if artifacts_df.empty:
        raise ValueError("No supported artifact files were found in the provided directory.")

    dataset_df = labels_df.merge(artifacts_df, on="artifact_id", how="inner")
    if dataset_df.empty:
        raise ValueError("No labeled artifacts matched files in the artifacts directory.")

    return dataset_df.sort_values(["skill", "artifact_id"]).reset_index(drop=True)
def _normalize_skill(name: str) -> str:
    """Return a comparison key for a skill name, ignoring case, spaces and hyphens."""
    return name.strip().lower().replace("-", "_").replace(" ", "_")


@dataclass
class EvaluationResult:
    """Outcome of evaluating one artifact for one skill."""

    artifact_id: str
    skill: str
    score: int  # 1-4 rubric scale
    confidence: float
    justification: str
    model_used: str
    cost_inr: Optional[float] = None


class BaseEvaluator(ABC):
    """
    Abstract base class for all skill evaluators.
    Implement this for each model (Gemini baseline, LLaVA, InternVL, etc.)
    """

    def __init__(self, model_name: str, rubric_path: str):
        """
        Store the model name and load the rubric JSON file.

        Args:
            model_name: Identifier of the underlying model.
            rubric_path: Path to a rubric JSON file; it must hold an object
                with a ``levels`` list.

        Raises:
            ValueError: If the file content is not an object with a
                ``levels`` list.
        """
        self.model_name = model_name
        with open(rubric_path, "r", encoding="utf-8") as f:
            rubric = json.load(f)
        # Fail at construction time instead of with a bare KeyError on first use.
        if not isinstance(rubric, dict) or not isinstance(rubric.get("levels"), list):
            raise ValueError(
                f"Rubric file {rubric_path} must contain a JSON object with a 'levels' list."
            )
        self.rubric = rubric

    @abstractmethod
    def evaluate(self, artifact_path: str, skill: str) -> EvaluationResult:
        """
        Evaluate a student artifact against the rubric for a given skill.

        Args:
            artifact_path: Path to the image/text artifact
            skill: One of ['creativity', 'critical_thinking', 'problem_solving', 'agency']

        Returns:
            EvaluationResult with score, confidence, and justification
        """
        raise NotImplementedError

    def get_rubric_descriptors(self, skill: str) -> dict:
        """
        Return rubric level descriptors for the given skill.

        Args:
            skill: Skill the descriptors are requested for.

        Returns:
            Mapping of each level's ``score`` to its ``descriptor``, in the
            order the levels appear in the rubric.

        Raises:
            ValueError: If the loaded rubric names a different skill. The
                comparison ignores case, spaces and hyphens. A rubric without
                a ``skill`` entry is accepted for any skill.
        """
        rubric_skill = self.rubric.get("skill")
        # The rubric file describes a single skill; pairing its descriptors with
        # another skill's name would silently produce a misleading prompt.
        if rubric_skill is not None and _normalize_skill(str(rubric_skill)) != _normalize_skill(skill):
            raise ValueError(
                f"Rubric defines skill '{rubric_skill}' but descriptors were requested for '{skill}'."
            )
        return {level["score"]: level["descriptor"] for level in self.rubric["levels"]}

    def build_prompt(self, skill: str) -> str:
        """
        Build a structured evaluation prompt from the rubric.

        Args:
            skill: Skill to evaluate; it appears upper-cased in the prompt.

        Returns:
            Prompt text listing one line per rubric score followed by the
            response instructions.

        Raises:
            ValueError: If the loaded rubric names a different skill.
        """
        descriptors = self.get_rubric_descriptors(skill)
        parts = [
            f"Evaluate the student artifact for the skill: {skill.upper()}\n\n",
            "Scoring rubric:\n",
        ]
        parts.extend(f" Score {score}: {desc}\n" for score, desc in descriptors.items())
        parts.append("\nProvide: (1) score (1-4), (2) confidence (0-1), (3) brief justification.")
        return "".join(parts)