From 23f10d0317aba38b844a397df2c5692c7e795aac Mon Sep 17 00:00:00 2001 From: HONGDAE KIM Date: Sun, 15 Feb 2026 13:01:14 +0900 Subject: [PATCH] feat: normalize analyze input contract across web and cli --- README.md | 68 ++++++++++++++++++++++++ bitnet_tools/analysis.py | 110 ++++++++++++++++++++++++++++++++------- bitnet_tools/cli.py | 10 +++- bitnet_tools/web.py | 49 ++++++++++++++--- tests/test_analysis.py | 37 +++++++++++++ 5 files changed, 244 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index eede2d6..9e915dd 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,74 @@ bitnet-analyze multi-analyze a.csv b.csv --question "비교" --no-cache --out-js --- +## API 입력 계약 요약 (`/api/analyze`) + +웹 API는 확장 가능한 공통 입력 스키마를 사용합니다. + +### 요청 필드 + +- `input_type`: `csv` | `excel` | `document` +- `source_name`: 원본 이름(파일명/시트명 등) +- `normalized_csv_text`: 전처리 완료된 CSV 텍스트 +- `meta`: 입력별 부가 메타데이터(dict) +- `question`: 분석 질문 + +하위호환: 기존 `csv_text`만 보내도 내부에서 `normalized_csv_text`로 승격되어 처리됩니다. + +### 요청 예시 (신규) + +```json +{ + "input_type": "excel", + "source_name": "sales.xlsx#Sheet1", + "normalized_csv_text": "region,amount\nseoul,100\nbusan,120\n", + "meta": { + "sheet": "Sheet1", + "uploaded_by": "analyst" + }, + "question": "지역별 매출을 요약해줘" +} +``` + +### 요청 예시 (레거시 CSV) + +```json +{ + "csv_text": "region,amount\nseoul,100\n", + "question": "핵심 인사이트를 알려줘" +} +``` + +### 정상 응답 예시 (일부) + +```json +{ + "csv_path": "sales.xlsx#Sheet1", + "question": "지역별 매출을 요약해줘", + "summary": {"row_count": 2, "column_count": 2}, + "input": { + "input_type": "excel", + "source_name": "sales.xlsx#Sheet1", + "normalized_csv_text": "region,amount\nseoul,100\nbusan,120\n", + "meta": {"sheet": "Sheet1", "uploaded_by": "analyst"}, + "preprocessing_steps": ["use_normalized_csv_text"] + } +} +``` + +### 오류 응답 예시 + +```json +{ + "error": "analyze payload invalid", + "error_detail": "normalized_csv_text is required", + "input_type": "document", + "preprocessing_stage": "input_validation" +} +``` + +--- + ## 8) GitHub 반영(적용) 절차 로컬에서 문서/설정을 수정한 뒤 아래 순서로 GitHub에 반영합니다. diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py index c817620..684d3a8 100644 --- a/bitnet_tools/analysis.py +++ b/bitnet_tools/analysis.py @@ -8,6 +8,27 @@ from typing import Any +VALID_INPUT_TYPES = {"csv", "excel", "document"} + + +@dataclass +class NormalizedAnalysisInput: + input_type: str + source_name: str + normalized_csv_text: str + meta: dict[str, Any] + preprocessing_steps: list[str] + + def to_dict(self) -> dict[str, Any]: + return { + "input_type": self.input_type, + "source_name": self.source_name, + "normalized_csv_text": self.normalized_csv_text, + "meta": self.meta, + "preprocessing_steps": self.preprocessing_steps, + } + + @dataclass class DataSummary: row_count: int @@ -151,38 +172,87 @@ def build_markdown_report(summary: DataSummary, question: str) -> str: return "\n".join(lines) -def build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any]: - path = Path(csv_path) - if not path.exists(): - raise FileNotFoundError(f"CSV file not found: {path}") - - with path.open("r", encoding="utf-8", newline="") as f: - reader = csv.DictReader(f) - if reader.fieldnames is None: - raise ValueError("CSV header not found") - columns = [str(c) for c in reader.fieldnames] +def normalize_analysis_input(payload: dict[str, Any]) -> NormalizedAnalysisInput: + preprocessing_steps: list[str] = [] + + raw_type = str(payload.get("input_type", "csv")).strip().lower() or "csv" + if raw_type not in VALID_INPUT_TYPES: + raise ValueError(f"unsupported input_type: {raw_type}") + + source_name = str(payload.get("source_name", "")).strip() or "" + + meta = payload.get("meta", {}) + if not isinstance(meta, dict): + meta = {"raw_meta": str(meta)} + preprocessing_steps.append("meta_coerced_to_dict") + + normalized_csv_text = str(payload.get("normalized_csv_text", "")) + if normalized_csv_text.strip(): + preprocessing_steps.append("use_normalized_csv_text") + else: + legacy_csv_text = str(payload.get("csv_text", "")) + if not legacy_csv_text.strip(): + raise ValueError("normalized_csv_text is required") + normalized_csv_text = legacy_csv_text + preprocessing_steps.append("promote_legacy_csv_text") + meta = {**meta, "legacy_csv_text": True} + + return NormalizedAnalysisInput( + input_type=raw_type, + source_name=source_name, + normalized_csv_text=normalized_csv_text, + meta=meta, + preprocessing_steps=preprocessing_steps, + ) - summary = summarize_reader(reader, columns) - return { - "csv_path": str(path), - "question": question, - "summary": summary.to_dict(), - "prompt": build_prompt(summary, question), - } +def build_analysis_payload_from_request( + payload: dict[str, Any], question: str, *, csv_path_override: str | None = None +) -> dict[str, Any]: + normalized_input = normalize_analysis_input(payload) + return build_analysis_payload_from_normalized_input( + normalized_input, + question, + csv_path_override=csv_path_override, + ) -def build_analysis_payload_from_csv_text(csv_text: str, question: str) -> dict[str, Any]: - reader = csv.DictReader(io.StringIO(csv_text)) +def build_analysis_payload_from_normalized_input( + normalized_input: NormalizedAnalysisInput, + question: str, + *, + csv_path_override: str | None = None, +) -> dict[str, Any]: + reader = csv.DictReader(io.StringIO(normalized_input.normalized_csv_text)) if reader.fieldnames is None: raise ValueError("CSV header not found") columns = [str(c) for c in reader.fieldnames] summary = summarize_reader(reader, columns) + csv_path = csv_path_override or normalized_input.source_name return { - "csv_path": "", + "csv_path": csv_path, "question": question, "summary": summary.to_dict(), "prompt": build_prompt(summary, question), + "input": normalized_input.to_dict(), + } + + +def build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any]: + path = Path(csv_path) + if not path.exists(): + raise FileNotFoundError(f"CSV file not found: {path}") + + payload = { + "input_type": "csv", + "source_name": path.name, + "normalized_csv_text": path.read_text(encoding="utf-8"), + "meta": {"csv_path": str(path)}, } + return build_analysis_payload_from_request(payload, question, csv_path_override=str(path)) + + +def build_analysis_payload_from_csv_text(csv_text: str, question: str) -> dict[str, Any]: + return build_analysis_payload_from_request({"csv_text": csv_text}, question) diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index fbc442c..7756a5f 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -6,7 +6,7 @@ import sys from pathlib import Path -from .analysis import DataSummary, build_analysis_payload, build_markdown_report +from .analysis import DataSummary, build_analysis_payload, build_analysis_payload_from_request, build_markdown_report from .doctor import collect_environment from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json from .visualize import create_multi_charts @@ -150,7 +150,13 @@ def main(argv: list[str] | None = None) -> int: return 0 if args.command == "analyze": - payload = build_analysis_payload(args.csv, args.question) + request_payload = { + "input_type": "csv", + "source_name": args.csv.name, + "normalized_csv_text": args.csv.read_text(encoding="utf-8"), + "meta": {"csv_path": str(args.csv)}, + } + payload = build_analysis_payload_from_request(request_payload, args.question, csv_path_override=str(args.csv)) args.out.write_text( json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8" ) diff --git a/bitnet_tools/web.py b/bitnet_tools/web.py index c8cdd53..23d70b0 100644 --- a/bitnet_tools/web.py +++ b/bitnet_tools/web.py @@ -12,7 +12,7 @@ from typing import Any from urllib.parse import urlparse -from .analysis import build_analysis_payload_from_csv_text +from .analysis import build_analysis_payload_from_request from .multi_csv import analyze_multiple_csv from .visualize import create_multi_charts @@ -95,8 +95,20 @@ def run_ollama(model: str, prompt: str) -> str: class Handler(BaseHTTPRequestHandler): - def _error_payload(self, message: str, detail: str | None = None) -> dict[str, str]: - return {"error": message, "error_detail": detail or message} + def _error_payload( + self, + message: str, + detail: str | None = None, + *, + input_type: str | None = None, + preprocessing_stage: str | None = None, + ) -> dict[str, str]: + data: dict[str, str] = {"error": message, "error_detail": detail or message} + if input_type: + data["input_type"] = input_type + if preprocessing_stage: + data["preprocessing_stage"] = preprocessing_stage + return data def _send_json(self, data: dict, status: int = HTTPStatus.OK) -> None: body = json.dumps(data, ensure_ascii=False).encode("utf-8") @@ -143,13 +155,33 @@ def do_POST(self) -> None: try: if route == "/api/analyze": - csv_text = str(payload.get("csv_text", "")) question = str(payload.get("question", "")).strip() - if not csv_text.strip(): - return self._send_json(self._error_payload('csv_text is required'), HTTPStatus.BAD_REQUEST) if not question: question = "이 데이터의 핵심 인사이트를 알려줘" - result = build_analysis_payload_from_csv_text(csv_text, question) + + request_payload = { + "input_type": payload.get("input_type", "csv"), + "source_name": payload.get("source_name", ""), + "normalized_csv_text": payload.get("normalized_csv_text", ""), + "meta": payload.get("meta", {}), + "csv_text": payload.get("csv_text", ""), + } + try: + result = build_analysis_payload_from_request(request_payload, question) + except Exception as exc: + input_type = str(request_payload.get("input_type", "csv") or "csv") + has_normalized = bool(str(request_payload.get("normalized_csv_text", "")).strip()) + has_legacy = bool(str(request_payload.get("csv_text", "")).strip()) + preprocessing_stage = "normalized_csv_text" if has_normalized else "legacy_csv_text" if has_legacy else "input_validation" + return self._send_json( + self._error_payload( + "analyze payload invalid", + str(exc), + input_type=input_type, + preprocessing_stage=preprocessing_stage, + ), + HTTPStatus.BAD_REQUEST, + ) return self._send_json(result) @@ -202,7 +234,8 @@ def do_POST(self) -> None: return self._send_json({"answer": answer}) except Exception as exc: # runtime surface for UI - return self._send_json(self._error_payload('request failed', str(exc)), HTTPStatus.BAD_REQUEST) + input_type = str(payload.get('input_type', 'csv') or 'csv') if isinstance(payload, dict) else 'csv' + return self._send_json(self._error_payload('request failed', str(exc), input_type=input_type, preprocessing_stage='runtime'), HTTPStatus.BAD_REQUEST) self.send_error(HTTPStatus.NOT_FOUND) diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 0ece20a..da8ae62 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -3,6 +3,8 @@ from bitnet_tools.analysis import ( build_analysis_payload, build_analysis_payload_from_csv_text, + build_analysis_payload_from_request, + normalize_analysis_input, summarize_rows, build_markdown_report, ) @@ -33,6 +35,8 @@ def test_build_analysis_payload(tmp_path): assert payload["csv_path"].endswith("sample.csv") assert payload["summary"]["row_count"] == 2 + assert payload["input"]["input_type"] == "csv" + assert payload["input"]["meta"]["csv_path"].endswith("sample.csv") assert "핵심요약 / 근거 / 한계 / 다음행동" in payload["prompt"] @@ -43,6 +47,7 @@ def test_build_analysis_payload_from_csv_text(): assert payload["csv_path"] == "" assert payload["summary"]["row_count"] == 2 + assert payload["input"]["preprocessing_steps"] == ["promote_legacy_csv_text"] def test_streaming_summary_keeps_mixed_type_as_string(tmp_path): @@ -183,3 +188,35 @@ def test_baseline_fixture_summaries_are_stable(tmp_path): missing = build_analysis_payload(root / "missing_heavy.csv", "baseline") assert missing["summary"]["missing_counts"] == {"a": 2, "b": 2, "c": 1} + + +def test_normalize_analysis_input_new_contract(): + normalized = normalize_analysis_input( + { + "input_type": "excel", + "source_name": "sales.xlsx#sheet1", + "normalized_csv_text": "a,b\n1,2\n", + "meta": {"sheet": "sheet1"}, + } + ) + + assert normalized.input_type == "excel" + assert normalized.source_name == "sales.xlsx#sheet1" + assert normalized.preprocessing_steps == ["use_normalized_csv_text"] + + +def test_build_analysis_payload_from_request_with_legacy_csv_text(): + payload = build_analysis_payload_from_request( + {"csv_text": "x,y\n1,2\n", "source_name": "legacy_inline"}, + "질문", + ) + + assert payload["summary"]["row_count"] == 1 + assert payload["input"]["meta"]["legacy_csv_text"] is True + + +def test_normalize_analysis_input_rejects_unsupported_type(): + import pytest + + with pytest.raises(ValueError): + normalize_analysis_input({"input_type": "json", "normalized_csv_text": "a\n1\n"})