Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,74 @@ bitnet-analyze multi-analyze a.csv b.csv --question "비교" --no-cache --out-js

---

## API 입력 계약 요약 (`/api/analyze`)

웹 API는 확장 가능한 공통 입력 스키마를 사용합니다.

### 요청 필드

- `input_type`: `csv` | `excel` | `document` (생략하거나 비워 두면 `csv`로 처리)
- `source_name`: 원본 이름(파일명/시트명 등, 생략 시 `<inline_csv>`)
- `normalized_csv_text`: 전처리 완료된 CSV 텍스트
- `meta`: 입력별 부가 메타데이터(dict)
- `question`: 분석 질문(생략 시 기본 질문 사용)

하위호환: 기존 `csv_text`만 보내도 내부에서 `normalized_csv_text`로 승격되어 처리됩니다.

### 요청 예시 (신규)

```json
{
"input_type": "excel",
"source_name": "sales.xlsx#Sheet1",
"normalized_csv_text": "region,amount\nseoul,100\nbusan,120\n",
"meta": {
"sheet": "Sheet1",
"uploaded_by": "analyst"
},
"question": "지역별 매출을 요약해줘"
}
```

### 요청 예시 (레거시 CSV)

```json
{
"csv_text": "region,amount\nseoul,100\n",
"question": "핵심 인사이트를 알려줘"
}
```

### 정상 응답 예시 (일부)

```json
{
"csv_path": "sales.xlsx#Sheet1",
"question": "지역별 매출을 요약해줘",
"summary": {"row_count": 2, "column_count": 2},
"input": {
"input_type": "excel",
"source_name": "sales.xlsx#Sheet1",
"normalized_csv_text": "region,amount\nseoul,100\nbusan,120\n",
"meta": {"sheet": "Sheet1", "uploaded_by": "analyst"},
"preprocessing_steps": ["use_normalized_csv_text"]
}
}
```

### 오류 응답 예시

```json
{
"error": "analyze payload invalid",
"error_detail": "normalized_csv_text is required",
"input_type": "document",
"preprocessing_stage": "input_validation"
}
```

---

## 8) GitHub 반영(적용) 절차

로컬에서 문서/설정을 수정한 뒤 아래 순서로 GitHub에 반영합니다.
Expand Down
110 changes: 90 additions & 20 deletions bitnet_tools/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,27 @@
from typing import Any


VALID_INPUT_TYPES = {"csv", "excel", "document"}


@dataclass
class NormalizedAnalysisInput:
    """One analysis request after it has been reduced to the common input form.

    Bundles the CSV text to analyze with its input kind, a name for where it
    came from, caller-supplied metadata, and the names of the preprocessing
    steps that were recorded while normalizing it.
    """

    input_type: str
    source_name: str
    normalized_csv_text: str
    meta: dict[str, Any]
    preprocessing_steps: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict keyed by field name.

        ``meta`` and ``preprocessing_steps`` are returned as the same objects
        the instance holds, not as copies.
        """
        field_names = (
            "input_type",
            "source_name",
            "normalized_csv_text",
            "meta",
            "preprocessing_steps",
        )
        return {name: getattr(self, name) for name in field_names}


@dataclass
class DataSummary:
row_count: int
Expand Down Expand Up @@ -151,38 +172,87 @@ def build_markdown_report(summary: DataSummary, question: str) -> str:
return "\n".join(lines)


def build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any]:
path = Path(csv_path)
if not path.exists():
raise FileNotFoundError(f"CSV file not found: {path}")

with path.open("r", encoding="utf-8", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is None:
raise ValueError("CSV header not found")
columns = [str(c) for c in reader.fieldnames]
def _optional_text(value: Any) -> str:
    """Return ``value`` as text, mapping ``None`` (JSON null) to ``""``."""
    return "" if value is None else str(value)


def normalize_analysis_input(payload: dict[str, Any]) -> NormalizedAnalysisInput:
    """Validate a raw analyze request and convert it to the common input form.

    Recognised keys of ``payload``: ``input_type``, ``source_name``,
    ``normalized_csv_text``, ``meta`` and the legacy ``csv_text``. A key that
    is absent and a key whose value is ``None`` are treated the same way.

    Args:
        payload: Raw request mapping, e.g. a decoded JSON body.

    Returns:
        The normalized input, including the list of preprocessing steps that
        were applied.

    Raises:
        ValueError: If ``input_type`` is not one of ``VALID_INPUT_TYPES``, or
            if neither ``normalized_csv_text`` nor ``csv_text`` contains
            non-blank text.
    """
    preprocessing_steps: list[str] = []

    # Resolve None/blank to the "csv" default *before* validating. Coercing
    # with str() first would turn a JSON null into "none" and reject it.
    raw_type = _optional_text(payload.get("input_type")).strip().lower() or "csv"
    if raw_type not in VALID_INPUT_TYPES:
        raise ValueError(f"unsupported input_type: {raw_type}")

    source_name = _optional_text(payload.get("source_name")).strip() or "<inline_csv>"

    meta = payload.get("meta")
    if meta is None:
        # A null meta means "no metadata", not a value that needs coercing.
        meta = {}
    elif not isinstance(meta, dict):
        meta = {"raw_meta": str(meta)}
        preprocessing_steps.append("meta_coerced_to_dict")

    # A null here must count as missing; str(None) would yield the literal
    # "None", be accepted as CSV text, and bypass the legacy fallback below.
    normalized_csv_text = _optional_text(payload.get("normalized_csv_text"))
    if normalized_csv_text.strip():
        preprocessing_steps.append("use_normalized_csv_text")
    else:
        legacy_csv_text = _optional_text(payload.get("csv_text"))
        if not legacy_csv_text.strip():
            raise ValueError("normalized_csv_text is required")
        normalized_csv_text = legacy_csv_text
        preprocessing_steps.append("promote_legacy_csv_text")
        # Build a new dict so the caller's meta mapping is not mutated.
        meta = {**meta, "legacy_csv_text": True}

    return NormalizedAnalysisInput(
        input_type=raw_type,
        source_name=source_name,
        normalized_csv_text=normalized_csv_text,
        meta=meta,
        preprocessing_steps=preprocessing_steps,
    )

summary = summarize_reader(reader, columns)

return {
"csv_path": str(path),
"question": question,
"summary": summary.to_dict(),
"prompt": build_prompt(summary, question),
}
def build_analysis_payload_from_request(
    payload: dict[str, Any], question: str, *, csv_path_override: str | None = None
) -> dict[str, Any]:
    """Normalize a raw request ``payload`` and build its analysis payload.

    The payload is passed through ``normalize_analysis_input`` and the result
    is handed, together with ``question`` and ``csv_path_override``, to
    ``build_analysis_payload_from_normalized_input``. Errors raised by either
    step propagate to the caller.
    """
    return build_analysis_payload_from_normalized_input(
        normalize_analysis_input(payload),
        question,
        csv_path_override=csv_path_override,
    )


def build_analysis_payload_from_csv_text(csv_text: str, question: str) -> dict[str, Any]:
reader = csv.DictReader(io.StringIO(csv_text))
def build_analysis_payload_from_normalized_input(
    normalized_input: NormalizedAnalysisInput,
    question: str,
    *,
    csv_path_override: str | None = None,
) -> dict[str, Any]:
    """Summarize already-normalized CSV text and assemble the analysis payload.

    Args:
        normalized_input: Normalized request whose ``normalized_csv_text`` is
            parsed as CSV with a header row.
        question: Analysis question, echoed back and used to build the prompt.
        csv_path_override: Value to report as ``csv_path``. When it is ``None``
            or empty, ``normalized_input.source_name`` is reported instead.

    Returns:
        A dict with the keys ``csv_path``, ``question``, ``summary``,
        ``prompt`` and ``input``.

    Raises:
        ValueError: If the CSV text yields no header row.
    """
    reader = csv.DictReader(io.StringIO(normalized_input.normalized_csv_text))
    if reader.fieldnames is None:
        raise ValueError("CSV header not found")

    columns = [str(c) for c in reader.fieldnames]
    summary = summarize_reader(reader, columns)

    return {
        # Exactly one "csv_path" entry: a second literal with the same key
        # would be silently overwritten by the later one.
        "csv_path": csv_path_override or normalized_input.source_name,
        "question": question,
        "summary": summary.to_dict(),
        "prompt": build_prompt(summary, question),
        "input": normalized_input.to_dict(),
    }


def build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any]:
path = Path(csv_path)
if not path.exists():
raise FileNotFoundError(f"CSV file not found: {path}")

payload = {
"input_type": "csv",
"source_name": path.name,
"normalized_csv_text": path.read_text(encoding="utf-8"),
"meta": {"csv_path": str(path)},
}
return build_analysis_payload_from_request(payload, question, csv_path_override=str(path))


def build_analysis_payload_from_csv_text(csv_text: str, question: str) -> dict[str, Any]:
    """Build an analysis payload from raw CSV text via the legacy ``csv_text`` field."""
    legacy_request = {"csv_text": csv_text}
    return build_analysis_payload_from_request(legacy_request, question)
10 changes: 8 additions & 2 deletions bitnet_tools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
from pathlib import Path

from .analysis import DataSummary, build_analysis_payload, build_markdown_report
from .analysis import DataSummary, build_analysis_payload, build_analysis_payload_from_request, build_markdown_report
from .doctor import collect_environment
from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json
from .visualize import create_multi_charts
Expand Down Expand Up @@ -150,7 +150,13 @@ def main(argv: list[str] | None = None) -> int:
return 0

if args.command == "analyze":
payload = build_analysis_payload(args.csv, args.question)
request_payload = {
"input_type": "csv",
"source_name": args.csv.name,
"normalized_csv_text": args.csv.read_text(encoding="utf-8"),
"meta": {"csv_path": str(args.csv)},
}
payload = build_analysis_payload_from_request(request_payload, args.question, csv_path_override=str(args.csv))
args.out.write_text(
json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
)
Expand Down
49 changes: 41 additions & 8 deletions bitnet_tools/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Any
from urllib.parse import urlparse

from .analysis import build_analysis_payload_from_csv_text
from .analysis import build_analysis_payload_from_request
from .multi_csv import analyze_multiple_csv
from .visualize import create_multi_charts

Expand Down Expand Up @@ -95,8 +95,20 @@ def run_ollama(model: str, prompt: str) -> str:


class Handler(BaseHTTPRequestHandler):
def _error_payload(self, message: str, detail: str | None = None) -> dict[str, str]:
return {"error": message, "error_detail": detail or message}
def _error_payload(
self,
message: str,
detail: str | None = None,
*,
input_type: str | None = None,
preprocessing_stage: str | None = None,
) -> dict[str, str]:
data: dict[str, str] = {"error": message, "error_detail": detail or message}
if input_type:
data["input_type"] = input_type
if preprocessing_stage:
data["preprocessing_stage"] = preprocessing_stage
return data

def _send_json(self, data: dict, status: int = HTTPStatus.OK) -> None:
body = json.dumps(data, ensure_ascii=False).encode("utf-8")
Expand Down Expand Up @@ -143,13 +155,33 @@ def do_POST(self) -> None:

try:
if route == "/api/analyze":
csv_text = str(payload.get("csv_text", ""))
question = str(payload.get("question", "")).strip()
if not csv_text.strip():
return self._send_json(self._error_payload('csv_text is required'), HTTPStatus.BAD_REQUEST)
if not question:
question = "이 데이터의 핵심 인사이트를 알려줘"
result = build_analysis_payload_from_csv_text(csv_text, question)

request_payload = {
"input_type": payload.get("input_type", "csv"),
"source_name": payload.get("source_name", "<inline_csv>"),
"normalized_csv_text": payload.get("normalized_csv_text", ""),
"meta": payload.get("meta", {}),
"csv_text": payload.get("csv_text", ""),
}
try:
result = build_analysis_payload_from_request(request_payload, question)
except Exception as exc:
input_type = str(request_payload.get("input_type", "csv") or "csv")
has_normalized = bool(str(request_payload.get("normalized_csv_text", "")).strip())
has_legacy = bool(str(request_payload.get("csv_text", "")).strip())
preprocessing_stage = "normalized_csv_text" if has_normalized else "legacy_csv_text" if has_legacy else "input_validation"
return self._send_json(
self._error_payload(
"analyze payload invalid",
str(exc),
input_type=input_type,
preprocessing_stage=preprocessing_stage,
),
HTTPStatus.BAD_REQUEST,
)
return self._send_json(result)


Expand Down Expand Up @@ -202,7 +234,8 @@ def do_POST(self) -> None:
return self._send_json({"answer": answer})

except Exception as exc: # runtime surface for UI
return self._send_json(self._error_payload('request failed', str(exc)), HTTPStatus.BAD_REQUEST)
input_type = str(payload.get('input_type', 'csv') or 'csv') if isinstance(payload, dict) else 'csv'
return self._send_json(self._error_payload('request failed', str(exc), input_type=input_type, preprocessing_stage='runtime'), HTTPStatus.BAD_REQUEST)

self.send_error(HTTPStatus.NOT_FOUND)

Expand Down
37 changes: 37 additions & 0 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from bitnet_tools.analysis import (
build_analysis_payload,
build_analysis_payload_from_csv_text,
build_analysis_payload_from_request,
normalize_analysis_input,
summarize_rows,
build_markdown_report,
)
Expand Down Expand Up @@ -33,6 +35,8 @@ def test_build_analysis_payload(tmp_path):

assert payload["csv_path"].endswith("sample.csv")
assert payload["summary"]["row_count"] == 2
assert payload["input"]["input_type"] == "csv"
assert payload["input"]["meta"]["csv_path"].endswith("sample.csv")
assert "핵심요약 / 근거 / 한계 / 다음행동" in payload["prompt"]


Expand All @@ -43,6 +47,7 @@ def test_build_analysis_payload_from_csv_text():

assert payload["csv_path"] == "<inline_csv>"
assert payload["summary"]["row_count"] == 2
assert payload["input"]["preprocessing_steps"] == ["promote_legacy_csv_text"]


def test_streaming_summary_keeps_mixed_type_as_string(tmp_path):
Expand Down Expand Up @@ -183,3 +188,35 @@ def test_baseline_fixture_summaries_are_stable(tmp_path):

missing = build_analysis_payload(root / "missing_heavy.csv", "baseline")
assert missing["summary"]["missing_counts"] == {"a": 2, "b": 2, "c": 1}


def test_normalize_analysis_input_new_contract():
    # New-contract request: the normalized CSV text is supplied directly.
    request = {
        "input_type": "excel",
        "source_name": "sales.xlsx#sheet1",
        "normalized_csv_text": "a,b\n1,2\n",
        "meta": {"sheet": "sheet1"},
    }

    result = normalize_analysis_input(request)

    assert (result.input_type, result.source_name) == ("excel", "sales.xlsx#sheet1")
    assert result.preprocessing_steps == ["use_normalized_csv_text"]


def test_build_analysis_payload_from_request_with_legacy_csv_text():
    # Only the legacy ``csv_text`` field carries data in this request.
    legacy_request = {"csv_text": "x,y\n1,2\n", "source_name": "legacy_inline"}

    result = build_analysis_payload_from_request(legacy_request, "질문")

    assert result["summary"]["row_count"] == 1
    assert result["input"]["meta"]["legacy_csv_text"] is True


def test_normalize_analysis_input_rejects_unsupported_type():
    import pytest

    # Match on the message so the test cannot pass on a different ValueError
    # (for example the "normalized_csv_text is required" one).
    with pytest.raises(ValueError, match="unsupported input_type"):
        normalize_analysis_input({"input_type": "json", "normalized_csv_text": "a\n1\n"})