diff --git a/README.md b/README.md index db6fca6..f5cf342 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## 0) 현재 완성도 빠른 진단 -현 시점 기준 기능 완성도(실사용 관점): **약 94%** +현 시점 기준 기능 완성도(실사용 관점): **약 95%** - 완료 - CSV 기초 요약(행/열/결측/숫자 통계) @@ -15,6 +15,7 @@ - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) - 컬럼별 결측/고유/상위값 비율 산출 - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) + - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar, matplotlib 설치 시) - 브라우저 UI(`bitnet-analyze ui`) - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** - 남은 과제 @@ -219,7 +220,7 @@ bitnet-analyze doctor --model bitnet:latest bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md # 8) 다중 CSV 통합 분석(JSON+MD+코드가이드) -bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --group-column 시도명 --target-column 세차유형 --out-json multi.json --out-report multi.md +bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --group-column 시도명 --target-column 세차유형 --charts-dir charts --out-json multi.json --out-report multi.md ``` --- diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index c14e299..b5605c7 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -9,6 +9,7 @@ from .analysis import DataSummary, build_analysis_payload, build_markdown_report from .doctor import collect_environment from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json +from .visualize import create_multi_charts from .web import serve @@ -72,6 +73,12 @@ def _build_parser() -> argparse.ArgumentParser: default=Path("multi_analysis_report.md"), help="Where to store multi CSV markdown report", ) + multi_parser.add_argument( + "--charts-dir", + type=Path, + default=None, + help="Optional directory to save visualization charts", + ) report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV") report_parser.add_argument("csv", type=Path, help="Input CSV path") @@ -118,6 +125,12 @@ def main(argv: 
# Path separators plus the characters Windows rejects in file names. Column
# headers come straight from user CSV files, so they are neutralised before
# being embedded in an output file name.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: "_" for ch in '<>:"/\\|?*' + "".join(map(chr, range(32)))}
)


def _safe_name(text: str) -> str:
    """Return *text* with file-name-unsafe characters replaced by ``_``.

    Names that contain none of those characters are returned unchanged, so
    chart file names for ordinary column headers stay the same.
    """
    return text.translate(_UNSAFE_FILENAME_CHARS)


def _is_numeric_column(rows: list[dict[str, str]], col: str) -> bool:
    """Return True when *col* holds only numbers and at least one is finite.

    Blank cells are ignored. Any non-blank cell that ``float()`` cannot parse
    makes the column non-numeric. ``nan``/``inf`` cells parse successfully but
    are treated like blanks, because they cannot be plotted.
    """
    import math

    seen = 0
    for row in rows:
        raw = (row.get(col) or "").strip()
        if not raw:
            continue
        try:
            value = float(raw)
        except ValueError:
            return False
        if math.isfinite(value):
            seen += 1
    return seen > 0


def _numeric_values(rows: list[dict[str, str]], col: str) -> list[float]:
    """Return the finite float values of *col*, skipping blank cells.

    Intended for columns accepted by ``_is_numeric_column``; a non-numeric
    cell raises ``ValueError``.
    """
    import math

    values = []
    for row in rows:
        raw = (row.get(col) or "").strip()
        if not raw:
            continue
        value = float(raw)
        if math.isfinite(value):
            values.append(value)
    return values


def _top_values(
    rows: list[dict[str, str]], col: str, limit: int = 10
) -> list[tuple[str, int]]:
    """Return up to *limit* ``(value, count)`` pairs of *col*, most frequent first.

    Blank cells are ignored; values with equal counts keep first-seen order.
    """
    from collections import Counter

    cells = ((row.get(col) or "").strip() for row in rows)
    return Counter(cell for cell in cells if cell).most_common(limit)


def _select_columns(
    rows: list[dict[str, str]],
    columns: list[str],
    max_numeric: int,
    max_categorical: int,
) -> tuple[list[str], list[str]]:
    """Split *columns* into the numeric and categorical columns to chart.

    The categorical list is built from every non-numeric column *before* the
    numeric list is truncated, so numeric columns beyond ``max_numeric`` are
    dropped rather than being charted as categories.
    """
    numeric = [c for c in columns if _is_numeric_column(rows, c)]
    numeric_set = set(numeric)
    categorical = [c for c in columns if c not in numeric_set]
    return numeric[:max_numeric], categorical[:max_categorical]


def _safe_stem(path: Path) -> str:
    """Return the file stem of *path* with spaces replaced by underscores."""
    return path.stem.replace(" ", "_")


def _ensure_matplotlib():
    """Import matplotlib with the ``Agg`` backend and return ``pyplot``.

    Raises:
        RuntimeError: if matplotlib cannot be imported or configured.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        return plt
    except Exception as exc:
        # Deliberately broad: any import/backend failure is reported as the
        # single RuntimeError type this module documents.
        raise RuntimeError("matplotlib is required for chart generation") from exc


def _draw_histogram(plt, title: str, col: str, values: list[float]) -> None:
    """Draw a 20-bin histogram of *values* on the current figure."""
    plt.hist(values, bins=20)
    plt.title(title)
    plt.xlabel(col)
    plt.ylabel("count")


def _draw_boxplot(plt, title: str, col: str, values: list[float]) -> None:
    """Draw a box plot of *values* on the current figure."""
    # No orientation argument: this relies on matplotlib drawing box plots
    # vertically by default.
    plt.boxplot(values)
    plt.title(title)
    plt.ylabel(col)


def _draw_top_bar(plt, title: str, items: list[tuple[str, int]]) -> None:
    """Draw a bar chart of ``(label, count)`` pairs on the current figure."""
    positions = range(len(items))
    plt.bar(positions, [count for _, count in items])
    plt.xticks(positions, [label for label, _ in items], rotation=30, ha="right")
    plt.title(title)


def _render_chart(plt, figsize: tuple[int, int], out: Path, draw, *args) -> str:
    """Create a figure, run ``draw(plt, *args)``, save it to *out*.

    The figure is closed even when drawing or saving raises, so a failed chart
    does not leave an open figure behind. Returns ``str(out)``.
    """
    fig = plt.figure(figsize=figsize)
    try:
        draw(plt, *args)
        plt.tight_layout()
        fig.savefig(out)
    finally:
        plt.close(fig)
    return str(out)


def create_file_charts(
    csv_path: Path,
    out_dir: Path,
    max_numeric: int = 3,
    max_categorical: int = 2,
    *,
    stem: str | None = None,
) -> list[str]:
    """Render charts for one CSV file and return the saved PNG paths.

    For each of the first ``max_numeric`` numeric columns a histogram and a
    box plot are written; for each of the first ``max_categorical``
    non-numeric columns a bar chart of its ten most frequent values is
    written. Columns with nothing to plot are skipped.

    Args:
        csv_path: CSV file to read (decoded as ``utf-8-sig``).
        out_dir: Directory for the PNG files; created if missing.
        max_numeric: Maximum number of numeric columns to chart.
        max_categorical: Maximum number of categorical columns to chart.
        stem: File-name prefix and title label, used verbatim. Defaults to
            the CSV file stem with spaces replaced by underscores.

    Returns:
        Paths of the written chart files, as strings. Empty when the CSV has
        no header row.

    Raises:
        RuntimeError: if matplotlib is unavailable.
    """
    # Fail fast on a missing matplotlib, before touching the file system.
    plt = _ensure_matplotlib()

    out_dir.mkdir(parents=True, exist_ok=True)
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            return []
        columns = [str(c) for c in reader.fieldnames]
        rows = list(reader)

    numeric_cols, categorical_cols = _select_columns(
        rows, columns, max_numeric, max_categorical
    )
    prefix = _safe_stem(csv_path) if stem is None else stem
    artifacts: list[str] = []

    for col in numeric_cols:
        values = _numeric_values(rows, col)
        if not values:
            continue
        base = f"{prefix}_{_safe_name(col)}"
        artifacts.append(
            _render_chart(
                plt, (7, 4), out_dir / f"{base}_hist.png",
                _draw_histogram, f"{prefix} - {col} histogram", col, values,
            )
        )
        artifacts.append(
            _render_chart(
                plt, (5, 4), out_dir / f"{base}_box.png",
                _draw_boxplot, f"{prefix} - {col} boxplot", col, values,
            )
        )

    for col in categorical_cols:
        items = _top_values(rows, col)
        if not items:
            continue
        artifacts.append(
            _render_chart(
                plt, (8, 4), out_dir / f"{prefix}_{_safe_name(col)}_top.png",
                _draw_top_bar, f"{prefix} - {col} top values", items,
            )
        )

    return artifacts


def create_multi_charts(
    csv_paths: list[Path],
    out_dir: Path,
) -> dict[str, Any]:
    """Render charts for several CSV files into one directory.

    Args:
        csv_paths: CSV files to chart. A path listed more than once is
            charted only once.
        out_dir: Directory that receives every chart file.

    Returns:
        Mapping of ``str(csv_path)`` to the list of chart paths written for
        that file.

    Raises:
        RuntimeError: if matplotlib is unavailable.
    """
    results: dict[str, Any] = {}
    taken: set[str] = set()
    for p in csv_paths:
        key = str(p)
        if key in results:
            continue
        # Files from different directories may share a stem; give later ones
        # a numeric suffix so their charts do not overwrite earlier ones.
        base = _safe_stem(p)
        stem = base
        n = 1
        while stem in taken:
            n += 1
            stem = f"{base}_{n}"
        taken.add(stem)
        results[key] = create_file_charts(p, out_dir, stem=stem)
    return results
"charts_error" in body