diff --git a/README.md b/README.md index 0aba2c7..4ae706f 100644 --- a/README.md +++ b/README.md @@ -7,16 +7,18 @@ ## 0) 현재 완성도 빠른 진단 -현 시점 기준 기능 완성도(실사용 관점): **약 90%** +현 시점 기준 기능 완성도(실사용 관점): **약 93%** - 완료 - CSV 기초 요약(행/열/결측/숫자 통계) - BitNet용 프롬프트 자동 생성 - - CLI 실행 흐름 + - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) + - 컬럼별 결측/고유/상위값 비율 산출 + - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) - 브라우저 UI(`bitnet-analyze ui`) - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** - 남은 과제 - - 시각화(차트) 자동 생성 + - 대시보드형 시각화 UI 고도화(필터/드릴다운) - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화 ### 파일 붙여넣기 분석 가능 범위 @@ -128,6 +130,7 @@ jupyter lab 데스크톱 UI 내 `환경진단` 버튼으로 Ollama 설치/실행/모델 보유 여부를 즉시 확인할 수 있습니다. 또한 CSV 파일을 선택하지 않아도 CSV 텍스트를 바로 붙여넣어 분석할 수 있습니다. +(다중 CSV 동시 분석은 현재 CLI `multi-analyze`에서 먼저 지원합니다.) --- @@ -214,6 +217,9 @@ bitnet-analyze doctor --model bitnet:latest # 7) 마크다운 분석 리포트 저장 bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md + +# 8) 다중 CSV 통합 분석(JSON+MD+코드가이드) +bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --out-json multi.json --out-report multi.md ``` --- diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index 4d2cc57..dcd8a71 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -8,6 +8,7 @@ from .analysis import DataSummary, build_analysis_payload, build_markdown_report from .doctor import collect_environment +from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json from .web import serve @@ -53,6 +54,23 @@ def _build_parser() -> argparse.ArgumentParser: doctor_parser = subparsers.add_parser("doctor", help="Run local environment diagnostics") doctor_parser.add_argument("--model", default=None, help="Optional model tag to check availability") + + multi_parser = subparsers.add_parser("multi-analyze", help="Analyze multiple CSV files together") + multi_parser.add_argument("csv", nargs="+", type=Path, help="Input CSV paths") + multi_parser.add_argument("--question", 
required=True, help="Analysis question") + multi_parser.add_argument( + "--out-json", + type=Path, + default=Path("multi_analysis.json"), + help="Where to store multi CSV analysis JSON", + ) + multi_parser.add_argument( + "--out-report", + type=Path, + default=Path("multi_analysis_report.md"), + help="Where to store multi CSV markdown report", + ) + report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV") report_parser.add_argument("csv", type=Path, help="Input CSV path") report_parser.add_argument("--question", required=True, help="Analysis question") @@ -68,7 +86,7 @@ def _build_parser() -> argparse.ArgumentParser: def main(argv: list[str] | None = None) -> int: raw_args = list(sys.argv[1:] if argv is None else argv) - if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "-h", "--help"}: + if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "multi-analyze", "-h", "--help"}: raw_args.insert(0, "analyze") parser = _build_parser() @@ -90,6 +108,15 @@ def main(argv: list[str] | None = None) -> int: return 0 + + if args.command == "multi-analyze": + result = analyze_multiple_csv(args.csv, args.question) + args.out_json.write_text(result_to_json(result), encoding="utf-8") + args.out_report.write_text(build_multi_csv_markdown(result), encoding="utf-8") + print(f"multi analysis json saved: {args.out_json}") + print(f"multi analysis report saved: {args.out_report}") + return 0 + if args.command == "report": payload = build_analysis_payload(args.csv, args.question) summary = DataSummary(**payload["summary"]) diff --git a/bitnet_tools/multi_csv.py b/bitnet_tools/multi_csv.py new file mode 100644 index 0000000..adb80f4 --- /dev/null +++ b/bitnet_tools/multi_csv.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import csv +import json +from collections import Counter +from pathlib import Path +from typing import Any + +from .analysis import _to_float, 
def _profile_rows(rows: list[dict[str, str]], columns: list[str]) -> dict[str, Any]:
    """Compute per-column quality statistics for already-parsed CSV rows.

    Args:
        rows: Parsed rows keyed by column name (e.g. from ``csv.DictReader``).
            A missing key or ``None`` value is treated as an empty cell.
        columns: Header names to profile. A name that appears more than once
            is profiled a single time so its counts are not doubled.

    Returns:
        A dict with two keys:

        * ``"summary"`` - ``summarize_reader(rows, columns).to_dict()``.
        * ``"column_profiles"`` - for every column: ``missing_count``,
          ``missing_ratio``, ``non_missing_count``, ``unique_count``,
          ``unique_ratio``, ``top_values`` (up to five entries with
          ``value``/``count``/``ratio``), ``numeric_distribution`` and
          ``dtype``. Ratios are rounded to six decimals.
    """
    row_count = len(rows)

    # Duplicate header names would otherwise be visited twice per row and push
    # the missing/unique ratios above 1.0.
    distinct_columns = list(dict.fromkeys(columns))

    missing = dict.fromkeys(distinct_columns, 0)
    value_counts: dict[str, Counter[str]] = {c: Counter() for c in distinct_columns}
    sign_counts: dict[str, Counter[str]] = {c: Counter() for c in distinct_columns}

    for row in rows:
        for col in distinct_columns:
            raw = (row.get(col) or "").strip()
            if not raw:
                missing[col] += 1
                continue
            value_counts[col][raw] += 1

            num = _to_float(raw)
            if num is None:
                continue
            if num > 0:
                sign_counts[col]["positive"] += 1
            elif num < 0:
                sign_counts[col]["negative"] += 1
            elif num == 0:
                sign_counts[col]["zero"] += 1
            # Anything else compares false against 0 in every direction (NaN);
            # it is deliberately left out instead of being reported as zero.
            # NOTE(review): only matters if _to_float lets "nan" through - confirm.

    summary = summarize_reader(rows, columns)

    profiles: dict[str, Any] = {}
    for col in distinct_columns:
        # Every row is either missing or non-missing for a given column.
        non_missing = row_count - missing[col]
        # The Counter keys are exactly the distinct non-empty values.
        unique_count = len(value_counts[col])

        top_values = [
            {
                "value": value,
                "count": count,
                # Share of ALL rows (missing included), not of non-missing rows.
                "ratio": round(count / row_count, 6) if row_count else 0.0,
            }
            for value, count in value_counts[col].most_common(5)
        ]

        signs = sign_counts[col]
        numeric_total = signs["positive"] + signs["zero"] + signs["negative"]
        numeric_distribution: dict[str, float] = {}
        if numeric_total:
            numeric_distribution = {
                "positive_ratio": round(signs["positive"] / numeric_total, 6),
                "zero_ratio": round(signs["zero"] / numeric_total, 6),
                "negative_ratio": round(signs["negative"] / numeric_total, 6),
            }

        profiles[col] = {
            "missing_count": missing[col],
            "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0,
            "non_missing_count": non_missing,
            "unique_count": unique_count,
            "unique_ratio": round(unique_count / non_missing, 6) if non_missing else 0.0,
            "top_values": top_values,
            "numeric_distribution": numeric_distribution,
            "dtype": summary.dtypes[col],
        }

    return {
        "summary": summary.to_dict(),
        "column_profiles": profiles,
    }
def analyze_multiple_csv(csv_paths: list[Path], question: str) -> dict[str, Any]:
    """Profile several CSV files and describe how their columns relate.

    Args:
        csv_paths: Paths of the CSV files to read (``str`` entries are accepted
            too). Files are decoded as ``utf-8-sig``.
        question: The analysis question; stored at the top level and on every
            per-file entry of the result.

    Returns:
        A JSON-serialisable dict with ``question``, ``file_count``,
        ``total_row_count``, ``shared_columns`` (sorted intersection of the
        headers), ``union_columns`` (sorted union), ``files`` (one entry per
        input with ``path``, ``question``, ``summary``, ``column_profiles``)
        and ``code_guidance``.

    Raises:
        ValueError: If ``csv_paths`` is empty or a file has no header row.
        FileNotFoundError: If one of the paths does not exist.
    """
    if not csv_paths:
        raise ValueError("at least one CSV path is required")

    files: list[dict[str, Any]] = []
    column_sets: list[set[str]] = []
    total_rows = 0

    for raw_path in csv_paths:
        path = Path(raw_path)
        if not path.exists():
            raise FileNotFoundError(f"CSV file not found: {path}")

        # newline="" lets the csv module handle embedded line breaks itself.
        with path.open("r", encoding="utf-8-sig", newline="") as f:
            reader = csv.DictReader(f)
            if reader.fieldnames is None:
                raise ValueError(f"CSV header not found: {path}")
            columns = [str(c) for c in reader.fieldnames]
            rows = list(reader)

        profiled = _profile_rows(rows, columns)
        total_rows += profiled["summary"]["row_count"]
        column_sets.append(set(columns))
        files.append(
            {
                "path": str(path),
                "question": question,
                "summary": profiled["summary"],
                "column_profiles": profiled["column_profiles"],
            }
        )

    # csv_paths is non-empty here, so column_sets has at least one member.
    shared_columns = sorted(set.intersection(*column_sets))
    union_columns = sorted(set.union(*column_sets))

    return {
        "question": question,
        "file_count": len(files),
        "total_row_count": total_rows,
        "shared_columns": shared_columns,
        "union_columns": union_columns,
        "files": files,
        "code_guidance": build_code_guidance(shared_columns),
    }


def build_code_guidance(shared_columns: list[str]) -> dict[str, str]:
    """Return follow-up guidance: recommended steps plus a pandas example.

    Args:
        shared_columns: Column names present in every analysed file. The first
            one is used as the merge key in the example; when the list is
            empty a placeholder name is used instead.

    Returns:
        A dict with ``recommended_steps`` (plain text) and ``pandas_example``
        (Python source text that is syntactically valid on its own).
    """
    join_key = shared_columns[0] if shared_columns else "공통키컬럼"

    # The snippet is source code inside a string: newlines that belong INSIDE
    # the generated string literals must be written as "\\n", otherwise the
    # emitted print() calls contain a raw line break and do not parse.
    # The key is rendered with !r so quotes/backslashes in a column name are
    # escaped instead of terminating the generated literal.
    pandas_code = (
        "import pandas as pd\n"
        "import matplotlib.pyplot as plt\n\n"
        "paths = ['file1.csv', 'file2.csv', 'file3.csv']\n"
        "dfs = [pd.read_csv(p) for p in paths]\n\n"
        f"key = {join_key!r}\n"
        "merged = dfs[0]\n"
        "for df in dfs[1:]:\n"
        "    if key in merged.columns and key in df.columns:\n"
        "        merged = merged.merge(df, on=key, how='outer', suffixes=('', '_r'))\n\n"
        "missing_ratio = merged.isna().mean().sort_values(ascending=False)\n"
        "print('결측 비율 상위:\\n', missing_ratio.head(10))\n\n"
        "numeric_cols = merged.select_dtypes(include='number').columns\n"
        "if len(numeric_cols) > 0:\n"
        "    ratio = (merged[numeric_cols] > 0).mean().sort_values(ascending=False)\n"
        "    print('양수 비율 상위:\\n', ratio.head(10))\n"
        "    ratio.head(10).plot(kind='bar', title='양수 비율 상위 10개 컬럼')\n"
        "    plt.tight_layout(); plt.show()\n"
    )

    return {
        "recommended_steps": (
            "1) 공통 키 컬럼 확인 후 병합\n"
            "2) 컬럼별 결측/고유값/상위값 비율 확인\n"
            "3) 수치형 컬럼 비율(양수/0/음수)과 분포 시각화\n"
            "4) 지역/유형 컬럼과 수치형 컬럼 교차 집계로 인사이트 도출"
        ),
        "pandas_example": pandas_code,
    }


def _md_cell(value: object) -> str:
    """Render a value so it cannot break out of a one-line markdown table cell."""
    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def build_multi_csv_markdown(result: dict[str, Any]) -> str:
    """Render the dict returned by ``analyze_multiple_csv`` as a markdown report.

    Args:
        result: Must provide ``question``, ``file_count``, ``total_row_count``,
            ``shared_columns``, ``files`` and ``code_guidance``; each file
            entry needs ``path``, ``summary`` (``row_count``, ``column_count``,
            ``columns``) and ``column_profiles`` (``dtype``, ``missing_ratio``,
            ``unique_ratio`` per column).

    Returns:
        The report as one string: a header, one table per file and the code
        guidance in fenced blocks.
    """
    shared = ", ".join(result["shared_columns"]) if result["shared_columns"] else "(없음)"
    lines = [
        "# 다중 CSV 분석 리포트",
        "",
        f"- 질문: {result['question']}",
        f"- 파일 수: {result['file_count']}",
        f"- 전체 행 수: {result['total_row_count']}",
        f"- 공통 컬럼: {shared}",
        "",
    ]

    for file_info in result["files"]:
        summary = file_info["summary"]
        lines.extend(
            [
                f"## 파일: {file_info['path']}",
                "",
                f"- 행 수: {summary['row_count']}",
                f"- 열 수: {summary['column_count']}",
                "",
                "| 컬럼 | 타입 | 결측비율 | 고유비율 |",
                "|---|---|---:|---:|",
            ]
        )
        for col in summary["columns"]:
            prof = file_info["column_profiles"][col]
            # Header text comes straight from the CSV, so escape table syntax.
            lines.append(
                f"| {_md_cell(col)} | {_md_cell(prof['dtype'])} "
                f"| {prof['missing_ratio']:.4f} | {prof['unique_ratio']:.4f} |"
            )
        lines.append("")

    lines.extend(
        [
            "## 코드 가이드",
            "",
            "```text",
            result["code_guidance"]["recommended_steps"],
            "```",
            "",
            "```python",
            result["code_guidance"]["pandas_example"],
            "```",
        ]
    )

    return "\n".join(lines)


def result_to_json(result: dict[str, Any]) -> str:
    """Serialise an analysis result as indented JSON, keeping non-ASCII text readable."""
    return json.dumps(result, ensure_ascii=False, indent=2)
def test_multi_csv_report_builder(tmp_path):
    """Two CSVs that share only `city` produce a two-file result and a titled report."""
    first_csv = tmp_path / "a.csv"
    second_csv = tmp_path / "b.csv"
    for target, content in (
        (first_csv, "city,v\nseoul,1\n"),
        (second_csv, "city,v2\nseoul,2\n"),
    ):
        target.write_text(content, encoding="utf-8")

    result = analyze_multiple_csv([first_csv, second_csv], "비교")
    report = build_multi_csv_markdown(result)

    assert result["file_count"] == 2
    assert "city" in result["shared_columns"]
    assert "다중 CSV 분석 리포트" in report


def test_cli_multi_analyze_mode(tmp_path):
    """`multi-analyze` exits with 0 and writes both the JSON and the markdown output."""
    first_csv = tmp_path / "a.csv"
    second_csv = tmp_path / "b.csv"
    json_target = tmp_path / "out.json"
    report_target = tmp_path / "out.md"

    first_csv.write_text("city,val\nseoul,1\nbusan,2\n", encoding="utf-8")
    second_csv.write_text("city,val2\nseoul,10\ndaegu,20\n", encoding="utf-8")

    argv = ["multi-analyze", str(first_csv), str(second_csv)]
    argv += ["--question", "다중 비교"]
    argv += ["--out-json", str(json_target)]
    argv += ["--out-report", str(report_target)]

    exit_code = cli.main(argv)

    assert exit_code == 0
    assert json_target.exists()
    assert report_target.exists()
    assert "다중 CSV 분석 리포트" in report_target.read_text(encoding="utf-8")