rad1092 · rad1092 · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026 · chatgpt-codex-connector
diff --git a/README.md b/README.md
@@ -7,16 +7,18 @@
 
 ## 0) 현재 완성도 빠른 진단
 
-현 시점 기준 기능 완성도(실사용 관점): **약 90%**
+현 시점 기준 기능 완성도(실사용 관점): **약 93%**
 
 - 완료
   - CSV 기초 요약(행/열/결측/숫자 통계)
   - BitNet용 프롬프트 자동 생성
-  - CLI 실행 흐름
+  - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`)
+  - 컬럼별 결측/고유/상위값 비율 산출
+  - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성)
   - 브라우저 UI(`bitnet-analyze ui`)
   - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)**
 - 남은 과제
-  - 시각화(차트) 자동 생성
+  - 대시보드형 시각화 UI 고도화(필터/드릴다운)
   - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화
 
 ### 파일 붙여넣기 분석 가능 범위
@@ -128,6 +130,7 @@ jupyter lab
 
 데스크톱 UI 내 `환경진단` 버튼으로 Ollama 설치/실행/모델 보유 여부를 즉시 확인할 수 있습니다.
 또한 CSV 파일을 선택하지 않아도 CSV 텍스트를 바로 붙여넣어 분석할 수 있습니다.
+(다중 CSV 동시 분석은 현재 CLI `multi-analyze`에서 먼저 지원합니다.)
 
 ---
 
@@ -214,6 +217,9 @@ bitnet-analyze doctor --model bitnet:latest
 
 # 7) 마크다운 분석 리포트 저장
 bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md
+
+# 8) 다중 CSV 통합 분석(JSON+MD+코드가이드)
+bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --out-json multi.json --out-report multi.md
 ```
 
 ---

diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py
@@ -8,6 +8,7 @@
 
 from .analysis import DataSummary, build_analysis_payload, build_markdown_report
 from .doctor import collect_environment
+from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json
 from .web import serve
 
 
@@ -53,6 +54,23 @@ def _build_parser() -> argparse.ArgumentParser:
     doctor_parser = subparsers.add_parser("doctor", help="Run local environment diagnostics")
     doctor_parser.add_argument("--model", default=None, help="Optional model tag to check availability")
 
+
+    multi_parser = subparsers.add_parser("multi-analyze", help="Analyze multiple CSV files together")
+    multi_parser.add_argument("csv", nargs="+", type=Path, help="Input CSV paths")
+    multi_parser.add_argument("--question", required=True, help="Analysis question")
+    multi_parser.add_argument(
+        "--out-json",
+        type=Path,
+        default=Path("multi_analysis.json"),
+        help="Where to store multi CSV analysis JSON",
+    )
+    multi_parser.add_argument(
+        "--out-report",
+        type=Path,
+        default=Path("multi_analysis_report.md"),
+        help="Where to store multi CSV markdown report",
+    )
+
     report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV")
     report_parser.add_argument("csv", type=Path, help="Input CSV path")
     report_parser.add_argument("--question", required=True, help="Analysis question")
@@ -68,7 +86,7 @@ def _build_parser() -> argparse.ArgumentParser:
 
 def main(argv: list[str] | None = None) -> int:
     raw_args = list(sys.argv[1:] if argv is None else argv)
-    if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "-h", "--help"}:
+    if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "multi-analyze", "-h", "--help"}:
         raw_args.insert(0, "analyze")
 
     parser = _build_parser()
@@ -90,6 +108,15 @@ def main(argv: list[str] | None = None) -> int:
         return 0
 
 
+
+    if args.command == "multi-analyze":
+        result = analyze_multiple_csv(args.csv, args.question)
+        args.out_json.write_text(result_to_json(result), encoding="utf-8")
+        args.out_report.write_text(build_multi_csv_markdown(result), encoding="utf-8")
+        print(f"multi analysis json saved: {args.out_json}")
+        print(f"multi analysis report saved: {args.out_report}")
+        return 0
+
     if args.command == "report":
         payload = build_analysis_payload(args.csv, args.question)
         summary = DataSummary(**payload["summary"])

diff --git a/bitnet_tools/multi_csv.py b/bitnet_tools/multi_csv.py
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import csv
+import json
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+from .analysis import _to_float, summarize_reader
+
+
+def _profile_rows(rows: list[dict[str, str]], columns: list[str]) -> dict[str, Any]:
+    """Profile every column of one CSV and attach the shared summary.
+
+    Args:
+        rows: Parsed rows keyed by column name; a cell that is absent,
+            ``None`` or blank after stripping counts as missing.
+        columns: Header names in file order. A name that occurs more than
+            once is profiled a single time so its counters are not inflated.
+
+    Returns:
+        ``{"summary": ..., "column_profiles": ...}`` where the summary is
+        ``summarize_reader(rows, columns).to_dict()`` and each profile holds
+        missing/unique counts and ratios, the five most common values, the
+        sign distribution of numeric cells and the summary's dtype.
+    """
+    row_count = len(rows)
+    # A duplicated header name (e.g. "a,a") would otherwise hit the same
+    # counters twice per row and push the ratios above 1.
+    profile_columns = list(dict.fromkeys(columns))
+    missing = {c: 0 for c in profile_columns}
+    # Counter keys double as the set of distinct values, so no separate
+    # per-column set is kept.
+    value_counts: dict[str, Counter[str]] = {c: Counter() for c in profile_columns}
+
+    numeric_positive = {c: 0 for c in profile_columns}
+    numeric_zero = {c: 0 for c in profile_columns}
+    numeric_negative = {c: 0 for c in profile_columns}
+
+    for row in rows:
+        for col in profile_columns:
+            raw = (row.get(col) or "").strip()
+            if not raw:
+                missing[col] += 1
+                continue
+            value_counts[col][raw] += 1
+
+            num = _to_float(raw)
+            if num is None:
+                continue
+            if num > 0:
+                numeric_positive[col] += 1
+            elif num < 0:
+                numeric_negative[col] += 1
+            elif num == 0:
+                # NaN compares false to everything; should _to_float ever
+                # return one, it is left out rather than tallied as zero.
+                numeric_zero[col] += 1
+
+    summary = summarize_reader(rows, columns)
+    profiles: dict[str, Any] = {}
+    for col in profile_columns:
+        # Every row is either missing or counted, so this is the non-missing total.
+        nn = row_count - missing[col]
+        unique_count = len(value_counts[col])
+        # Top-value ratios are relative to all rows, missing cells included.
+        top_values = [
+            {
+                "value": v,
+                "count": cnt,
+                "ratio": round(cnt / row_count, 6) if row_count else 0.0,
+            }
+            for v, cnt in value_counts[col].most_common(5)
+        ]
+
+        numeric_total = numeric_positive[col] + numeric_zero[col] + numeric_negative[col]
+        numeric_distribution: dict[str, float] = {}
+        if numeric_total:
+            numeric_distribution = {
+                "positive_ratio": round(numeric_positive[col] / numeric_total, 6),
+                "zero_ratio": round(numeric_zero[col] / numeric_total, 6),
+                "negative_ratio": round(numeric_negative[col] / numeric_total, 6),
+            }
+
+        profiles[col] = {
+            "missing_count": missing[col],
+            "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0,
+            "non_missing_count": nn,
+            "unique_count": unique_count,
+            # Unique ratio is relative to non-missing cells only.
+            "unique_ratio": round(unique_count / nn, 6) if nn else 0.0,
+            "top_values": top_values,
+            "numeric_distribution": numeric_distribution,
+            "dtype": summary.dtypes[col],
+        }
+
+    return {
+        "summary": summary.to_dict(),
+        "column_profiles": profiles,
+    }
+
+
+def analyze_multiple_csv(csv_paths: list[Path], question: str) -> dict[str, Any]:
+    """Profile each CSV file and combine the results into one payload.
+
+    Args:
+        csv_paths: Paths of the CSV files to analyze; must not be empty.
+        question: Analysis question, stored at the top level and per file.
+
+    Returns:
+        A dict with the question, file count, total row count, the sorted
+        shared and union column names, the per-file summaries/profiles and
+        the generated code guidance.
+
+    Raises:
+        ValueError: If no path is given or a file has no header row.
+        FileNotFoundError: If one of the paths does not exist.
+    """
+    if not csv_paths:
+        raise ValueError("at least one CSV path is required")
+
+    file_entries: list[dict[str, Any]] = []
+    column_sets: list[set[str]] = []
+    row_total = 0
+
+    for csv_path in csv_paths:
+        if not csv_path.exists():
+            raise FileNotFoundError(f"CSV file not found: {csv_path}")
+
+        # utf-8-sig drops a leading BOM so it does not end up in the first header name.
+        with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
+            reader = csv.DictReader(handle)
+            header = reader.fieldnames
+            if header is None:
+                raise ValueError(f"CSV header not found: {csv_path}")
+            columns = [str(name) for name in header]
+            rows = list(reader)
+
+        profile = _profile_rows(rows, columns)
+        row_total += profile["summary"]["row_count"]
+        column_sets.append(set(columns))
+        file_entries.append(
+            dict(
+                path=str(csv_path),
+                question=question,
+                summary=profile["summary"],
+                column_profiles=profile["column_profiles"],
+            )
+        )
+
+    if column_sets:
+        first, *others = column_sets
+        shared_columns = sorted(first.intersection(*others))
+        union_columns = sorted(first.union(*others))
+    else:
+        shared_columns = []
+        union_columns = []
+
+    guidance = build_code_guidance(shared_columns)
+    return {
+        "question": question,
+        "file_count": len(file_entries),
+        "total_row_count": row_total,
+        "shared_columns": shared_columns,
+        "union_columns": union_columns,
+        "files": file_entries,
+        "code_guidance": guidance,
+    }
+
+
+def build_code_guidance(shared_columns: list[str]) -> dict[str, str]:
+    """Return analysis steps and a runnable pandas example for merged CSVs.
+
+    Args:
+        shared_columns: Column names present in every analyzed file. The
+            first one is used as the join key in the example; when the list
+            is empty a placeholder key name is used instead.
+
+    Returns:
+        A dict with ``"recommended_steps"`` (numbered plain-text steps) and
+        ``"pandas_example"`` (Python source text for the example script).
+    """
+    join_key = shared_columns[0] if shared_columns else "공통키컬럼"
+
+    pandas_code = (
+        "import pandas as pd\n"
+        "import matplotlib.pyplot as plt\n\n"
+        "paths = ['file1.csv', 'file2.csv', 'file3.csv']\n"
+        "dfs = [pd.read_csv(p) for p in paths]\n\n"
+        # repr() quotes and escapes the name, so a column containing a quote
+        # or backslash still yields a valid string literal in the example.
+        f"key = {join_key!r}\n"
+        "merged = dfs[0]\n"
+        "for df in dfs[1:]:\n"
+        "    if key in merged.columns and key in df.columns:\n"
+        "        merged = merged.merge(df, on=key, how='outer', suffixes=('', '_r'))\n\n"
+        "missing_ratio = merged.isna().mean().sort_values(ascending=False)\n"
+        # "\\n" keeps a literal backslash-n in the generated source; a bare
+        # "\n" here would split the generated string literal across two lines.
+        "print('결측 비율 상위:\\n', missing_ratio.head(10))\n\n"
+        "numeric_cols = merged.select_dtypes(include='number').columns\n"
+        "if len(numeric_cols) > 0:\n"
+        "    ratio = (merged[numeric_cols] > 0).mean().sort_values(ascending=False)\n"
+        "    print('양수 비율 상위:\\n', ratio.head(10))\n"
+        "    ratio.head(10).plot(kind='bar', title='양수 비율 상위 10개 컬럼')\n"
+        "    plt.tight_layout(); plt.show()\n"
+    )
+
+    return {
+        "recommended_steps": (
+            "1) 공통 키 컬럼 확인 후 병합\n"
+            "2) 컬럼별 결측/고유값/상위값 비율 확인\n"
+            "3) 수치형 컬럼 비율(양수/0/음수)과 분포 시각화\n"
+            "4) 지역/유형 컬럼과 수치형 컬럼 교차 집계로 인사이트 도출"
+        ),
+        "pandas_example": pandas_code,
+    }
+
+
+def build_multi_csv_markdown(result: dict[str, Any]) -> str:
+    """Render a multi-CSV analysis result as a markdown report.
+
+    The report has a header with the question and totals, one section with
+    a column table per file, and a closing code-guide section.
+
+    Args:
+        result: Payload shaped like the return value of
+            ``analyze_multiple_csv``.
+
+    Returns:
+        The markdown text, lines joined with ``"\\n"`` and no trailing newline.
+    """
+    out: list[str] = ["# 다중 CSV 분석 리포트", ""]
+    out.append(f"- 질문: {result['question']}")
+    out.append(f"- 파일 수: {result['file_count']}")
+    out.append(f"- 전체 행 수: {result['total_row_count']}")
+    out.append(
+        f"- 공통 컬럼: {', '.join(result['shared_columns']) if result['shared_columns'] else '(없음)'}"
+    )
+    out.append("")
+
+    for entry in result["files"]:
+        out.append(f"## 파일: {entry['path']}")
+        out.append("")
+        out.append(f"- 행 수: {entry['summary']['row_count']}")
+        out.append(f"- 열 수: {entry['summary']['column_count']}")
+        out.append("")
+        out.append("| 컬럼 | 타입 | 결측비율 | 고유비율 |")
+        out.append("|---|---|---:|---:|")
+        for name in entry["summary"]["columns"]:
+            profile = entry["column_profiles"][name]
+            cells = (
+                name,
+                profile["dtype"],
+                f"{profile['missing_ratio']:.4f}",
+                f"{profile['unique_ratio']:.4f}",
+            )
+            out.append(f"| {cells[0]} | {cells[1]} | {cells[2]} | {cells[3]} |")
+        out.append("")
+
+    guidance = result["code_guidance"]
+    out += [
+        "## 코드 가이드",
+        "",
+        "```text",
+        guidance["recommended_steps"],
+        "```",
+        "",
+        "```python",
+        guidance["pandas_example"],
+        "```",
+    ]
+
+    return "\n".join(out)
+
+
+def result_to_json(result: dict[str, Any]) -> str:
+    """Serialize an analysis result as 2-space-indented JSON, keeping non-ASCII text as is."""
+    serialized = json.dumps(result, indent=2, ensure_ascii=False)
+    return serialized
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -4,6 +4,8 @@
     summarize_rows,
     build_markdown_report,
 )
+from bitnet_tools.multi_csv import analyze_multiple_csv, build_multi_csv_markdown
+
 
 
 def test_summarize_rows_basic():
@@ -59,3 +61,17 @@ def test_build_markdown_report():
     assert "# BitNet CSV 분석 보고서" in report
     assert "| a |" in report
     assert "테스트 질문" in report
+
+
+def test_multi_csv_report_builder(tmp_path):
+    """Two CSVs sharing ``city`` give a two-file result and a titled report."""
+    first_csv = tmp_path / "a.csv"
+    second_csv = tmp_path / "b.csv"
+    fixtures = (
+        (first_csv, "city,v\nseoul,1\n"),
+        (second_csv, "city,v2\nseoul,2\n"),
+    )
+    for target, content in fixtures:
+        target.write_text(content, encoding="utf-8")
+
+    result = analyze_multiple_csv([first_csv, second_csv], "비교")
+    report = build_multi_csv_markdown(result)
+
+    assert result["file_count"] == 2
+    assert "city" in result["shared_columns"]
+    assert "다중 CSV 분석 리포트" in report
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -54,3 +54,30 @@ def test_cli_report_mode(tmp_path):
     assert code == 0
     assert out_path.exists()
     assert "BitNet CSV 분석 보고서" in out_path.read_text(encoding="utf-8")
+
+
+def test_cli_multi_analyze_mode(tmp_path):
+    """``multi-analyze`` exits 0 and writes both the JSON and markdown outputs."""
+    first_csv = tmp_path / "a.csv"
+    second_csv = tmp_path / "b.csv"
+    json_target = tmp_path / "out.json"
+    report_target = tmp_path / "out.md"
+
+    first_csv.write_text("city,val\nseoul,1\nbusan,2\n", encoding="utf-8")
+    second_csv.write_text("city,val2\nseoul,10\ndaegu,20\n", encoding="utf-8")
+
+    argv = ["multi-analyze", str(first_csv), str(second_csv)]
+    argv += ["--question", "다중 비교"]
+    argv += ["--out-json", str(json_target)]
+    argv += ["--out-report", str(report_target)]
+    exit_code = cli.main(argv)
+
+    assert exit_code == 0
+    assert json_target.exists()
+    assert report_target.exists()
+    assert "다중 CSV 분석 리포트" in report_target.read_text(encoding="utf-8")