Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,18 @@

## 0) 현재 완성도 빠른 진단

현 시점 기준 기능 완성도(실사용 관점): **약 90%**
현 시점 기준 기능 완성도(실사용 관점): **약 93%**

- 완료
- CSV 기초 요약(행/열/결측/숫자 통계)
- BitNet용 프롬프트 자동 생성
- CLI 실행 흐름
- 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`)
- 컬럼별 결측/고유/상위값 비율 산출
- 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성)
- 브라우저 UI(`bitnet-analyze ui`)
- **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)**
- 남은 과제
- 시각화(차트) 자동 생성
- 대시보드형 시각화 UI 고도화(필터/드릴다운)
- 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화

### 파일 붙여넣기 분석 가능 범위
Expand Down Expand Up @@ -128,6 +130,7 @@ jupyter lab

데스크톱 UI 내 `환경진단` 버튼으로 Ollama 설치/실행/모델 보유 여부를 즉시 확인할 수 있습니다.
또한 CSV 파일을 선택하지 않아도 CSV 텍스트를 바로 붙여넣어 분석할 수 있습니다.
(다중 CSV 동시 분석은 현재 CLI `multi-analyze`에서 먼저 지원합니다.)

---

Expand Down Expand Up @@ -214,6 +217,9 @@ bitnet-analyze doctor --model bitnet:latest

# 7) 마크다운 분석 리포트 저장
bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md

# 8) 다중 CSV 통합 분석(JSON+MD+코드가이드)
bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --out-json multi.json --out-report multi.md
```

---
Expand Down
29 changes: 28 additions & 1 deletion bitnet_tools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from .analysis import DataSummary, build_analysis_payload, build_markdown_report
from .doctor import collect_environment
from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json
from .web import serve


Expand Down Expand Up @@ -53,6 +54,23 @@ def _build_parser() -> argparse.ArgumentParser:
doctor_parser = subparsers.add_parser("doctor", help="Run local environment diagnostics")
doctor_parser.add_argument("--model", default=None, help="Optional model tag to check availability")


multi_parser = subparsers.add_parser("multi-analyze", help="Analyze multiple CSV files together")
multi_parser.add_argument("csv", nargs="+", type=Path, help="Input CSV paths")
multi_parser.add_argument("--question", required=True, help="Analysis question")
multi_parser.add_argument(
"--out-json",
type=Path,
default=Path("multi_analysis.json"),
help="Where to store multi CSV analysis JSON",
)
multi_parser.add_argument(
"--out-report",
type=Path,
default=Path("multi_analysis_report.md"),
help="Where to store multi CSV markdown report",
)

report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV")
report_parser.add_argument("csv", type=Path, help="Input CSV path")
report_parser.add_argument("--question", required=True, help="Analysis question")
Expand All @@ -68,7 +86,7 @@ def _build_parser() -> argparse.ArgumentParser:

def main(argv: list[str] | None = None) -> int:
raw_args = list(sys.argv[1:] if argv is None else argv)
if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "-h", "--help"}:
if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "multi-analyze", "-h", "--help"}:
raw_args.insert(0, "analyze")

parser = _build_parser()
Expand All @@ -90,6 +108,15 @@ def main(argv: list[str] | None = None) -> int:
return 0



if args.command == "multi-analyze":
result = analyze_multiple_csv(args.csv, args.question)
args.out_json.write_text(result_to_json(result), encoding="utf-8")
args.out_report.write_text(build_multi_csv_markdown(result), encoding="utf-8")
print(f"multi analysis json saved: {args.out_json}")
print(f"multi analysis report saved: {args.out_report}")
return 0

if args.command == "report":
payload = build_analysis_payload(args.csv, args.question)
summary = DataSummary(**payload["summary"])
Expand Down
210 changes: 210 additions & 0 deletions bitnet_tools/multi_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
from __future__ import annotations

import csv
import json
from collections import Counter
from pathlib import Path
from typing import Any

from .analysis import _to_float, summarize_reader


def _profile_rows(rows: list[dict[str, str]], columns: list[str]) -> dict[str, Any]:
    """Profile every requested column of an already-parsed CSV table.

    A cell is treated as missing when it is ``None``, empty, or only
    whitespace. Every other cell is tallied by its stripped text and, when
    ``_to_float`` returns a number for it, by the sign of that number.

    Args:
        rows: Parsed CSV records keyed by column name.
        columns: Column names to profile, in output order.

    Returns:
        A dict with two keys: ``"summary"`` holds
        ``summarize_reader(rows, columns).to_dict()`` and
        ``"column_profiles"`` maps each column name to its missing/unique
        counts and ratios, its five most common values, its sign
        distribution, and the dtype reported by the summary.
    """

    def ratio(part: int, whole: int) -> float:
        # Six-decimal share; a zero denominator yields 0.0 instead of raising.
        return round(part / whole, 6) if whole else 0.0

    total_rows = len(rows)
    blanks = dict.fromkeys(columns, 0)
    # One Counter per column doubles as the unique-value set and the
    # non-missing tally (its keys and the sum of its counts, respectively).
    tallies: dict[str, Counter[str]] = {name: Counter() for name in columns}
    signs = {name: {"positive": 0, "zero": 0, "negative": 0} for name in columns}

    for record in rows:
        for name in columns:
            text = (record.get(name) or "").strip()
            if not text:
                blanks[name] += 1
                continue
            tallies[name][text] += 1

            number = _to_float(text)
            if number is None:
                continue
            if number > 0:
                bucket = "positive"
            elif number < 0:
                bucket = "negative"
            else:
                bucket = "zero"
            signs[name][bucket] += 1

    summary = summarize_reader(rows, columns)

    profiles: dict[str, Any] = {}
    for name in columns:
        tally = tallies[name]
        present = sum(tally.values())
        distinct = len(tally)

        # Top-value ratios are relative to all rows, missing ones included.
        top_values = []
        for value, hits in tally.most_common(5):
            top_values.append(
                {"value": value, "count": hits, "ratio": ratio(hits, total_rows)}
            )

        sign = signs[name]
        numeric_total = sign["positive"] + sign["zero"] + sign["negative"]
        numeric_distribution: dict[str, float] = {}
        if numeric_total:
            numeric_distribution["positive_ratio"] = ratio(sign["positive"], numeric_total)
            numeric_distribution["zero_ratio"] = ratio(sign["zero"], numeric_total)
            numeric_distribution["negative_ratio"] = ratio(sign["negative"], numeric_total)

        profiles[name] = {
            "missing_count": blanks[name],
            "missing_ratio": ratio(blanks[name], total_rows),
            "non_missing_count": present,
            "unique_count": distinct,
            "unique_ratio": ratio(distinct, present),
            "top_values": top_values,
            "numeric_distribution": numeric_distribution,
            "dtype": summary.dtypes[name],
        }

    return {"summary": summary.to_dict(), "column_profiles": profiles}


def analyze_multiple_csv(csv_paths: list[Path], question: str) -> dict[str, Any]:
    """Profile several CSV files and describe how their columns overlap.

    Each file is read as UTF-8 (a leading BOM is tolerated), profiled with
    ``_profile_rows``, and recorded alongside the analysis question.

    Args:
        csv_paths: Paths of the CSV files to analyze; must not be empty.
        question: Analysis question stored at the top level and per file.

    Returns:
        A dict with the question, file count, total row count, the sorted
        shared and union column names, the per-file results, and the code
        guidance built from the shared columns.

    Raises:
        ValueError: If ``csv_paths`` is empty or a file has no header row.
        FileNotFoundError: If one of the paths does not exist.
    """
    if not csv_paths:
        raise ValueError("at least one CSV path is required")

    per_file: list[dict[str, Any]] = []
    column_sets: list[set[str]] = []
    row_total = 0

    for csv_path in csv_paths:
        if not csv_path.exists():
            raise FileNotFoundError(f"CSV file not found: {csv_path}")

        with csv_path.open("r", encoding="utf-8-sig", newline="") as handle:
            reader = csv.DictReader(handle)
            header = reader.fieldnames
            if header is None:
                raise ValueError(f"CSV header not found: {csv_path}")
            names = [str(field) for field in header]
            records = list(reader)

        profile = _profile_rows(records, names)
        file_summary = profile["summary"]
        row_total += file_summary["row_count"]
        column_sets.append(set(names))

        entry: dict[str, Any] = {
            "path": str(csv_path),
            "question": question,
            "summary": file_summary,
            "column_profiles": profile["column_profiles"],
        }
        per_file.append(entry)

    if column_sets:
        shared_columns = sorted(set.intersection(*column_sets))
        union_columns = sorted(set.union(*column_sets))
    else:
        shared_columns = []
        union_columns = []

    return {
        "question": question,
        "file_count": len(per_file),
        "total_row_count": row_total,
        "shared_columns": shared_columns,
        "union_columns": union_columns,
        "files": per_file,
        "code_guidance": build_code_guidance(shared_columns),
    }


def build_code_guidance(shared_columns: list[str]) -> dict[str, str]:
    """Build follow-up guidance for a multi-CSV analysis.

    Args:
        shared_columns: Column names present in every analyzed file. The
            first one is used as the merge key in the example; when the list
            is empty a placeholder key name is used instead.

    Returns:
        A dict with ``"recommended_steps"`` (a short numbered checklist) and
        ``"pandas_example"`` (a pandas/matplotlib snippet that is valid
        Python source and can be pasted into a script or notebook).
    """
    join_key = shared_columns[0] if shared_columns else "공통키컬럼"

    # The snippet is assembled line by line. The two print() lines are raw
    # strings so the generated code contains the two characters backslash + n
    # inside its own string literal; a real line break there would leave the
    # generated literal unterminated and the snippet would not compile.
    snippet_lines = [
        "import pandas as pd",
        "import matplotlib.pyplot as plt",
        "",
        "paths = ['file1.csv', 'file2.csv', 'file3.csv']",
        "dfs = [pd.read_csv(p) for p in paths]",
        "",
        # repr() keeps the generated literal valid even when the column name
        # itself contains quotes or backslashes.
        f"key = {join_key!r}",
        "merged = dfs[0]",
        "for df in dfs[1:]:",
        "    if key in merged.columns and key in df.columns:",
        "        merged = merged.merge(df, on=key, how='outer', suffixes=('', '_r'))",
        "",
        "missing_ratio = merged.isna().mean().sort_values(ascending=False)",
        r"print('결측 비율 상위:\n', missing_ratio.head(10))",
        "",
        "numeric_cols = merged.select_dtypes(include='number').columns",
        "if len(numeric_cols) > 0:",
        "    ratio = (merged[numeric_cols] > 0).mean().sort_values(ascending=False)",
        r"    print('양수 비율 상위:\n', ratio.head(10))",
        "    ratio.head(10).plot(kind='bar', title='양수 비율 상위 10개 컬럼')",
        "    plt.tight_layout(); plt.show()",
    ]
    pandas_code = "\n".join(snippet_lines) + "\n"

    return {
        "recommended_steps": (
            "1) 공통 키 컬럼 확인 후 병합\n"
            "2) 컬럼별 결측/고유값/상위값 비율 확인\n"
            "3) 수치형 컬럼 비율(양수/0/음수)과 분포 시각화\n"
            "4) 지역/유형 컬럼과 수치형 컬럼 교차 집계로 인사이트 도출"
        ),
        "pandas_example": pandas_code,
    }


def _escape_table_cell(value: Any) -> str:
    """Return *value* as text that is safe inside one Markdown table cell."""
    text = str(value)
    # An unescaped pipe would end the cell early; a line break would end the row.
    text = text.replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def build_multi_csv_markdown(result: dict[str, Any]) -> str:
    """Render a multi-CSV analysis result as a Markdown report.

    Args:
        result: A result dict shaped like the return value of
            ``analyze_multiple_csv``. The keys read here are ``question``,
            ``file_count``, ``total_row_count``, ``shared_columns``,
            ``files`` and ``code_guidance``.

    Returns:
        The report text: an overview list, one section with a column table
        per file, and a closing code-guide section with two fenced blocks.
    """
    shared = result["shared_columns"]
    lines = [
        "# 다중 CSV 분석 리포트",
        "",
        f"- 질문: {result['question']}",
        f"- 파일 수: {result['file_count']}",
        f"- 전체 행 수: {result['total_row_count']}",
        f"- 공통 컬럼: {', '.join(shared) if shared else '(없음)'}",
        "",
    ]

    for file_info in result["files"]:
        summary = file_info["summary"]
        lines.extend(
            [
                f"## 파일: {file_info['path']}",
                "",
                f"- 행 수: {summary['row_count']}",
                f"- 열 수: {summary['column_count']}",
                "",
                "| 컬럼 | 타입 | 결측비율 | 고유비율 |",
                "|---|---|---:|---:|",
            ]
        )
        for col in summary["columns"]:
            prof = file_info["column_profiles"][col]
            # Column names come straight from CSV headers, so escape them
            # before placing them in a table row.
            lines.append(
                f"| {_escape_table_cell(col)} | {_escape_table_cell(prof['dtype'])} "
                f"| {prof['missing_ratio']:.4f} | {prof['unique_ratio']:.4f} |"
            )
        lines.append("")

    guidance = result["code_guidance"]
    lines.extend(
        [
            "## 코드 가이드",
            "",
            "```text",
            guidance["recommended_steps"],
            "```",
            "",
            "```python",
            guidance["pandas_example"],
            "```",
        ]
    )

    return "\n".join(lines)


def result_to_json(result: dict[str, Any]) -> str:
    """Serialize an analysis result as two-space-indented JSON.

    Non-ASCII text (for example Korean column names) is written as-is
    rather than as escape sequences.
    """
    serialized = json.dumps(result, indent=2, ensure_ascii=False)
    return serialized
16 changes: 16 additions & 0 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
summarize_rows,
build_markdown_report,
)
from bitnet_tools.multi_csv import analyze_multiple_csv, build_multi_csv_markdown



def test_summarize_rows_basic():
Expand Down Expand Up @@ -59,3 +61,17 @@ def test_build_markdown_report():
assert "# BitNet CSV 분석 보고서" in report
assert "| a |" in report
assert "테스트 질문" in report


def test_multi_csv_report_builder(tmp_path):
    """Two CSVs sharing only ``city`` yield a two-file result and a titled report."""
    fixtures = {
        "a.csv": "city,v\nseoul,1\n",
        "b.csv": "city,v2\nseoul,2\n",
    }
    csv_files = []
    for filename, body in fixtures.items():
        target = tmp_path / filename
        target.write_text(body, encoding="utf-8")
        csv_files.append(target)

    analysis = analyze_multiple_csv(csv_files, "비교")
    markdown = build_multi_csv_markdown(analysis)

    assert analysis["file_count"] == 2
    assert "city" in analysis["shared_columns"]
    assert "다중 CSV 분석 리포트" in markdown
27 changes: 27 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,30 @@ def test_cli_report_mode(tmp_path):
assert code == 0
assert out_path.exists()
assert "BitNet CSV 분석 보고서" in out_path.read_text(encoding="utf-8")


def test_cli_multi_analyze_mode(tmp_path):
    """``multi-analyze`` exits with 0 and writes both the JSON and Markdown outputs."""
    first_csv = tmp_path / "a.csv"
    second_csv = tmp_path / "b.csv"
    first_csv.write_text("city,val\nseoul,1\nbusan,2\n", encoding="utf-8")
    second_csv.write_text("city,val2\nseoul,10\ndaegu,20\n", encoding="utf-8")

    json_target = tmp_path / "out.json"
    report_target = tmp_path / "out.md"

    argv = ["multi-analyze", str(first_csv), str(second_csv)]
    argv += ["--question", "다중 비교"]
    argv += ["--out-json", str(json_target)]
    argv += ["--out-report", str(report_target)]

    exit_code = cli.main(argv)

    assert exit_code == 0
    assert json_target.exists()
    assert report_target.exists()
    report_text = report_target.read_text(encoding="utf-8")
    assert "다중 CSV 분석 리포트" in report_text