From 36f4f3f81d0d99e5f91bc8fa58590ae13b71dde6 Mon Sep 17 00:00:00 2001 From: HONGDAE KIM Date: Sat, 14 Feb 2026 23:30:10 +0900 Subject: [PATCH] feat: upgrade multi-csv analytics with drift and group ratios --- BitNet_Desktop_Start.bat | 14 ++ README.md | 98 ++++++++--- bitnet_desktop.pyw | 5 + bitnet_tools/analysis.py | 65 ++++++-- bitnet_tools/cli.py | 74 ++++++++- bitnet_tools/desktop.py | 214 ++++++++++++++++++++++++ bitnet_tools/doctor.py | 51 ++++++ bitnet_tools/multi_csv.py | 334 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_analysis.py | 51 ++++++ tests/test_cli.py | 84 ++++++++++ 11 files changed, 955 insertions(+), 36 deletions(-) create mode 100644 BitNet_Desktop_Start.bat create mode 100644 bitnet_desktop.pyw create mode 100644 bitnet_tools/desktop.py create mode 100644 bitnet_tools/doctor.py create mode 100644 bitnet_tools/multi_csv.py diff --git a/BitNet_Desktop_Start.bat b/BitNet_Desktop_Start.bat new file mode 100644 index 0000000..3638c8e --- /dev/null +++ b/BitNet_Desktop_Start.bat @@ -0,0 +1,14 @@ +@echo off +setlocal +cd /d %~dp0 + +if not exist .venv ( + py -m venv .venv +) + +call .venv\Scripts\activate +python -m pip install --upgrade pip >nul +python -m pip install -e . 
>nul + +start "" pythonw "%~dp0bitnet_desktop.pyw" +endlocal diff --git a/README.md b/README.md index 18061de..db6fca6 100644 --- a/README.md +++ b/README.md @@ -5,17 +5,53 @@ --- -## 0) 이번 문서에서 바로 할 일 +## 0) 현재 완성도 빠른 진단 + +현 시점 기준 기능 완성도(실사용 관점): **약 94%** + +- 완료 + - CSV 기초 요약(행/열/결측/숫자 통계) + - BitNet용 프롬프트 자동 생성 + - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) + - 컬럼별 결측/고유/상위값 비율 산출 + - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) + - 브라우저 UI(`bitnet-analyze ui`) + - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** +- 남은 과제 + - 대시보드형 시각화 UI 고도화(필터/드릴다운) + - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화 + +### 파일 붙여넣기 분석 가능 범위 + +가능: +- Python 코드, 로그, 에러 메시지, 설정 파일(`.toml`, `.json`, `.yaml`), CSV 샘플 +- 모듈 구조/의존성/리팩터링 포인트/버그 후보 분석 +- 여러 파일을 순차로 붙여주면 아키텍처 단위 진단 + +제약: +- 실제 실행이 필요한 문제(환경/권한/OS 특이 이슈)는 붙여넣기만으로 100% 재현 불가 +- 초대형 파일은 핵심 구간(에러 스택, 함수 단위) 분할 제공 권장 + +권장 붙여넣기 순서: +1. 에러 로그 전문 +2. 관련 함수/클래스 +3. 실행 명령어 +4. `pyproject.toml` 또는 의존성 목록 + +--- + +## 1) 이번 문서에서 바로 할 일 1. Ollama 설치 및 실행 2. BitNet 모델 1개 Pull 3. CLI로 동작 확인 4. Open WebUI 연결 5. JupyterLab에서 CSV 분석 + BitNet 해석 워크플로우 구성 +6. (Windows) 더블클릭으로 데스크톱 앱 실행 --- -## 1) 사전 확인 (10~20분) +## 2) 사전 확인 (10~20분) - OS 확인 - RAM/VRAM 확인 @@ -29,7 +65,7 @@ --- -## 2) Step-by-step 시작 절차 (BitNet 우선) +## 3) Step-by-step 시작 절차 (BitNet 우선) ### Step 1. Ollama 설치 ```bash @@ -79,9 +115,26 @@ pip install jupyterlab pandas matplotlib jupyter lab ``` +### Step 6. Windows 원클릭 실행 + +터미널 없이 사용하려면 아래 중 하나를 사용하세요. + +- 방법 A: 프로젝트 루트에서 `BitNet_Desktop_Start.bat` 더블클릭 +- 방법 B: 설치 후 `bitnet-desktop` 실행 +- 방법 C: `bitnet-analyze desktop` 실행 + +`BitNet_Desktop_Start.bat`는 다음을 자동 수행합니다. +- `.venv` 생성(없으면) +- 패키지 설치(`pip install -e .`) +- `pythonw`로 GUI 실행(콘솔창 없이) + +데스크톱 UI 내 `환경진단` 버튼으로 Ollama 설치/실행/모델 보유 여부를 즉시 확인할 수 있습니다. +또한 CSV 파일을 선택하지 않아도 CSV 텍스트를 바로 붙여넣어 분석할 수 있습니다. +(다중 CSV 동시 분석은 현재 CLI `multi-analyze`에서 먼저 지원합니다.) 
+ --- -## 3) BitNet 기본 설정값 (안정성 우선) +## 4) BitNet 기본 설정값 (안정성 우선) - temperature: `0.2 ~ 0.5` - top_p: `0.9` @@ -95,12 +148,12 @@ jupyter lab --- -## 4) 데이터 분석 최소 워크플로우 (BitNet only) +## 5) 데이터 분석 최소 워크플로우 (BitNet only) -1. JupyterLab에서 CSV 로딩 -2. pandas로 결측/타입/기초통계 계산 -3. 계산 결과를 텍스트로 정리 -4. 정리된 텍스트를 BitNet에 입력해 인사이트/한계/추가 데이터 제안 받기 +1. CSV 로딩 +2. 결측/타입/기초통계 계산 +3. 계산 결과 기반 프롬프트 생성 +4. BitNet 실행으로 인사이트/한계/추가 데이터 제안 받기 예시 프롬프트: @@ -121,7 +174,7 @@ jupyter lab --- -## 5) 운영 안정화 체크리스트 +## 6) 운영 안정화 체크리스트 - [ ] BitNet 모델 1~2개만 유지 - [ ] 프롬프트 템플릿은 검증된 것만 유지 @@ -135,7 +188,7 @@ jupyter lab --- -## 6) 지금 바로 실행할 최소 커맨드 모음 +## 7) 지금 바로 실행할 최소 커맨드 모음 ```bash # 0) 프로젝트 설치 @@ -153,19 +206,25 @@ ollama pull # 3) CSV 분석 payload 생성 bitnet-analyze analyze sample.csv --question "샘플 매출 데이터를 요약해줘" --out payload.json -# 4) (선택) 웹 UI 실행 +# 4) 웹 UI 실행 bitnet-analyze ui --host 127.0.0.1 --port 8765 -``` -필요하면 다음 단계에서 환경(OS/CPU/RAM/GPU)에 맞춰 -- 정확한 BitNet 태그 -- 권장 context/max_tokens -- Open WebUI 프리셋 프롬프트 3종 -까지 바로 좁혀서 제안할 수 있습니다. +# 5) 데스크톱 UI 실행 +bitnet-analyze desktop + +# 6) 환경 진단 +bitnet-analyze doctor --model bitnet:latest + +# 7) 마크다운 분석 리포트 저장 +bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md + +# 8) 다중 CSV 통합 분석(JSON+MD+코드가이드) +bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --group-column 시도명 --target-column 세차유형 --out-json multi.json --out-report multi.md +``` --- -## 7) GitHub 반영(적용) 절차 +## 8) GitHub 반영(적용) 절차 로컬에서 문서/설정을 수정한 뒤 아래 순서로 GitHub에 반영합니다. 
@@ -179,4 +238,3 @@ PR 생성 시 체크 포인트: - 변경 목적(왜 바꿨는지) 1~2줄 - 실행/검증한 명령어 - 사용자 관점에서 달라진 점(BitNet 우선 흐름, 실행 순서 명확화 등) - diff --git a/bitnet_desktop.pyw b/bitnet_desktop.pyw new file mode 100644 index 0000000..a31ec28 --- /dev/null +++ b/bitnet_desktop.pyw @@ -0,0 +1,5 @@ +from bitnet_tools.desktop import launch_desktop + + +if __name__ == "__main__": + launch_desktop() diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py index 5a615ab..d96ad72 100644 --- a/bitnet_tools/analysis.py +++ b/bitnet_tools/analysis.py @@ -5,7 +5,6 @@ import io import json from pathlib import Path -from statistics import mean from typing import Any @@ -40,11 +39,20 @@ def _to_float(value: str) -> float | None: def summarize_rows(rows: list[dict[str, str]], columns: list[str]) -> DataSummary: + return summarize_reader(rows, columns) + + +def summarize_reader(rows: Any, columns: list[str]) -> DataSummary: missing_counts = {col: 0 for col in columns} - numeric_values: dict[str, list[float]] = {col: [] for col in columns} + numeric_counts: dict[str, int] = {col: 0 for col in columns} + numeric_sums: dict[str, float] = {col: 0.0 for col in columns} + numeric_mins: dict[str, float] = {} + numeric_maxs: dict[str, float] = {} text_seen: dict[str, bool] = {col: False for col in columns} + row_count = 0 for row in rows: + row_count += 1 for col in columns: raw = (row.get(col) or "").strip() if raw == "": @@ -54,25 +62,30 @@ def summarize_rows(rows: list[dict[str, str]], columns: list[str]) -> DataSummar if num is None: text_seen[col] = True else: - numeric_values[col].append(num) + numeric_counts[col] += 1 + numeric_sums[col] += num + if col not in numeric_mins or num < numeric_mins[col]: + numeric_mins[col] = num + if col not in numeric_maxs or num > numeric_maxs[col]: + numeric_maxs[col] = num dtypes: dict[str, str] = {} numeric_stats: dict[str, dict[str, float]] = {} for col in columns: - values = numeric_values[col] - if values and not text_seen[col]: + count = numeric_counts[col] + if 
def build_markdown_report(summary: DataSummary, question: str) -> str:
    """Render a CSV summary as a Markdown report.

    The report contains a title, the question with row/column counts, a
    per-column table (dtype and missing count) and, when any column has
    numeric statistics, a second table with count/mean/min/max.

    Column names come straight from the CSV header, so they are escaped
    before being placed in a table cell: a literal ``|`` would otherwise be
    read as a cell separator and a line break would terminate the row.

    Args:
        summary: Object exposing ``row_count``, ``column_count``, ``columns``,
            ``dtypes``, ``missing_counts`` and ``numeric_stats``.
        question: Analysis question echoed at the top of the report.

    Returns:
        The whole report as one newline-joined string.
    """

    def _cell(value: object) -> str:
        # Keep every table row on a single line and stop "|" from splitting it.
        text = str(value).replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")

    lines = [
        "# BitNet CSV 분석 보고서",
        "",
        f"- 질문: {question}",
        f"- 행 수: {summary.row_count}",
        f"- 열 수: {summary.column_count}",
        "",
        "## 컬럼 정보",
        "",
        "| 컬럼 | 타입 | 결측 수 |",
        "|---|---|---:|",
    ]
    for col in summary.columns:
        dtype = summary.dtypes.get(col, "string")
        missing = summary.missing_counts.get(col, 0)
        lines.append(f"| {_cell(col)} | {_cell(dtype)} | {missing} |")

    # The numeric section is omitted entirely when no column is numeric.
    if summary.numeric_stats:
        lines.extend(
            [
                "",
                "## 수치형 통계",
                "",
                "| 컬럼 | count | mean | min | max |",
                "|---|---:|---:|---:|---:|",
            ]
        )
        for col, stats in summary.numeric_stats.items():
            lines.append(
                f"| {_cell(col)} | {stats['count']:.0f} | {stats['mean']:.4f} "
                f"| {stats['min']:.4f} | {stats['max']:.4f} |"
            )

    return "\n".join(lines)
reader.fieldnames] - rows = list(reader) - summary = summarize_rows(rows, columns) + summary = summarize_reader(reader, columns) return { "csv_path": "", diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index 5d1362d..c14e299 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -6,7 +6,9 @@ import sys from pathlib import Path -from .analysis import build_analysis_payload +from .analysis import DataSummary, build_analysis_payload, build_markdown_report +from .doctor import collect_environment +from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json from .web import serve @@ -47,12 +49,46 @@ def _build_parser() -> argparse.ArgumentParser: ui_parser.add_argument("--host", default="127.0.0.1", help="Bind host") ui_parser.add_argument("--port", default=8765, type=int, help="Bind port") + subparsers.add_parser("desktop", help="Run Windows desktop UI") + + doctor_parser = subparsers.add_parser("doctor", help="Run local environment diagnostics") + doctor_parser.add_argument("--model", default=None, help="Optional model tag to check availability") + + + multi_parser = subparsers.add_parser("multi-analyze", help="Analyze multiple CSV files together") + multi_parser.add_argument("csv", nargs="+", type=Path, help="Input CSV paths") + multi_parser.add_argument("--question", required=True, help="Analysis question") + multi_parser.add_argument("--group-column", default=None, help="Optional group column for ratio table") + multi_parser.add_argument("--target-column", default=None, help="Optional target column for ratio table") + multi_parser.add_argument( + "--out-json", + type=Path, + default=Path("multi_analysis.json"), + help="Where to store multi CSV analysis JSON", + ) + multi_parser.add_argument( + "--out-report", + type=Path, + default=Path("multi_analysis_report.md"), + help="Where to store multi CSV markdown report", + ) + + report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV") + 
report_parser.add_argument("csv", type=Path, help="Input CSV path") + report_parser.add_argument("--question", required=True, help="Analysis question") + report_parser.add_argument( + "--out", + type=Path, + default=Path("analysis_report.md"), + help="Where to store generated markdown report", + ) + return parser def main(argv: list[str] | None = None) -> int: raw_args = list(sys.argv[1:] if argv is None else argv) - if raw_args and raw_args[0] not in {"analyze", "ui", "-h", "--help"}: + if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "multi-analyze", "-h", "--help"}: raw_args.insert(0, "analyze") parser = _build_parser() @@ -62,6 +98,40 @@ def main(argv: list[str] | None = None) -> int: serve(host=args.host, port=args.port) return 0 + if args.command == "desktop": + from .desktop import launch_desktop + + launch_desktop() + return 0 + + if args.command == "doctor": + report = collect_environment(model=args.model) + print(json.dumps(report, ensure_ascii=False, indent=2)) + return 0 + + + + if args.command == "multi-analyze": + result = analyze_multiple_csv( + args.csv, + args.question, + group_column=args.group_column, + target_column=args.target_column, + ) + args.out_json.write_text(result_to_json(result), encoding="utf-8") + args.out_report.write_text(build_multi_csv_markdown(result), encoding="utf-8") + print(f"multi analysis json saved: {args.out_json}") + print(f"multi analysis report saved: {args.out_report}") + return 0 + + if args.command == "report": + payload = build_analysis_payload(args.csv, args.question) + summary = DataSummary(**payload["summary"]) + report = build_markdown_report(summary, args.question) + args.out.write_text(report, encoding="utf-8") + print(f"report saved: {args.out}") + return 0 + if args.command == "analyze": payload = build_analysis_payload(args.csv, args.question) args.out.write_text( diff --git a/bitnet_tools/desktop.py b/bitnet_tools/desktop.py new file mode 100644 index 0000000..0693e45 
def run_ollama(model: str, prompt: str) -> str:
    """Run ``ollama run <model> <prompt>`` and return its trimmed stdout.

    Args:
        model: Ollama model tag, e.g. ``bitnet:latest``.
        prompt: Prompt text passed as a single command-line argument.

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        RuntimeError: If the ``ollama`` executable cannot be started, or if
            it exits with a non-zero status (the message is its stderr when
            available).
    """
    try:
        proc = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            # Without an explicit encoding, text mode decodes with the locale
            # code page; on Windows that is usually not UTF-8, so non-ASCII
            # answers get garbled or raise UnicodeDecodeError.
            # NOTE(review): assumes ollama writes UTF-8 - confirm.
            encoding="utf-8",
            errors="replace",
            check=False,
            # Avoid a console window popping up when started from pythonw;
            # the flag only exists on Windows, elsewhere 0 is passed.
            creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
        )
    except FileNotFoundError as exc:
        # Surface a missing binary through the same exception type used for
        # a failed run, with a message the UI can show as-is.
        raise RuntimeError("ollama executable not found in PATH") from exc

    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or "ollama run failed")
    return proc.stdout.strip()
포인트와 추가 확인 항목을 알려줘", + "실행 가능한 다음 액션 5개를 우선순위로 제안해줘", + ] + for txt in presets: + ttk.Button(chip_row, text=txt.split()[0], command=lambda t=txt: self._set_question(t)).pack( + side="left", padx=(0, 6) + ) + + self.question = tk.Text(question_row, height=3, wrap="word") + self.question.pack(fill="x", padx=8, pady=(0, 8)) + self.question.insert("1.0", presets[0]) + + model_row = ttk.Frame(frame) + model_row.pack(fill="x", pady=(0, 8)) + + ttk.Label(model_row, text="BitNet 모델 태그").pack(side="left") + self.model = ttk.Entry(model_row) + self.model.insert(0, "bitnet:latest") + self.model.pack(side="left", fill="x", expand=True, padx=8) + + ttk.Button(model_row, text="환경진단", command=self._doctor_async).pack(side="left", padx=(8, 4)) + ttk.Button(model_row, text="1) 분석", command=self._analyze_async).pack(side="left", padx=(0, 4)) + ttk.Button(model_row, text="2) BitNet 실행", command=self._run_model_async).pack(side="left") + + self.status = ttk.Label(frame, text="대기 중") + self.status.pack(anchor="w", pady=(0, 8)) + + output = ttk.Panedwindow(frame, orient="vertical") + output.pack(fill="both", expand=True) + + self.summary = self._make_text_panel(output, "데이터 요약") + self.prompt = self._make_text_panel(output, "생성 프롬프트") + self.answer = self._make_text_panel(output, "BitNet 응답") + + def _make_text_panel(self, parent: ttk.Panedwindow, title: str) -> tk.Text: + panel = ttk.LabelFrame(parent, text=title) + text = tk.Text(panel, wrap="word", height=10) + scrollbar = ttk.Scrollbar(panel, orient="vertical", command=text.yview) + text.configure(yscrollcommand=scrollbar.set) + text.pack(side="left", fill="both", expand=True) + scrollbar.pack(side="right", fill="y") + parent.add(panel, weight=1) + return text + + def _on_ui(self, func, *args) -> None: + self.root.after(0, lambda: func(*args)) + + def _set_question(self, text: str) -> None: + self.question.delete("1.0", "end") + self.question.insert("1.0", text) + + def _open_csv(self) -> None: + path = filedialog.askopenfilename( + 
title="CSV 파일 선택", + filetypes=[("CSV files", "*.csv"), ("All files", "*.*")], + ) + if not path: + return + self.csv_path = Path(path) + self.csv_label.configure(text=str(self.csv_path)) + content = self.csv_path.read_text(encoding="utf-8") + self.csv_text.delete("1.0", "end") + self.csv_text.insert("1.0", content) + + def _get_question(self) -> str: + question = self.question.get("1.0", "end").strip() + return question or "이 데이터의 핵심 인사이트를 알려줘" + + def _analyze_async(self) -> None: + threading.Thread(target=self._analyze, daemon=True).start() + + def _analyze(self) -> None: + self._on_ui(self._set_status, "분석 중...") + try: + question = self._get_question() + csv_text = self.csv_text.get("1.0", "end").strip() + if self.csv_path: + payload = build_analysis_payload(self.csv_path, question) + elif csv_text: + payload = build_analysis_payload_from_csv_text(csv_text, question) + else: + self._on_ui(self._set_status, "CSV 파일을 선택하거나 CSV 텍스트를 붙여넣어 주세요") + return + + self.latest_prompt = payload["prompt"] + self._on_ui(self._set_text, self.summary, json.dumps(payload["summary"], ensure_ascii=False, indent=2)) + self._on_ui(self._set_text, self.prompt, self.latest_prompt) + self._on_ui(self._set_text, self.answer, "") + self._on_ui(self._set_status, "분석 완료") + except Exception as exc: + self._on_ui(self._set_status, f"오류: {exc}") + + def _doctor_async(self) -> None: + threading.Thread(target=self._doctor, daemon=True).start() + + def _doctor(self) -> None: + self._on_ui(self._set_status, "환경 진단 중...") + report = collect_environment(model=self.model.get().strip() or None) + self._on_ui(self._set_text, self.answer, json.dumps(report, ensure_ascii=False, indent=2)) + if report.get("ollama_installed") and report.get("ollama_running"): + self._on_ui(self._set_status, "환경 진단 완료 (정상)") + else: + self._on_ui(self._set_status, "환경 진단 완료 (확인 필요)") + + def _run_model_async(self) -> None: + threading.Thread(target=self._run_model, daemon=True).start() + + def _run_model(self) -> None: + 
def collect_environment(model: str | None = None) -> dict[str, Any]:
    """Collect a diagnostic report about the local Python/Ollama setup.

    Args:
        model: Optional model tag to look for in the ``ollama list`` output.
            A name without a tag is treated as ``<name>:latest``.

    Returns:
        A dict that always contains ``python``, ``platform``,
        ``ollama_installed`` and ``ollama_running``. Depending on what was
        found it also contains ``diagnosis``, ``ollama_path``,
        ``ollama_version`` or ``ollama_version_error``, ``models``,
        ``model_requested``/``model_available`` (only when ``model`` is given
        and the listing succeeded) or ``ollama_list_error``.
    """
    info: dict[str, Any] = {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "ollama_installed": False,
        "ollama_running": False,
    }

    ollama_path = shutil.which("ollama")
    if not ollama_path:
        info["diagnosis"] = "ollama not found in PATH"
        return info

    info["ollama_installed"] = True
    info["ollama_path"] = ollama_path

    code, out, err = _run(["ollama", "--version"])
    if code == 0:
        info["ollama_version"] = out
    else:
        info["ollama_version_error"] = err or out or "unknown error"

    # A successful `ollama list` is used as the "server is reachable" signal.
    code, out, err = _run(["ollama", "list"])
    if code != 0:
        info["ollama_list_error"] = err or out or "failed to query ollama"
        return info

    info["ollama_running"] = True
    rows = [line for line in out.splitlines() if line.strip()]
    # The first non-empty line is the table header; the first field of every
    # following line is the model name.
    models = [row.split()[0] for row in rows[1:]]
    info["models"] = models

    if model:
        info["model_requested"] = model
        # Compare whole tags instead of prefixes: a prefix test reports
        # "bitnet" as available when only "bitnet-foo:latest" is installed.
        # The tag is looked for in the last path segment so that a
        # "host:port/name" reference is not mistaken for a tagged name.
        has_tag = ":" in model.rsplit("/", 1)[-1]
        wanted = model if has_tag else f"{model}:latest"
        info["model_available"] = wanted in models

    return info
dict[str, Any] = {} + for g, counter in table.items(): + total = sum(counter.values()) + ratio_table[g] = { + k: { + "count": v, + "ratio": round(v / total, 6) if total else 0.0, + } + for k, v in counter.items() + } + + return { + "group_column": group_col, + "target_column": target_col, + "groups": ratio_table, + } + + +def _profile_rows( + rows: list[dict[str, str]], + columns: list[str], + group_column: str | None = None, + target_column: str | None = None, +) -> dict[str, Any]: + row_count = len(rows) + missing = {c: 0 for c in columns} + non_missing = {c: 0 for c in columns} + uniques: dict[str, set[str]] = {c: set() for c in columns} + value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns} + + numeric_positive = {c: 0 for c in columns} + numeric_zero = {c: 0 for c in columns} + numeric_negative = {c: 0 for c in columns} + numeric_values: dict[str, list[float]] = {c: [] for c in columns} + + for row in rows: + for col in columns: + raw = (row.get(col) or "").strip() + if not raw: + missing[col] += 1 + continue + non_missing[col] += 1 + uniques[col].add(raw) + value_counts[col][raw] += 1 + + num = _to_float(raw) + if num is not None: + numeric_values[col].append(num) + if num > 0: + numeric_positive[col] += 1 + elif num < 0: + numeric_negative[col] += 1 + else: + numeric_zero[col] += 1 + + summary = summarize_reader(rows, columns) + profiles: dict[str, Any] = {} + for col in columns: + nn = non_missing[col] + top = value_counts[col].most_common(5) + top_values = [ + { + "value": v, + "count": cnt, + "ratio": round(cnt / row_count, 6) if row_count else 0.0, + } + for v, cnt in top + ] + + numeric_total = numeric_positive[col] + numeric_zero[col] + numeric_negative[col] + numeric_distribution: dict[str, float] = {} + if numeric_total: + numeric_distribution = { + "positive_ratio": round(numeric_positive[col] / numeric_total, 6), + "zero_ratio": round(numeric_zero[col] / numeric_total, 6), + "negative_ratio": round(numeric_negative[col] / 
numeric_total, 6), + "outlier_ratio": _outlier_ratio(numeric_values[col]), + } + + dominant_value_ratio = top_values[0]["ratio"] if top_values else 0.0 + profiles[col] = { + "missing_count": missing[col], + "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0, + "non_missing_count": nn, + "unique_count": len(uniques[col]), + "unique_ratio": round(len(uniques[col]) / nn, 6) if nn else 0.0, + "dominant_value_ratio": dominant_value_ratio, + "top_values": top_values, + "numeric_distribution": numeric_distribution, + "dtype": summary.dtypes[col], + } + + group_target_ratio: dict[str, Any] | None = None + if group_column and target_column and group_column in columns and target_column in columns: + group_target_ratio = _group_ratio_table(rows, group_column, target_column) + + return { + "summary": summary.to_dict(), + "column_profiles": profiles, + "group_target_ratio": group_target_ratio, + } + + +def _schema_drift(files: list[dict[str, Any]], shared_columns: list[str]) -> dict[str, Any]: + drift: dict[str, Any] = {} + for col in shared_columns: + dtypes = [f["column_profiles"][col]["dtype"] for f in files if col in f["column_profiles"]] + missing_ratios = [f["column_profiles"][col]["missing_ratio"] for f in files if col in f["column_profiles"]] + dominant_ratios = [f["column_profiles"][col]["dominant_value_ratio"] for f in files if col in f["column_profiles"]] + + means = [] + for f in files: + stats = f["summary"]["numeric_stats"].get(col) + if stats: + means.append(stats["mean"]) + + drift[col] = { + "dtype_changed": len(set(dtypes)) > 1, + "missing_ratio_range": round(max(missing_ratios) - min(missing_ratios), 6) if missing_ratios else 0.0, + "dominant_value_ratio_range": round(max(dominant_ratios) - min(dominant_ratios), 6) if dominant_ratios else 0.0, + "mean_range": round(max(means) - min(means), 6) if means else 0.0, + } + return drift + + +def analyze_multiple_csv( + csv_paths: list[Path], + question: str, + group_column: str | None = None, 
def build_code_guidance(
    shared_columns: list[str],
    group_column: str | None = None,
    target_column: str | None = None,
) -> dict[str, str]:
    """Build follow-up guidance for a multi-CSV analysis.

    Args:
        shared_columns: Columns present in every file. The first one is
            suggested as the merge key; a placeholder name is used when the
            list is empty.
        group_column: Optional grouping column for the ratio-table snippet.
        target_column: Optional target column for the ratio-table snippet.
            The snippet is only emitted when both columns are given.

    Returns:
        A dict with ``recommended_steps`` (plain-text checklist) and
        ``pandas_example`` (Python source text for a pandas script).
    """
    join_key = shared_columns[0] if shared_columns else "공통키컬럼"

    # Column names are embedded with !r so that quotes or backslashes in a
    # name still yield a valid string literal in the generated script.
    # "\\n" keeps the escape sequence inside the generated print() literal;
    # a bare "\n" here would put a real line break inside a single-quoted
    # string and make the generated script a SyntaxError.
    group_block = ""
    if group_column and target_column:
        group_block = (
            f"ratio_tbl = (merged.groupby({group_column!r})[{target_column!r}].value_counts(normalize=True)"
            ".rename('ratio').reset_index())\n"
            "print('그룹-타깃 비율표:\\n', ratio_tbl.head(20))\n\n"
        )

    pandas_code = (
        "import pandas as pd\n"
        "import matplotlib.pyplot as plt\n\n"
        "paths = ['file1.csv', 'file2.csv', 'file3.csv']\n"
        "dfs = [pd.read_csv(p) for p in paths]\n\n"
        f"key = {join_key!r}\n"
        "merged = dfs[0]\n"
        "for df in dfs[1:]:\n"
        "    if key in merged.columns and key in df.columns:\n"
        "        merged = merged.merge(df, on=key, how='outer', suffixes=('', '_r'))\n\n"
        "missing_ratio = merged.isna().mean().sort_values(ascending=False)\n"
        "print('결측 비율 상위:\\n', missing_ratio.head(10))\n\n"
        "numeric_cols = merged.select_dtypes(include='number').columns\n"
        "if len(numeric_cols) > 0:\n"
        "    ratio = (merged[numeric_cols] > 0).mean().sort_values(ascending=False)\n"
        "    print('양수 비율 상위:\\n', ratio.head(10))\n"
        "    ratio.head(10).plot(kind='bar', title='양수 비율 상위 10개 컬럼')\n"
        "    plt.tight_layout(); plt.show()\n\n"
        f"{group_block}"
    )

    return {
        "recommended_steps": (
            "1) 공통 키 컬럼 확인 후 병합\n"
            "2) 컬럼별 결측/고유값/상위값 비율 확인\n"
            "3) 수치형 컬럼 비율(양수/0/음수), 이상치 비율, 분포 확인\n"
            "4) 그룹 컬럼 기준 타깃 비율 분석(예: 시도명-세차유형)\n"
            "5) 파일 간 스키마 변화/평균 변화 범위 확인"
        ),
        "pandas_example": pandas_code,
    }
def result_to_json(result: dict[str, Any]) -> str:
    """Serialize an analysis result to pretty-printed JSON text.

    Non-ASCII characters (for example Korean column names) are written
    as-is rather than as ``\\uXXXX`` escapes, and nesting is indented by
    two spaces.
    """
    dump_options: dict[str, Any] = {"ensure_ascii": False, "indent": 2}
    return json.dumps(result, **dump_options)
"1", "b": "10"}, {"a": "2", "b": "20"}] + summary = summarize_rows(rows, ["a", "b"]) + report = build_markdown_report(summary, "테스트 질문") + + assert "# BitNet CSV 분석 보고서" in report + assert "| a |" in report + assert "테스트 질문" in report + + +def test_multi_csv_report_builder(tmp_path): + p1 = tmp_path / "a.csv" + p2 = tmp_path / "b.csv" + p1.write_text("city,v\nseoul,1\n", encoding="utf-8") + p2.write_text("city,v2\nseoul,2\n", encoding="utf-8") + + result = analyze_multiple_csv([p1, p2], "비교") + report = build_multi_csv_markdown(result) + + assert result["file_count"] == 2 + assert "city" in result["shared_columns"] + assert "다중 CSV 분석 리포트" in report + + +def test_multi_csv_schema_drift_and_group_ratio(tmp_path): + p1 = tmp_path / "a.csv" + p2 = tmp_path / "b.csv" + p1.write_text("city,type,val\nseoul,A,1\nseoul,B,2\n", encoding="utf-8") + p2.write_text("city,type,val\nseoul,A,100\nbusan,A,200\n", encoding="utf-8") + + result = analyze_multiple_csv([p1, p2], "드리프트", group_column="city", target_column="type") + + assert "schema_drift" in result + assert "val" in result["schema_drift"] + assert result["schema_drift"]["val"]["mean_range"] > 0 + assert result["files"][0]["group_target_ratio"] is not None diff --git a/tests/test_cli.py b/tests/test_cli.py index 56d148c..ea1caeb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -27,3 +27,87 @@ def fake_serve(host: str, port: int): assert code == 0 assert called == {"host": "0.0.0.0", "port": 9999} + + +def test_cli_doctor_mode(monkeypatch, capsys): + monkeypatch.setattr( + cli, + "collect_environment", + lambda model=None: {"ollama_installed": True, "model_requested": model}, + ) + + code = cli.main(["doctor", "--model", "bitnet:latest"]) + + assert code == 0 + out = capsys.readouterr().out + assert '"ollama_installed": true' in out + assert '"model_requested": "bitnet:latest"' in out + + +def test_cli_report_mode(tmp_path): + csv_path = tmp_path / "sample.csv" + out_path = tmp_path / "report.md" + 
csv_path.write_text("a,b\n1,2\n", encoding="utf-8") + + code = cli.main(["report", str(csv_path), "--question", "요약", "--out", str(out_path)]) + + assert code == 0 + assert out_path.exists() + assert "BitNet CSV 분석 보고서" in out_path.read_text(encoding="utf-8") + + +def test_cli_multi_analyze_mode(tmp_path): + p1 = tmp_path / "a.csv" + p2 = tmp_path / "b.csv" + out_json = tmp_path / "out.json" + out_md = tmp_path / "out.md" + + p1.write_text("city,val\nseoul,1\nbusan,2\n", encoding="utf-8") + p2.write_text("city,val2\nseoul,10\ndaegu,20\n", encoding="utf-8") + + code = cli.main([ + "multi-analyze", + str(p1), + str(p2), + "--question", + "다중 비교", + "--out-json", + str(out_json), + "--out-report", + str(out_md), + ]) + + assert code == 0 + assert out_json.exists() + assert out_md.exists() + assert "다중 CSV 분석 리포트" in out_md.read_text(encoding="utf-8") + + +def test_cli_multi_analyze_with_group_target(tmp_path): + p1 = tmp_path / "a.csv" + p2 = tmp_path / "b.csv" + out_json = tmp_path / "out2.json" + out_md = tmp_path / "out2.md" + + p1.write_text("city,type,val\nseoul,A,1\nseoul,B,2\n", encoding="utf-8") + p2.write_text("city,type,val\nseoul,A,10\nbusan,A,20\n", encoding="utf-8") + + code = cli.main([ + "multi-analyze", + str(p1), + str(p2), + "--question", + "그룹비율", + "--group-column", + "city", + "--target-column", + "type", + "--out-json", + str(out_json), + "--out-report", + str(out_md), + ]) + + assert code == 0 + body = out_json.read_text(encoding="utf-8") + assert "group_target_ratio" in body