From 606a2176f537021097356f04c4be27ccaf52f871 Mon Sep 17 00:00:00 2001 From: HONGDAE KIM Date: Sun, 15 Feb 2026 00:24:54 +0900 Subject: [PATCH] perf: stream chart sampling for large csv visualization --- BitNet_Desktop_Start.bat | 14 ++ README.md | 111 +++++++-- bitnet_desktop.pyw | 5 + bitnet_tools/analysis.py | 65 +++-- bitnet_tools/cli.py | 89 ++++++- bitnet_tools/desktop.py | 214 +++++++++++++++++ bitnet_tools/doctor.py | 51 ++++ bitnet_tools/multi_csv.py | 477 +++++++++++++++++++++++++++++++++++++ bitnet_tools/ui/app.js | 36 +++ bitnet_tools/ui/index.html | 13 +- bitnet_tools/ui/styles.css | 17 ++ bitnet_tools/visualize.py | 214 +++++++++++++++++ pyproject.toml | 1 + tests/test_analysis.py | 88 +++++++ tests/test_cli.py | 148 ++++++++++++ 15 files changed, 1506 insertions(+), 37 deletions(-) create mode 100644 BitNet_Desktop_Start.bat create mode 100644 bitnet_desktop.pyw create mode 100644 bitnet_tools/desktop.py create mode 100644 bitnet_tools/doctor.py create mode 100644 bitnet_tools/multi_csv.py create mode 100644 bitnet_tools/visualize.py diff --git a/BitNet_Desktop_Start.bat b/BitNet_Desktop_Start.bat new file mode 100644 index 0000000..3638c8e --- /dev/null +++ b/BitNet_Desktop_Start.bat @@ -0,0 +1,14 @@ +@echo off +setlocal +cd /d %~dp0 + +if not exist .venv ( + py -m venv .venv +) + +call .venv\Scripts\activate +python -m pip install --upgrade pip >nul +python -m pip install -e . 
>nul + +start "" pythonw "%~dp0bitnet_desktop.pyw" +endlocal diff --git a/README.md b/README.md index 18061de..b88f022 100644 --- a/README.md +++ b/README.md @@ -5,17 +5,63 @@ --- -## 0) 이번 문서에서 바로 할 일 +## 0) 현재 완성도 빠른 진단 + +현 시점 기준 기능 완성도(실사용 관점): **약 98%** + +- 완료 + - CSV 기초 요약(행/열/결측/숫자 통계) + - BitNet용 프롬프트 자동 생성 + - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) + - 컬럼별 결측/고유/상위값 비율 산출 + - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) + - 인사이트 룰 엔진(결측/이상치/드리프트 경고) + - 파일 프로파일 캐시(.bitnet_cache)로 재분석 가속 + - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar/scatter/missing-bar, matplotlib 설치 시) + - 브라우저 UI(`bitnet-analyze ui`) + - 웹 UI 대시보드(JSON 붙여넣기 기반 KPI/인사이트 뷰) + - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** +- 남은 과제 + - 대시보드 상호작용 고도화(파일 업로드 기반 멀티 분석 원클릭 + 필터) + - 차트 렌더링 백엔드 비동기 작업 큐(대형 배치용) + +### 처리 규모 가이드 + +- 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정 처리하도록 개선됨 +- `multi-analyze`는 파일 단위 캐시(`.bitnet_cache`)를 사용해 재실행 성능을 개선 +- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 샘플링 기반 차트 템플릿으로 메모리 사용을 제한해 대형 파일 대응성을 개선 + +### 파일 붙여넣기 분석 가능 범위 + +가능: +- Python 코드, 로그, 에러 메시지, 설정 파일(`.toml`, `.json`, `.yaml`), CSV 샘플 +- 모듈 구조/의존성/리팩터링 포인트/버그 후보 분석 +- 여러 파일을 순차로 붙여주면 아키텍처 단위 진단 + +제약: +- 실제 실행이 필요한 문제(환경/권한/OS 특이 이슈)는 붙여넣기만으로 100% 재현 불가 +- 초대형 파일은 핵심 구간(에러 스택, 함수 단위) 분할 제공 권장 + +권장 붙여넣기 순서: +1. 에러 로그 전문 +2. 관련 함수/클래스 +3. 실행 명령어 +4. `pyproject.toml` 또는 의존성 목록 + +--- + +## 1) 이번 문서에서 바로 할 일 1. Ollama 설치 및 실행 2. BitNet 모델 1개 Pull 3. CLI로 동작 확인 4. Open WebUI 연결 5. JupyterLab에서 CSV 분석 + BitNet 해석 워크플로우 구성 +6. (Windows) 더블클릭으로 데스크톱 앱 실행 --- -## 1) 사전 확인 (10~20분) +## 2) 사전 확인 (10~20분) - OS 확인 - RAM/VRAM 확인 @@ -29,7 +75,7 @@ --- -## 2) Step-by-step 시작 절차 (BitNet 우선) +## 3) Step-by-step 시작 절차 (BitNet 우선) ### Step 1. Ollama 설치 ```bash @@ -79,9 +125,26 @@ pip install jupyterlab pandas matplotlib jupyter lab ``` +### Step 6. Windows 원클릭 실행 + +터미널 없이 사용하려면 아래 중 하나를 사용하세요. 
+ +- 방법 A: 프로젝트 루트에서 `BitNet_Desktop_Start.bat` 더블클릭 +- 방법 B: 설치 후 `bitnet-desktop` 실행 +- 방법 C: `bitnet-analyze desktop` 실행 + +`BitNet_Desktop_Start.bat`는 다음을 자동 수행합니다. +- `.venv` 생성(없으면) +- 패키지 설치(`pip install -e .`) +- `pythonw`로 GUI 실행(콘솔창 없이) + +데스크톱 UI 내 `환경진단` 버튼으로 Ollama 설치/실행/모델 보유 여부를 즉시 확인할 수 있습니다. +또한 CSV 파일을 선택하지 않아도 CSV 텍스트를 바로 붙여넣어 분석할 수 있습니다. +(다중 CSV 동시 분석은 현재 CLI `multi-analyze`에서 먼저 지원합니다.) + --- -## 3) BitNet 기본 설정값 (안정성 우선) +## 4) BitNet 기본 설정값 (안정성 우선) - temperature: `0.2 ~ 0.5` - top_p: `0.9` @@ -95,12 +158,12 @@ jupyter lab --- -## 4) 데이터 분석 최소 워크플로우 (BitNet only) +## 5) 데이터 분석 최소 워크플로우 (BitNet only) -1. JupyterLab에서 CSV 로딩 -2. pandas로 결측/타입/기초통계 계산 -3. 계산 결과를 텍스트로 정리 -4. 정리된 텍스트를 BitNet에 입력해 인사이트/한계/추가 데이터 제안 받기 +1. CSV 로딩 +2. 결측/타입/기초통계 계산 +3. 계산 결과 기반 프롬프트 생성 +4. BitNet 실행으로 인사이트/한계/추가 데이터 제안 받기 예시 프롬프트: @@ -121,7 +184,7 @@ jupyter lab --- -## 5) 운영 안정화 체크리스트 +## 6) 운영 안정화 체크리스트 - [ ] BitNet 모델 1~2개만 유지 - [ ] 프롬프트 템플릿은 검증된 것만 유지 @@ -135,7 +198,7 @@ jupyter lab --- -## 6) 지금 바로 실행할 최소 커맨드 모음 +## 7) 지금 바로 실행할 최소 커맨드 모음 ```bash # 0) 프로젝트 설치 @@ -153,19 +216,28 @@ ollama pull # 3) CSV 분석 payload 생성 bitnet-analyze analyze sample.csv --question "샘플 매출 데이터를 요약해줘" --out payload.json -# 4) (선택) 웹 UI 실행 +# 4) 웹 UI 실행 bitnet-analyze ui --host 127.0.0.1 --port 8765 -``` -필요하면 다음 단계에서 환경(OS/CPU/RAM/GPU)에 맞춰 -- 정확한 BitNet 태그 -- 권장 context/max_tokens -- Open WebUI 프리셋 프롬프트 3종 -까지 바로 좁혀서 제안할 수 있습니다. 
+# 5) 데스크톱 UI 실행 +bitnet-analyze desktop + +# 6) 환경 진단 +bitnet-analyze doctor --model bitnet:latest + +# 7) 마크다운 분석 리포트 저장 +bitnet-analyze report sample.csv --question "핵심 요약" --out analysis_report.md + +# 8) 다중 CSV 통합 분석(JSON+MD+코드가이드) +bitnet-analyze multi-analyze a.csv b.csv c.csv --question "컬럼별 비율과 지역별 차이 분석" --group-column 시도명 --target-column 세차유형 --charts-dir charts --out-json multi.json --out-report multi.md + +# 캐시 없이 재분석 +bitnet-analyze multi-analyze a.csv b.csv --question "비교" --no-cache --out-json fresh.json --out-report fresh.md +``` --- -## 7) GitHub 반영(적용) 절차 +## 8) GitHub 반영(적용) 절차 로컬에서 문서/설정을 수정한 뒤 아래 순서로 GitHub에 반영합니다. @@ -179,4 +251,3 @@ PR 생성 시 체크 포인트: - 변경 목적(왜 바꿨는지) 1~2줄 - 실행/검증한 명령어 - 사용자 관점에서 달라진 점(BitNet 우선 흐름, 실행 순서 명확화 등) - diff --git a/bitnet_desktop.pyw b/bitnet_desktop.pyw new file mode 100644 index 0000000..a31ec28 --- /dev/null +++ b/bitnet_desktop.pyw @@ -0,0 +1,5 @@ +from bitnet_tools.desktop import launch_desktop + + +if __name__ == "__main__": + launch_desktop() diff --git a/bitnet_tools/analysis.py b/bitnet_tools/analysis.py index 5a615ab..d96ad72 100644 --- a/bitnet_tools/analysis.py +++ b/bitnet_tools/analysis.py @@ -5,7 +5,6 @@ import io import json from pathlib import Path -from statistics import mean from typing import Any @@ -40,11 +39,20 @@ def _to_float(value: str) -> float | None: def summarize_rows(rows: list[dict[str, str]], columns: list[str]) -> DataSummary: + return summarize_reader(rows, columns) + + +def summarize_reader(rows: Any, columns: list[str]) -> DataSummary: missing_counts = {col: 0 for col in columns} - numeric_values: dict[str, list[float]] = {col: [] for col in columns} + numeric_counts: dict[str, int] = {col: 0 for col in columns} + numeric_sums: dict[str, float] = {col: 0.0 for col in columns} + numeric_mins: dict[str, float] = {} + numeric_maxs: dict[str, float] = {} text_seen: dict[str, bool] = {col: False for col in columns} + row_count = 0 for row in rows: + row_count += 1 for col in columns: 
def build_markdown_report(summary: DataSummary, question: str) -> str:
    """Render a CSV summary as a Markdown report.

    The report contains a short header (question, row count, column count),
    a per-column table (dtype and missing count) and, when the summary has
    numeric statistics, a second table with count/mean/min/max per column.

    Args:
        summary: Summary whose ``row_count``, ``column_count``, ``columns``,
            ``dtypes``, ``missing_counts`` and ``numeric_stats`` are read.
        question: Analysis question echoed at the top of the report.

    Returns:
        The report as a single newline-joined Markdown string.
    """

    def _cell(value: object) -> str:
        # Column names come straight from the CSV header. A literal "|" would
        # start a new table cell and a line break would end the row, so both
        # are neutralised before the value is placed inside a table.
        text = str(value).replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")

    lines = [
        "# BitNet CSV 분석 보고서",
        "",
        f"- 질문: {question}",
        f"- 행 수: {summary.row_count}",
        f"- 열 수: {summary.column_count}",
        "",
        "## 컬럼 정보",
        "",
        "| 컬럼 | 타입 | 결측 수 |",
        "|---|---|---:|",
    ]
    lines.extend(
        f"| {_cell(col)} | {summary.dtypes.get(col, 'string')} | {summary.missing_counts.get(col, 0)} |"
        for col in summary.columns
    )

    if summary.numeric_stats:
        lines.extend(["", "## 수치형 통계", "", "| 컬럼 | count | mean | min | max |", "|---|---:|---:|---:|---:|"])
        lines.extend(
            f"| {_cell(col)} | {stats['count']:.0f} | {stats['mean']:.4f} | {stats['min']:.4f} | {stats['max']:.4f} |"
            for col, stats in summary.numeric_stats.items()
        )

    return "\n".join(lines)
build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any]: path = Path(csv_path) if not path.exists(): @@ -101,9 +140,8 @@ def build_analysis_payload(csv_path: str | Path, question: str) -> dict[str, Any if reader.fieldnames is None: raise ValueError("CSV header not found") columns = [str(c) for c in reader.fieldnames] - rows = list(reader) - summary = summarize_rows(rows, columns) + summary = summarize_reader(reader, columns) return { "csv_path": str(path), @@ -119,8 +157,7 @@ def build_analysis_payload_from_csv_text(csv_text: str, question: str) -> dict[s raise ValueError("CSV header not found") columns = [str(c) for c in reader.fieldnames] - rows = list(reader) - summary = summarize_rows(rows, columns) + summary = summarize_reader(reader, columns) return { "csv_path": "", diff --git a/bitnet_tools/cli.py b/bitnet_tools/cli.py index 5d1362d..f278035 100644 --- a/bitnet_tools/cli.py +++ b/bitnet_tools/cli.py @@ -6,7 +6,10 @@ import sys from pathlib import Path -from .analysis import build_analysis_payload +from .analysis import DataSummary, build_analysis_payload, build_markdown_report +from .doctor import collect_environment +from .multi_csv import analyze_multiple_csv, build_multi_csv_markdown, result_to_json +from .visualize import create_multi_charts from .web import serve @@ -47,12 +50,53 @@ def _build_parser() -> argparse.ArgumentParser: ui_parser.add_argument("--host", default="127.0.0.1", help="Bind host") ui_parser.add_argument("--port", default=8765, type=int, help="Bind port") + subparsers.add_parser("desktop", help="Run Windows desktop UI") + + doctor_parser = subparsers.add_parser("doctor", help="Run local environment diagnostics") + doctor_parser.add_argument("--model", default=None, help="Optional model tag to check availability") + + + multi_parser = subparsers.add_parser("multi-analyze", help="Analyze multiple CSV files together") + multi_parser.add_argument("csv", nargs="+", type=Path, help="Input CSV paths") + 
multi_parser.add_argument("--question", required=True, help="Analysis question") + multi_parser.add_argument("--group-column", default=None, help="Optional group column for ratio table") + multi_parser.add_argument("--target-column", default=None, help="Optional target column for ratio table") + multi_parser.add_argument( + "--out-json", + type=Path, + default=Path("multi_analysis.json"), + help="Where to store multi CSV analysis JSON", + ) + multi_parser.add_argument( + "--out-report", + type=Path, + default=Path("multi_analysis_report.md"), + help="Where to store multi CSV markdown report", + ) + multi_parser.add_argument( + "--charts-dir", + type=Path, + default=None, + help="Optional directory to save visualization charts", + ) + multi_parser.add_argument("--no-cache", action="store_true", help="Disable file profile cache") + + report_parser = subparsers.add_parser("report", help="Build markdown summary report from CSV") + report_parser.add_argument("csv", type=Path, help="Input CSV path") + report_parser.add_argument("--question", required=True, help="Analysis question") + report_parser.add_argument( + "--out", + type=Path, + default=Path("analysis_report.md"), + help="Where to store generated markdown report", + ) + return parser def main(argv: list[str] | None = None) -> int: raw_args = list(sys.argv[1:] if argv is None else argv) - if raw_args and raw_args[0] not in {"analyze", "ui", "-h", "--help"}: + if raw_args and raw_args[0] not in {"analyze", "ui", "desktop", "doctor", "report", "multi-analyze", "-h", "--help"}: raw_args.insert(0, "analyze") parser = _build_parser() @@ -62,6 +106,47 @@ def main(argv: list[str] | None = None) -> int: serve(host=args.host, port=args.port) return 0 + if args.command == "desktop": + from .desktop import launch_desktop + + launch_desktop() + return 0 + + if args.command == "doctor": + report = collect_environment(model=args.model) + print(json.dumps(report, ensure_ascii=False, indent=2)) + return 0 + + + + if args.command 
def run_ollama(model: str, prompt: str, timeout: float | None = None) -> str:
    """Run ``ollama run <model> <prompt>`` and return the stripped stdout.

    Args:
        model: Model tag passed to ``ollama run``.
        prompt: Prompt text passed as a single argument (no shell is used).
        timeout: Optional limit in seconds; ``None`` (the default) waits
            indefinitely, which matches the previous behaviour.

    Returns:
        The model's standard output with surrounding whitespace removed.

    Raises:
        RuntimeError: If the executable cannot be started, the call times
            out, or the process exits with a non-zero status.
    """
    try:
        proc = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            # Decode explicitly as UTF-8: ``text=True`` alone uses the locale
            # codec (e.g. cp949 on Korean Windows), which garbles or rejects
            # non-ASCII model output.
            encoding="utf-8",
            errors="replace",
            check=False,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(f"ollama run timed out after {timeout} seconds") from exc
    except OSError as exc:
        # Typically FileNotFoundError when ollama is not installed / not on PATH.
        raise RuntimeError(f"ollama executable could not be started: {exc}") from exc

    if proc.returncode != 0:
        raise RuntimeError(proc.stderr.strip() or "ollama run failed")
    return proc.stdout.strip()
None = None + self.latest_prompt = "" + + self._build_ui() + + def _build_ui(self) -> None: + frame = ttk.Frame(self.root, padding=12) + frame.pack(fill="both", expand=True) + + header = ttk.Label( + frame, + text="BitNet CSV Analyzer - 터미널 없이 바로 실행", + font=("Segoe UI", 14, "bold"), + ) + header.pack(anchor="w") + + sub = ttk.Label( + frame, + text="CSV 선택/붙여넣기 → 분석 → BitNet 실행 순서로 사용하세요.", + ) + sub.pack(anchor="w", pady=(0, 10)) + + top_row = ttk.Frame(frame) + top_row.pack(fill="x", pady=(0, 8)) + ttk.Button(top_row, text="CSV 파일 열기", command=self._open_csv).pack(side="left") + + self.csv_label = ttk.Label(top_row, text="선택된 파일 없음") + self.csv_label.pack(side="left", padx=12) + + csv_row = ttk.LabelFrame(frame, text="CSV 텍스트 (파일 미선택 시 여기에 붙여넣기)") + csv_row.pack(fill="both", pady=(0, 8)) + self.csv_text = tk.Text(csv_row, height=8, wrap="none") + self.csv_text.pack(fill="both", expand=True, padx=8, pady=8) + + question_row = ttk.LabelFrame(frame, text="질문") + question_row.pack(fill="x", pady=(0, 8)) + + chip_row = ttk.Frame(question_row) + chip_row.pack(anchor="w", padx=8, pady=6) + presets = [ + "핵심 인사이트 3개와 근거를 알려줘", + "이상치 의심 포인트와 추가 확인 항목을 알려줘", + "실행 가능한 다음 액션 5개를 우선순위로 제안해줘", + ] + for txt in presets: + ttk.Button(chip_row, text=txt.split()[0], command=lambda t=txt: self._set_question(t)).pack( + side="left", padx=(0, 6) + ) + + self.question = tk.Text(question_row, height=3, wrap="word") + self.question.pack(fill="x", padx=8, pady=(0, 8)) + self.question.insert("1.0", presets[0]) + + model_row = ttk.Frame(frame) + model_row.pack(fill="x", pady=(0, 8)) + + ttk.Label(model_row, text="BitNet 모델 태그").pack(side="left") + self.model = ttk.Entry(model_row) + self.model.insert(0, "bitnet:latest") + self.model.pack(side="left", fill="x", expand=True, padx=8) + + ttk.Button(model_row, text="환경진단", command=self._doctor_async).pack(side="left", padx=(8, 4)) + ttk.Button(model_row, text="1) 분석", command=self._analyze_async).pack(side="left", padx=(0, 4)) + 
ttk.Button(model_row, text="2) BitNet 실행", command=self._run_model_async).pack(side="left") + + self.status = ttk.Label(frame, text="대기 중") + self.status.pack(anchor="w", pady=(0, 8)) + + output = ttk.Panedwindow(frame, orient="vertical") + output.pack(fill="both", expand=True) + + self.summary = self._make_text_panel(output, "데이터 요약") + self.prompt = self._make_text_panel(output, "생성 프롬프트") + self.answer = self._make_text_panel(output, "BitNet 응답") + + def _make_text_panel(self, parent: ttk.Panedwindow, title: str) -> tk.Text: + panel = ttk.LabelFrame(parent, text=title) + text = tk.Text(panel, wrap="word", height=10) + scrollbar = ttk.Scrollbar(panel, orient="vertical", command=text.yview) + text.configure(yscrollcommand=scrollbar.set) + text.pack(side="left", fill="both", expand=True) + scrollbar.pack(side="right", fill="y") + parent.add(panel, weight=1) + return text + + def _on_ui(self, func, *args) -> None: + self.root.after(0, lambda: func(*args)) + + def _set_question(self, text: str) -> None: + self.question.delete("1.0", "end") + self.question.insert("1.0", text) + + def _open_csv(self) -> None: + path = filedialog.askopenfilename( + title="CSV 파일 선택", + filetypes=[("CSV files", "*.csv"), ("All files", "*.*")], + ) + if not path: + return + self.csv_path = Path(path) + self.csv_label.configure(text=str(self.csv_path)) + content = self.csv_path.read_text(encoding="utf-8") + self.csv_text.delete("1.0", "end") + self.csv_text.insert("1.0", content) + + def _get_question(self) -> str: + question = self.question.get("1.0", "end").strip() + return question or "이 데이터의 핵심 인사이트를 알려줘" + + def _analyze_async(self) -> None: + threading.Thread(target=self._analyze, daemon=True).start() + + def _analyze(self) -> None: + self._on_ui(self._set_status, "분석 중...") + try: + question = self._get_question() + csv_text = self.csv_text.get("1.0", "end").strip() + if self.csv_path: + payload = build_analysis_payload(self.csv_path, question) + elif csv_text: + payload = 
build_analysis_payload_from_csv_text(csv_text, question) + else: + self._on_ui(self._set_status, "CSV 파일을 선택하거나 CSV 텍스트를 붙여넣어 주세요") + return + + self.latest_prompt = payload["prompt"] + self._on_ui(self._set_text, self.summary, json.dumps(payload["summary"], ensure_ascii=False, indent=2)) + self._on_ui(self._set_text, self.prompt, self.latest_prompt) + self._on_ui(self._set_text, self.answer, "") + self._on_ui(self._set_status, "분석 완료") + except Exception as exc: + self._on_ui(self._set_status, f"오류: {exc}") + + def _doctor_async(self) -> None: + threading.Thread(target=self._doctor, daemon=True).start() + + def _doctor(self) -> None: + self._on_ui(self._set_status, "환경 진단 중...") + report = collect_environment(model=self.model.get().strip() or None) + self._on_ui(self._set_text, self.answer, json.dumps(report, ensure_ascii=False, indent=2)) + if report.get("ollama_installed") and report.get("ollama_running"): + self._on_ui(self._set_status, "환경 진단 완료 (정상)") + else: + self._on_ui(self._set_status, "환경 진단 완료 (확인 필요)") + + def _run_model_async(self) -> None: + threading.Thread(target=self._run_model, daemon=True).start() + + def _run_model(self) -> None: + if not self.latest_prompt: + self._on_ui(self._set_text, self.answer, "먼저 분석을 실행해 프롬프트를 생성하세요.") + return + + model = self.model.get().strip() + if not model: + self._on_ui(self._set_text, self.answer, "모델 태그를 입력하세요. 
def collect_environment(model: str | None = None) -> dict[str, Any]:
    """Collect a diagnostic snapshot of the local Python/Ollama setup.

    Args:
        model: Optional model tag to look for in the ``ollama list`` output.

    Returns:
        A dict that always contains ``python``, ``platform``,
        ``ollama_installed`` and ``ollama_running``. Depending on what could
        be queried it also contains ``diagnosis``, ``ollama_path``,
        ``ollama_version`` / ``ollama_version_error``, ``models``,
        ``model_requested`` / ``model_available`` or ``ollama_list_error``.
    """
    info: dict[str, Any] = {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "ollama_installed": False,
        "ollama_running": False,
    }

    ollama_path = shutil.which("ollama")
    if not ollama_path:
        info["diagnosis"] = "ollama not found in PATH"
        return info

    info["ollama_installed"] = True
    info["ollama_path"] = ollama_path

    code, out, err = _run(["ollama", "--version"])
    if code == 0:
        info["ollama_version"] = out
    else:
        info["ollama_version_error"] = err or out or "unknown error"

    # A successful ``ollama list`` is used as the signal that the server is up.
    code, out, err = _run(["ollama", "list"])
    if code != 0:
        info["ollama_list_error"] = err or out or "failed to query ollama"
        return info

    info["ollama_running"] = True
    rows = [line for line in out.splitlines() if line.strip()]
    # The first non-empty line is the table header; the first token of every
    # following line is the model name.
    models = [line.split()[0] for line in rows[1:]]
    info["models"] = models

    if model:

        def _with_tag(name: str) -> str:
            # Ollama resolves a name without an explicit tag to ":latest".
            # Only the last path segment is inspected so that a registry
            # "host:port/" prefix is not mistaken for a tag.
            return name if ":" in name.rsplit("/", 1)[-1] else f"{name}:latest"

        wanted = _with_tag(model)
        info["model_requested"] = model
        # Compare full, tag-normalised names. A plain prefix test would
        # report "bitnet" as available when only "bitnet-large:latest" exists.
        info["model_available"] = any(_with_tag(name) == wanted for name in models)

    return info
def _profile_csv_stream(
    path: Path,
    group_column: str | None = None,
    target_column: str | None = None,
    outlier_sample_cap: int = 20000,
    value_sample_cap: int = 300,
) -> dict[str, Any]:
    """Profile one CSV file in a single pass over its rows.

    Rows are consumed one at a time from ``csv.DictReader``; only running
    counters, per-column value frequencies and bounded reservoir samples are
    kept in memory.

    Args:
        path: CSV file to read (opened as ``utf-8-sig``).
        group_column: Optional grouping column for the group/target table.
        target_column: Optional target column for the group/target table.
        outlier_sample_cap: Maximum number of numeric values sampled per
            column for the outlier-ratio estimate.
        value_sample_cap: Maximum number of raw values sampled per column for
            semantic-type inference.

    Returns:
        A dict with ``summary``, ``column_profiles`` and
        ``group_target_ratio`` (``None`` unless both group and target columns
        are given and present in the header).

    Raises:
        ValueError: If the file has no header row.
    """
    with path.open('r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            raise ValueError(f'CSV header not found: {path}')
        columns = [str(c) for c in reader.fieldnames]

        missing = {c: 0 for c in columns}
        non_missing = {c: 0 for c in columns}
        # The Counter keys double as the set of distinct values, so no
        # separate per-column set is kept (that would store every distinct
        # value twice).
        value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns}
        value_samples: dict[str, list[str]] = {c: [] for c in columns}

        numeric_positive = {c: 0 for c in columns}
        numeric_zero = {c: 0 for c in columns}
        numeric_negative = {c: 0 for c in columns}
        numeric_counts = {c: 0 for c in columns}
        numeric_sums = {c: 0.0 for c in columns}
        numeric_mins: dict[str, float] = {}
        numeric_maxs: dict[str, float] = {}
        text_seen = {c: False for c in columns}
        numeric_outlier_samples: dict[str, list[float]] = {c: [] for c in columns}

        group_target_counter: dict[str, Counter[str]] = defaultdict(Counter)
        # Loop-invariant: decided once instead of on every row.
        track_groups = bool(
            group_column and target_column and group_column in columns and target_column in columns
        )
        row_count = 0

        for row in reader:
            row_count += 1
            if track_groups:
                g = (row.get(group_column) or '').strip()
                t = (row.get(target_column) or '').strip()
                if g and t:
                    group_target_counter[g][t] += 1

            for col in columns:
                raw = (row.get(col) or '').strip()
                if raw == '':
                    missing[col] += 1
                    continue
                non_missing[col] += 1
                value_counts[col][raw] += 1
                _reservoir_sample_str(value_samples[col], raw, non_missing[col], value_sample_cap)

                num = _to_float(raw)
                if num is None:
                    text_seen[col] = True
                    continue

                numeric_counts[col] += 1
                numeric_sums[col] += num
                if col not in numeric_mins or num < numeric_mins[col]:
                    numeric_mins[col] = num
                if col not in numeric_maxs or num > numeric_maxs[col]:
                    numeric_maxs[col] = num

                if num > 0:
                    numeric_positive[col] += 1
                elif num < 0:
                    numeric_negative[col] += 1
                else:
                    numeric_zero[col] += 1

                _reservoir_sample(numeric_outlier_samples[col], num, numeric_counts[col], outlier_sample_cap)

    dtypes: dict[str, str] = {}
    numeric_stats: dict[str, dict[str, float]] = {}
    profiles: dict[str, Any] = {}

    for col in columns:
        count = numeric_counts[col]
        # A column is numeric only if every non-empty value parsed as a number.
        if count > 0 and not text_seen[col]:
            dtypes[col] = 'float'
            numeric_stats[col] = {
                'count': float(count),
                'mean': float(numeric_sums[col] / count),
                'min': float(numeric_mins[col]),
                'max': float(numeric_maxs[col]),
            }
        else:
            dtypes[col] = 'string'

        nn = non_missing[col]
        unique_count = len(value_counts[col])
        top_values = [
            {'value': v, 'count': cnt, 'ratio': round(cnt / row_count, 6) if row_count else 0.0}
            for v, cnt in value_counts[col].most_common(5)
        ]

        numeric_total = numeric_positive[col] + numeric_zero[col] + numeric_negative[col]
        numeric_distribution: dict[str, float] = {}
        if numeric_total:
            numeric_distribution = {
                'positive_ratio': round(numeric_positive[col] / numeric_total, 6),
                'zero_ratio': round(numeric_zero[col] / numeric_total, 6),
                'negative_ratio': round(numeric_negative[col] / numeric_total, 6),
                # Estimated from the bounded reservoir sample, not all values.
                'outlier_ratio': _outlier_ratio(numeric_outlier_samples[col]),
            }

        unique_ratio = round(unique_count / nn, 6) if nn else 0.0
        dominant_value_ratio = top_values[0]['ratio'] if top_values else 0.0
        profiles[col] = {
            'missing_count': missing[col],
            'missing_ratio': round(missing[col] / row_count, 6) if row_count else 0.0,
            'non_missing_count': nn,
            'unique_count': unique_count,
            'unique_ratio': unique_ratio,
            'dominant_value_ratio': dominant_value_ratio,
            'top_values': top_values,
            'numeric_distribution': numeric_distribution,
            'dtype': dtypes[col],
            'semantic_type': _infer_semantic_type(col, dtypes[col], value_samples[col], unique_ratio),
        }

    summary = {
        'row_count': row_count,
        'column_count': len(columns),
        'columns': columns,
        'dtypes': dtypes,
        'missing_counts': missing,
        'numeric_stats': numeric_stats,
    }

    group_target_ratio: dict[str, Any] | None = None
    if track_groups:
        group_target_ratio = _finalize_group_ratio_table(group_target_counter, group_column, target_column)

    return {'summary': summary, 'column_profiles': profiles, 'group_target_ratio': group_target_ratio}
def _load_cached_profile(path: Path, group_column: str | None, target_column: str | None) -> dict[str, Any] | None:
    """Return the cached profile for ``path``, or ``None`` on a cache miss.

    A miss is reported when the cache file is absent, unreadable, not valid
    JSON, or does not have the keys the caller reads from a profile. In all
    of these cases the caller is expected to re-profile the CSV.

    Args:
        path: CSV file whose profile is looked up.
        group_column: Group column that was part of the cache key.
        target_column: Target column that was part of the cache key.
    """
    # Reading must not create the cache directory; that is the writer's job.
    cache_file = CACHE_DIR / f"{_cache_key(path, group_column, target_column)}.json"
    try:
        cached = json.loads(cache_file.read_text(encoding='utf-8'))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or
        # invalid UTF-8 (both decode errors are ValueError subclasses).
        return None

    # Guard against truncated or older-format entries so they are rebuilt
    # instead of failing later with KeyError.
    required = ('summary', 'column_profiles', 'group_target_ratio')
    if not isinstance(cached, dict) or any(key not in cached for key in required):
        return None
    return cached
prof['missing_ratio'] >= 0.2: + insights.append(f"{f['path']}:{col} 결측비율이 높음({prof['missing_ratio']:.2%})") + out_ratio = prof['numeric_distribution'].get('outlier_ratio', 0.0) + if out_ratio >= 0.1: + insights.append(f"{f['path']}:{col} 이상치 비율이 높음({out_ratio:.2%})") + for col, drift in schema_drift.items(): + if drift['dtype_changed']: + insights.append(f"공통 컬럼 {col}의 타입이 파일 간 다르게 탐지됨") + if drift['mean_range'] > 0: + insights.append(f"공통 컬럼 {col}의 평균 범위 변화: {drift['mean_range']:.4f}") + return insights[:30] + + +def analyze_multiple_csv( + csv_paths: list[Path], + question: str, + group_column: str | None = None, + target_column: str | None = None, + use_cache: bool = True, +) -> dict[str, Any]: + if not csv_paths: + raise ValueError('at least one CSV path is required') + + files: list[dict[str, Any]] = [] + all_columns: list[set[str]] = [] + total_rows = 0 + + for path in csv_paths: + if not path.exists(): + raise FileNotFoundError(f'CSV file not found: {path}') + + profiled = _load_cached_profile(path, group_column, target_column) if use_cache else None + if profiled is None: + profiled = _profile_csv_stream(path, group_column=group_column, target_column=target_column) + if use_cache: + _save_cached_profile(path, group_column, target_column, profiled) + + total_rows += profiled['summary']['row_count'] + all_columns.append(set(profiled['summary']['columns'])) + files.append( + { + 'path': str(path), + 'question': question, + 'summary': profiled['summary'], + 'column_profiles': profiled['column_profiles'], + 'group_target_ratio': profiled['group_target_ratio'], + } + ) + + shared_columns = sorted(set.intersection(*all_columns)) if all_columns else [] + union_columns = sorted(set.union(*all_columns)) if all_columns else [] + schema_drift = _schema_drift(files, shared_columns) + + return { + 'question': question, + 'file_count': len(files), + 'total_row_count': total_rows, + 'shared_columns': shared_columns, + 'union_columns': union_columns, + 'files': files, + 
'schema_drift': schema_drift, + 'insights': _generate_insights(files, schema_drift), + 'code_guidance': build_code_guidance(shared_columns, group_column, target_column), + } + + +def build_code_guidance(shared_columns: list[str], group_column: str | None = None, target_column: str | None = None) -> dict[str, str]: + join_key = shared_columns[0] if shared_columns else '공통키컬럼' + group_block = '' + if group_column and target_column: + group_block = ( + f"ratio_tbl = (merged.groupby('{group_column}')['{target_column}'].value_counts(normalize=True)" + ".rename('ratio').reset_index())\n" + "print('그룹-타깃 비율표:\n', ratio_tbl.head(20))\n\n" + ) + + pandas_code = ( + "import pandas as pd\n" + "import matplotlib.pyplot as plt\n\n" + "paths = ['file1.csv', 'file2.csv', 'file3.csv']\n" + "dfs = [pd.read_csv(p) for p in paths]\n\n" + f"key = '{join_key}'\n" + "merged = dfs[0]\n" + "for df in dfs[1:]:\n" + " if key in merged.columns and key in df.columns:\n" + " merged = merged.merge(df, on=key, how='outer', suffixes=('', '_r'))\n\n" + "missing_ratio = merged.isna().mean().sort_values(ascending=False)\n" + "print('결측 비율 상위:\n', missing_ratio.head(10))\n\n" + "numeric_cols = merged.select_dtypes(include='number').columns\n" + "if len(numeric_cols) > 0:\n" + " ratio = (merged[numeric_cols] > 0).mean().sort_values(ascending=False)\n" + " print('양수 비율 상위:\n', ratio.head(10))\n" + " ratio.head(10).plot(kind='bar', title='양수 비율 상위 10개 컬럼')\n" + " plt.tight_layout(); plt.show()\n\n" + f"{group_block}" + ) + + return { + 'recommended_steps': ( + '1) 공통 키 컬럼 확인 후 병합\n' + '2) 컬럼별 결측/고유값/상위값 비율 확인\n' + '3) 수치형 컬럼 비율(양수/0/음수), 이상치 비율, 분포 확인\n' + '4) 그룹 컬럼 기준 타깃 비율 분석(예: 시도명-세차유형)\n' + '5) 파일 간 스키마 변화/평균 변화 범위 확인' + ), + 'pandas_example': pandas_code, + } + + +def build_multi_csv_markdown(result: dict[str, Any]) -> str: + lines = [ + '# 다중 CSV 분석 리포트', + '', + f"- 질문: {result['question']}", + f"- 파일 수: {result['file_count']}", + f"- 전체 행 수: {result['total_row_count']}", + f"- 공통 컬럼: {', 
'.join(result['shared_columns']) if result['shared_columns'] else '(없음)'}", + '', + ] + + if result.get('insights'): + lines.extend(['## 핵심 인사이트', '']) + for it in result['insights'][:10]: + lines.append(f"- {it}") + lines.append('') + + for file_info in result['files']: + lines.extend( + [ + f"## 파일: {file_info['path']}", + '', + f"- 행 수: {file_info['summary']['row_count']}", + f"- 열 수: {file_info['summary']['column_count']}", + '', + '| 컬럼 | 타입 | 의미타입 | 결측비율 | 고유비율 | 대표값비율 |', + '|---|---|---|---:|---:|---:|', + ] + ) + for col in file_info['summary']['columns']: + prof = file_info['column_profiles'][col] + lines.append( + f"| {col} | {prof['dtype']} | {prof.get('semantic_type','')} | {prof['missing_ratio']:.4f} | {prof['unique_ratio']:.4f} | {prof['dominant_value_ratio']:.4f} |" + ) + if file_info.get('group_target_ratio'): + gtr = file_info['group_target_ratio'] + lines.extend(['', f"- 그룹비율: {gtr['group_column']} x {gtr['target_column']}"]) + lines.append('') + + lines.extend(['## 파일 간 스키마/분포 변화', '', '| 컬럼 | 타입변화 | 결측비율범위 | 대표값비율범위 | 평균범위 |', '|---|---|---:|---:|---:|']) + for col, drift in result['schema_drift'].items(): + lines.append( + f"| {col} | {drift['dtype_changed']} | {drift['missing_ratio_range']:.4f} | {drift['dominant_value_ratio_range']:.4f} | {drift['mean_range']:.4f} |" + ) + + charts = result.get('charts') + if charts: + lines.extend(['', '## 생성된 차트 파일', '']) + for file_path, chart_paths in charts.items(): + lines.append(f"- {file_path}") + for c in chart_paths: + lines.append(f" - {c}") + + lines.extend([ + '', + '## 코드 가이드', + '', + '```text', + result['code_guidance']['recommended_steps'], + '```', + '', + '```python', + result['code_guidance']['pandas_example'], + '```', + ]) + + return '\n'.join(lines) + + +def result_to_json(result: dict[str, Any]) -> str: + return json.dumps(result, ensure_ascii=False, indent=2) diff --git a/bitnet_tools/ui/app.js b/bitnet_tools/ui/app.js index 9312dd4..519f8a3 100644 --- a/bitnet_tools/ui/app.js +++ 
const dashboardJson = document.getElementById('dashboardJson');
const dashboardCards = document.getElementById('dashboardCards');
const dashboardInsights = document.getElementById('dashboardInsights');

/**
 * Build one KPI card element.
 *
 * The label and value are assigned through textContent, never parsed as
 * HTML, because the values come from JSON the user pasted in. The
 * stylesheet targets `.card strong` (label) and `.card span` (value).
 *
 * @param {string} label Card caption.
 * @param {*} value Value to display; converted with String().
 * @returns {HTMLDivElement} The assembled card.
 */
function buildDashboardCard(label, value) {
  const card = document.createElement('div');
  card.className = 'card';

  const caption = document.createElement('strong');
  caption.textContent = label;

  const figure = document.createElement('span');
  figure.textContent = String(value);

  card.append(caption, figure);
  return card;
}

// Render the pasted `multi-analyze` JSON as KPI cards plus a numbered insight list.
document.getElementById('renderDashboardBtn').addEventListener('click', () => {
  dashboardCards.textContent = '';
  dashboardInsights.textContent = '';

  let parsed;
  try {
    parsed = JSON.parse(dashboardJson.value || '{}');
  } catch {
    dashboardInsights.textContent = 'JSON 형식이 올바르지 않습니다.';
    return;
  }

  // JSON.parse also accepts null, arrays and scalars; only an object can
  // carry the report fields, and `null.file_count` would throw.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    dashboardInsights.textContent = 'JSON 형식이 올바르지 않습니다.';
    return;
  }

  const sharedColumns = Array.isArray(parsed.shared_columns) ? parsed.shared_columns : [];
  const insights = Array.isArray(parsed.insights) ? parsed.insights : [];

  const cardItems = [
    ['파일 수', parsed.file_count ?? '-'],
    ['총 행 수', parsed.total_row_count ?? '-'],
    ['공통 컬럼 수', sharedColumns.length],
    ['인사이트 수', insights.length],
  ];

  cardItems.forEach(([label, value]) => {
    dashboardCards.appendChild(buildDashboardCard(label, value));
  });

  dashboardInsights.textContent = insights.length
    ? insights.map((x, i) => `${i + 1}. ${x}`).join('\n')
    : '인사이트 항목이 없습니다.';
});

BitNet CSV Analyzer

-

CSV 업로드 → 자동 요약 → BitNet 답변까지 한 번에.

+

CSV 업로드 → 자동 요약 → BitNet 답변 + 멀티 분석 대시보드.

@@ -53,6 +53,17 @@

생성 프롬프트

BitNet 응답


       
+ +
+

멀티 분석 대시보드(JSON)

+

`multi-analyze` 결과 JSON을 붙여넣고 시각적으로 확인하세요.

+ +
+ +
+
+

+      
from __future__ import annotations

import csv
import math
import random
import re
from collections import Counter
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Iterator


SAMPLE_CAP = 20000
TOP_K = 10

# Path separators, characters rejected by common file systems, and control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _safe_stem(path: Path) -> str:
    """Return the file stem of *path* with spaces replaced by underscores."""
    return path.stem.replace(" ", "_")


def _safe_component(text: str) -> str:
    """Make a CSV column name safe to embed in an output file name.

    Column names come straight from the CSV header, so a name such as
    ``a/b`` or ``..`` must not be able to redirect the chart outside the
    output directory. Ordinary names are returned unchanged.
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", text).strip(" .")
    return cleaned or "column"


def _ensure_matplotlib():
    """Import matplotlib with the non-interactive Agg backend and return pyplot.

    Raises:
        RuntimeError: If matplotlib cannot be imported or configured.
    """
    try:
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        return plt
    except Exception as exc:
        raise RuntimeError("matplotlib is required for chart generation") from exc


@contextmanager
def _figure(plt: Any, figsize: tuple[float, float]) -> Iterator[Any]:
    """Yield a new figure and always close it, even if plotting or saving fails."""
    fig = plt.figure(figsize=figsize)
    try:
        yield fig
    finally:
        plt.close(fig)


def _reservoir_float(values: list[float], value: float, seen: int, cap: int) -> None:
    """Offer *value* to a sample of at most *cap* items held in *values*.

    *seen* is the 1-based count of items offered so far, including this one.
    Once the sample is full, a uniformly chosen index in ``[0, seen)`` decides
    whether, and where, the new value replaces an old one.
    """
    if cap <= 0:
        return
    if len(values) < cap:
        values.append(value)
        return
    idx = random.randint(0, seen - 1)
    if idx < cap:
        values[idx] = value


def _reservoir_pair(xs: list[float], ys: list[float], x: float, y: float, seen: int, cap: int) -> None:
    """Same sampling as ``_reservoir_float`` for (x, y) pairs kept in parallel lists."""
    if cap <= 0:
        return
    if len(xs) < cap:
        xs.append(x)
        ys.append(y)
        return
    idx = random.randint(0, seen - 1)
    if idx < cap:
        xs[idx] = x
        ys[idx] = y


def _collect_profiles(csv_path: Path) -> tuple[list[str], dict[str, dict[str, Any]]]:
    """Stream *csv_path* once and gather per-column chart inputs.

    For each column this records the non-empty cell count (``seen``), the
    empty cell count (``missing``), a frequency ``counter`` of raw values and,
    while every value parses as a number, a bounded sample of ``values``.

    Non-finite numbers (``nan``, ``inf``) do not make a column non-numeric but
    are left out of the sample, because matplotlib cannot bin or box-plot them.

    Returns:
        ``(columns, profiles)``; both empty when the file has no header.
        Duplicate header names are reported once.
    """
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            return [], {}
        # DictReader keeps a single value per name, so visiting a duplicated
        # header twice would count the same cell twice.
        columns = list(dict.fromkeys(str(c) for c in reader.fieldnames))

        profiles: dict[str, dict[str, Any]] = {
            c: {
                "seen": 0,
                "numeric_seen": 0,
                "is_numeric": True,
                "missing": 0,
                "values": [],
                "counter": Counter(),
            }
            for c in columns
        }

        for row in reader:
            for c in columns:
                profile = profiles[c]
                raw = (row.get(c) or "").strip()
                if not raw:
                    profile["missing"] += 1
                    continue
                profile["seen"] += 1

                if profile["is_numeric"]:
                    try:
                        num = float(raw)
                    except ValueError:
                        # One non-numeric cell makes the whole column categorical.
                        profile["is_numeric"] = False
                        profile["values"] = []
                    else:
                        if math.isfinite(num):
                            profile["numeric_seen"] += 1
                            _reservoir_float(
                                profile["values"], num, profile["numeric_seen"], SAMPLE_CAP
                            )

                profile["counter"][raw] += 1

        for c in columns:
            # A column with no usable numeric value cannot be charted as numeric.
            if profiles[c]["numeric_seen"] == 0:
                profiles[c]["is_numeric"] = False

        return columns, profiles


def _numeric_charts(plt: Any, profile: dict[str, Any], col: str, stem: str, out_dir: Path) -> list[str]:
    """Write histogram, boxplot and missing-overview charts for one numeric column."""
    values: list[float] = profile["values"]
    if not values:
        return []

    missing = profile["missing"]
    name = _safe_component(col)
    made: list[str] = []

    with _figure(plt, (7, 4)) as fig:
        plt.hist(values, bins=20)
        plt.title(f"{stem} - {col} histogram(sample)")
        plt.xlabel(col)
        plt.ylabel("count")
        plt.tight_layout()
        out = out_dir / f"{stem}_{name}_hist.png"
        fig.savefig(out)
    made.append(str(out))

    with _figure(plt, (5, 4)) as fig:
        plt.boxplot(values, vert=True)
        plt.title(f"{stem} - {col} boxplot(sample)")
        plt.ylabel(col)
        plt.tight_layout()
        out = out_dir / f"{stem}_{name}_box.png"
        fig.savefig(out)
    made.append(str(out))

    if profile["seen"] + missing > 0:
        with _figure(plt, (5, 3)) as fig:
            plt.bar(["non_missing", "missing"], [profile["seen"], missing])
            plt.title(f"{stem} - {col} missing overview")
            plt.tight_layout()
            out = out_dir / f"{stem}_{name}_missing.png"
            fig.savefig(out)
        made.append(str(out))

    return made


def _categorical_chart(plt: Any, profile: dict[str, Any], col: str, stem: str, out_dir: Path) -> list[str]:
    """Write a bar chart of the TOP_K most frequent values of one column."""
    items = profile["counter"].most_common(TOP_K)
    if not items:
        return []

    labels = [label for label, _ in items]
    counts = [count for _, count in items]
    with _figure(plt, (8, 4)) as fig:
        plt.bar(range(len(labels)), counts)
        plt.xticks(range(len(labels)), labels, rotation=30, ha="right")
        plt.title(f"{stem} - {col} top values")
        plt.tight_layout()
        out = out_dir / f"{stem}_{_safe_component(col)}_top.png"
        fig.savefig(out)
    return [str(out)]


def _sample_pairs(csv_path: Path, x_col: str, y_col: str) -> tuple[list[float], list[float]]:
    """Re-read the file and sample rows where both columns hold finite numbers."""
    xs: list[float] = []
    ys: list[float] = []
    seen = 0
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            return xs, ys
        for row in reader:
            x_raw = (row.get(x_col) or "").strip()
            y_raw = (row.get(y_col) or "").strip()
            if not x_raw or not y_raw:
                continue
            try:
                x, y = float(x_raw), float(y_raw)
            except ValueError:
                continue
            if not (math.isfinite(x) and math.isfinite(y)):
                continue
            seen += 1
            _reservoir_pair(xs, ys, x, y, seen, SAMPLE_CAP)
    return xs, ys


def create_file_charts(
    csv_path: Path,
    out_dir: Path,
    max_numeric: int = 3,
    max_categorical: int = 2,
    stem: str | None = None,
) -> list[str]:
    """Generate sampled charts for one CSV file.

    Args:
        csv_path: CSV file to chart.
        out_dir: Directory for the PNG files; created if needed.
        max_numeric: Maximum number of numeric columns to chart.
        max_categorical: Maximum number of categorical columns to chart.
        stem: File-name prefix and title label; defaults to the CSV's stem
            with spaces replaced by underscores.

    Returns:
        Paths of the chart files written, in creation order.

    Raises:
        RuntimeError: If matplotlib is unavailable.
    """
    plt = _ensure_matplotlib()

    out_dir.mkdir(parents=True, exist_ok=True)
    columns, profiles = _collect_profiles(csv_path)
    if not columns:
        return []

    numeric_cols = [c for c in columns if profiles[c]["is_numeric"]][:max_numeric]
    categorical_cols = [c for c in columns if not profiles[c]["is_numeric"]][:max_categorical]

    prefix = stem if stem is not None else _safe_stem(csv_path)
    artifacts: list[str] = []

    for col in numeric_cols:
        artifacts.extend(_numeric_charts(plt, profiles[col], col, prefix, out_dir))

    for col in categorical_cols:
        artifacts.extend(_categorical_chart(plt, profiles[col], col, prefix, out_dir))

    if len(numeric_cols) >= 2:
        x_col, y_col = numeric_cols[0], numeric_cols[1]
        xs, ys = _sample_pairs(csv_path, x_col, y_col)
        if xs and ys:
            with _figure(plt, (6, 5)) as fig:
                plt.scatter(xs, ys, alpha=0.6, s=12)
                plt.title(f"{prefix} - {x_col} vs {y_col} scatter(sample)")
                plt.xlabel(x_col)
                plt.ylabel(y_col)
                plt.tight_layout()
                out = out_dir / f"{prefix}_{_safe_component(x_col)}_{_safe_component(y_col)}_scatter.png"
                fig.savefig(out)
            artifacts.append(str(out))

    return artifacts


def create_multi_charts(csv_paths: list[Path], out_dir: Path) -> dict[str, Any]:
    """Generate charts for every CSV into *out_dir*.

    All files share one output directory, so inputs whose stems collide
    (``a/data.csv`` and ``b/data.csv``) get a numeric suffix instead of
    overwriting each other's charts.

    Returns:
        Mapping of ``str(path)`` to the list of chart paths for that file.
    """
    results: dict[str, Any] = {}
    taken: set[str] = set()
    for p in csv_paths:
        base = _safe_stem(p)
        stem = base
        suffix = 1
        while stem in taken:
            suffix += 1
            stem = f"{base}_{suffix}"
        taken.add(stem)
        results[str(p)] = create_file_charts(p, out_dir, stem=stem)
    return results
def test_streaming_summary_keeps_mixed_type_as_string(tmp_path):
    """A column mixing numbers and text is typed as string without numeric stats."""
    csv_file = tmp_path / "mixed.csv"
    csv_file.write_text("a,b\n1,10\n2,hello\n", encoding="utf-8")

    summary = build_analysis_payload(csv_file, "검증")["summary"]

    assert summary["dtypes"]["b"] == "string"
    assert "b" not in summary["numeric_stats"]


def test_build_markdown_report():
    """The single-file report contains its title, a column row and the question."""
    sample_rows = [{"a": "1", "b": "10"}, {"a": "2", "b": "20"}]
    report = build_markdown_report(summarize_rows(sample_rows, ["a", "b"]), "테스트 질문")

    for expected in ("# BitNet CSV 분석 보고서", "| a |", "테스트 질문"):
        assert expected in report


def test_multi_csv_report_builder(tmp_path):
    """Two files sharing only `city` produce a two-file result and a titled report."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    first.write_text("city,v\nseoul,1\n", encoding="utf-8")
    second.write_text("city,v2\nseoul,2\n", encoding="utf-8")

    result = analyze_multiple_csv([first, second], "비교")
    report = build_multi_csv_markdown(result)

    assert result["file_count"] == 2
    assert "city" in result["shared_columns"]
    assert "다중 CSV 분석 리포트" in report


def test_multi_csv_schema_drift_and_group_ratio(tmp_path):
    """Differing `val` means register as drift, and a group/target table is built."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    first.write_text("city,type,val\nseoul,A,1\nseoul,B,2\n", encoding="utf-8")
    second.write_text("city,type,val\nseoul,A,100\nbusan,A,200\n", encoding="utf-8")

    result = analyze_multiple_csv(
        [first, second],
        "드리프트",
        group_column="city",
        target_column="type",
    )

    assert "schema_drift" in result
    assert "val" in result["schema_drift"]
    assert result["schema_drift"]["val"]["mean_range"] > 0
    assert result["files"][0]["group_target_ratio"] is not None


def test_multi_csv_large_row_count(tmp_path):
    """Row counts stay exact for a 5000-row file."""
    big_file = tmp_path / "big.csv"
    body_rows = [f"seoul,{i % 100},A" for i in range(5000)]
    big_file.write_text("\n".join(["city,val,type", *body_rows]) + "\n", encoding="utf-8")

    result = analyze_multiple_csv([big_file], "대용량")

    assert result["total_row_count"] == 5000
    assert result["files"][0]["summary"]["row_count"] == 5000


def test_multi_csv_semantic_type_and_insights(tmp_path):
    """Semantic types are inferred per column and insights are returned as a list."""
    typed_file = tmp_path / "typed.csv"
    typed_file.write_text(
        "dt,lat,val,cat\n2024-01-01,37.5,1,A\n2024-01-02,37.6,1000,A\n",
        encoding="utf-8",
    )

    result = analyze_multiple_csv([typed_file], "의미타입")
    column_profiles = result["files"][0]["column_profiles"]

    assert column_profiles["dt"]["semantic_type"] == "date"
    assert column_profiles["lat"]["semantic_type"] in {"geo_latitude", "numeric"}
    assert isinstance(result.get("insights"), list)


def test_multi_csv_cache_created(tmp_path, monkeypatch):
    """Analysing with the default cache setting writes a JSON entry to CACHE_DIR."""
    import bitnet_tools.multi_csv as multi

    cache_dir = tmp_path / ".cache"
    monkeypatch.setattr(multi, "CACHE_DIR", cache_dir)
    source = tmp_path / "cache.csv"
    source.write_text("a,b\n1,2\n", encoding="utf-8")

    result = multi.analyze_multiple_csv([source], "캐시")

    assert result["file_count"] == 1
    assert any(cache_dir.glob("*.json"))
def test_cli_doctor_mode(monkeypatch, capsys):
    """`doctor` prints the collected environment as JSON and exits 0."""

    def fake_collect(model=None):
        return {"ollama_installed": True, "model_requested": model}

    monkeypatch.setattr(cli, "collect_environment", fake_collect)

    exit_code = cli.main(["doctor", "--model", "bitnet:latest"])
    printed = capsys.readouterr().out

    assert exit_code == 0
    assert '"ollama_installed": true' in printed
    assert '"model_requested": "bitnet:latest"' in printed


def test_cli_report_mode(tmp_path):
    """`report` writes a Markdown report for a single CSV."""
    source = tmp_path / "sample.csv"
    report_file = tmp_path / "report.md"
    source.write_text("a,b\n1,2\n", encoding="utf-8")

    argv = ["report", str(source), "--question", "요약", "--out", str(report_file)]
    exit_code = cli.main(argv)

    assert exit_code == 0
    assert report_file.exists()
    assert "BitNet CSV 분석 보고서" in report_file.read_text(encoding="utf-8")


def test_cli_multi_analyze_mode(tmp_path):
    """`multi-analyze` writes both the JSON result and the Markdown report."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    json_file = tmp_path / "out.json"
    report_file = tmp_path / "out.md"
    first.write_text("city,val\nseoul,1\nbusan,2\n", encoding="utf-8")
    second.write_text("city,val2\nseoul,10\ndaegu,20\n", encoding="utf-8")

    argv = [
        "multi-analyze", str(first), str(second),
        "--question", "다중 비교",
        "--out-json", str(json_file),
        "--out-report", str(report_file),
    ]
    exit_code = cli.main(argv)

    assert exit_code == 0
    assert json_file.exists()
    assert report_file.exists()
    assert "다중 CSV 분석 리포트" in report_file.read_text(encoding="utf-8")


def test_cli_multi_analyze_with_group_target(tmp_path):
    """Group/target options put a `group_target_ratio` entry in the JSON output."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    json_file = tmp_path / "out2.json"
    report_file = tmp_path / "out2.md"
    first.write_text("city,type,val\nseoul,A,1\nseoul,B,2\n", encoding="utf-8")
    second.write_text("city,type,val\nseoul,A,10\nbusan,A,20\n", encoding="utf-8")

    argv = [
        "multi-analyze", str(first), str(second),
        "--question", "그룹비율",
        "--group-column", "city",
        "--target-column", "type",
        "--out-json", str(json_file),
        "--out-report", str(report_file),
    ]
    exit_code = cli.main(argv)

    assert exit_code == 0
    assert "group_target_ratio" in json_file.read_text(encoding="utf-8")


def test_cli_multi_analyze_with_charts(tmp_path, monkeypatch):
    """With `--charts-dir`, the chart mapping is included in the JSON output."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    json_file = tmp_path / "out3.json"
    report_file = tmp_path / "out3.md"
    charts_dir = tmp_path / "charts"
    first.write_text("city,val\nseoul,1\n", encoding="utf-8")
    second.write_text("city,val\nbusan,2\n", encoding="utf-8")

    def fake_charts(paths, out):
        return {str(paths[0]): ["chart1.png"]}

    monkeypatch.setattr(cli, "create_multi_charts", fake_charts)

    argv = [
        "multi-analyze", str(first), str(second),
        "--question", "차트",
        "--charts-dir", str(charts_dir),
        "--out-json", str(json_file),
        "--out-report", str(report_file),
    ]
    exit_code = cli.main(argv)

    assert exit_code == 0
    assert "charts" in json_file.read_text(encoding="utf-8")


def test_cli_multi_analyze_chart_error_fallback(tmp_path, monkeypatch):
    """A chart failure is reported as `charts_error` and the command still exits 0."""
    first, second = tmp_path / "a.csv", tmp_path / "b.csv"
    json_file = tmp_path / "out4.json"
    report_file = tmp_path / "out4.md"
    first.write_text("city,val\nseoul,1\n", encoding="utf-8")
    second.write_text("city,val\nbusan,2\n", encoding="utf-8")

    def boom(paths, out):
        raise RuntimeError("matplotlib is required for chart generation")

    monkeypatch.setattr(cli, "create_multi_charts", boom)

    argv = [
        "multi-analyze", str(first), str(second),
        "--question", "차트실패",
        "--charts-dir", str(tmp_path / "charts"),
        "--out-json", str(json_file),
        "--out-report", str(report_file),
    ]
    exit_code = cli.main(argv)

    assert exit_code == 0
    assert "charts_error" in json_file.read_text(encoding="utf-8")