Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

## 0) 현재 완성도 빠른 진단

현 시점 기준 기능 완성도(실사용 관점): **약 97%**
현 시점 기준 기능 완성도(실사용 관점): **약 98%**

- 완료
- CSV 기초 요약(행/열/결측/숫자 통계)
Expand All @@ -22,14 +22,14 @@
- 웹 UI 대시보드(JSON 붙여넣기 기반 KPI/인사이트 뷰)
- **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)**
- 남은 과제
- 대시보드 상호작용 고도화(파일 업로드 기반 멀티 분석 원클릭)
- 대규모 차트 생성 최적화(청크-스트리밍 렌더러)
- 대시보드 상호작용 고도화(파일 업로드 기반 멀티 분석 원클릭 + 필터)
- 차트 렌더링 백엔드 비동기 작업 큐(대형 배치용)

### 처리 규모 가이드

- 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정 처리하도록 개선됨
- `multi-analyze`는 파일 단위 캐시(`.bitnet_cache`)를 사용해 재실행 성능을 개선
- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 파일을 메모리에 적재해 그리므로 더 큰 파일에서는 샘플링 전략 권장
- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 샘플링 기반 차트 템플릿으로 메모리 사용을 제한해 대형 파일 대응성을 개선

### 파일 붙여넣기 분석 가능 범위

Expand Down
160 changes: 105 additions & 55 deletions bitnet_tools/visualize.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,14 @@
from __future__ import annotations

import csv
import random
from collections import Counter
from pathlib import Path
from typing import Any


def _is_numeric_column(rows: list[dict[str, str]], col: str) -> bool:
    """Report whether every non-blank cell of ``col`` parses as a float.

    Args:
        rows: Parsed CSV records keyed by column name.
        col: Name of the column to inspect.

    Returns:
        ``True`` when the column has at least one non-blank cell and all of
        its non-blank cells are accepted by ``float()``; ``False`` otherwise
        (including when the column is entirely blank or absent).
    """
    non_blank = 0
    for record in rows:
        # Missing keys and ``None`` cells are treated like empty strings.
        text = (record.get(col) or "").strip()
        if text:
            non_blank += 1
            try:
                float(text)
            except ValueError:
                # A single unparseable cell disqualifies the whole column.
                return False
    return non_blank > 0
# Upper bound on how many values a reservoir sample keeps; passed as `cap`
# to the reservoir helpers so the data handed to the charts stays bounded.
SAMPLE_CAP = 20000
# Number of most frequent values requested from a categorical column's
# counter (`Counter.most_common(TOP_K)`) for the top-value bar chart.
TOP_K = 10


def _safe_stem(path: Path) -> str:
Expand All @@ -35,6 +27,77 @@ def _ensure_matplotlib():
raise RuntimeError("matplotlib is required for chart generation") from exc


def _reservoir_float(values: list[float], value: float, seen: int, cap: int) -> None:
    """Offer ``value`` to a fixed-size reservoir sample, mutating ``values``.

    Args:
        values: The reservoir; modified in place.
        value: The newly observed item.
        seen: 1-based count of items offered so far, including ``value``.
        cap: Maximum reservoir size; nothing is stored when ``cap <= 0``.
    """
    if cap > 0:
        if len(values) >= cap:
            # Reservoir is full: pick a slot among all items seen so far and
            # overwrite only if it lands inside the reservoir.
            slot = random.randint(0, seen - 1)
            if slot < cap:
                values[slot] = value
        else:
            # Still filling up: every item is kept.
            values.append(value)


def _reservoir_pair(xs: list[float], ys: list[float], x: float, y: float, seen: int, cap: int) -> None:
    """Offer an ``(x, y)`` point to a paired fixed-size reservoir sample.

    ``xs`` and ``ys`` are kept index-aligned: a point is either appended to
    both lists or written to the same slot in both.

    Args:
        xs: Reservoir of x coordinates; modified in place.
        ys: Reservoir of y coordinates; modified in place.
        x: The x coordinate of the newly observed point.
        y: The y coordinate of the newly observed point.
        seen: 1-based count of points offered so far, including this one.
        cap: Maximum reservoir size; nothing is stored when ``cap <= 0``.
    """
    if cap <= 0:
        return

    still_filling = len(xs) < cap
    if still_filling:
        xs.append(x)
        ys.append(y)
        return

    # Reservoir is full: replace a slot only when the draw lands inside it.
    slot = random.randint(0, seen - 1)
    if slot >= cap:
        return
    xs[slot] = x
    ys[slot] = y


def _collect_profiles(csv_path: Path) -> tuple[list[str], dict[str, dict[str, Any]]]:
    """Stream a CSV file once and build a per-column profile.

    The file is read row by row, so only a bounded sample of numeric values
    (at most ``SAMPLE_CAP`` per column) plus the value counters are held in
    memory.

    Each profile is a dict with these keys:

    * ``seen``: number of non-blank cells.
    * ``missing``: number of blank (or absent) cells.
    * ``is_numeric``: ``True`` while every non-blank cell parsed as a float
      and at least one finite number was observed.
    * ``numeric_seen``: number of finite numeric cells offered to the sample.
    * ``values``: reservoir sample of finite numeric values (emptied as soon
      as the column turns out to be non-numeric).
    * ``counter``: ``Counter`` of the stripped raw cell text.

    Args:
        csv_path: Path of the CSV file (read as UTF-8, BOM tolerated).

    Returns:
        ``(columns, profiles)`` where ``columns`` is the header in file order
        and ``profiles`` maps each column name to its profile. Both are empty
        when the file has no header row.
    """
    # Local import keeps this block self-contained; the module does not
    # import ``math`` at file level.
    import math

    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            return [], {}
        columns = [str(c) for c in reader.fieldnames]

        profiles: dict[str, dict[str, Any]] = {
            c: {
                "seen": 0,
                "numeric_seen": 0,
                "is_numeric": True,
                "missing": 0,
                "values": [],
                "counter": Counter(),
            }
            for c in columns
        }

        # A header may repeat a name; DictReader keeps one value per name, so
        # visit each distinct name once per row to avoid double counting.
        unique_columns = list(profiles)

        for row in reader:
            for c in unique_columns:
                profile = profiles[c]
                raw = (row.get(c) or "").strip()
                if not raw:
                    profile["missing"] += 1
                    continue
                profile["seen"] += 1

                if profile["is_numeric"]:
                    try:
                        num = float(raw)
                    except ValueError:
                        # First unparseable cell: the column is categorical,
                        # so the numeric sample is no longer needed.
                        profile["is_numeric"] = False
                        profile["values"] = []
                    else:
                        # float() also accepts "nan"/"inf"; such values
                        # cannot be binned or plotted, so keep them out of
                        # the sample.
                        if math.isfinite(num):
                            profile["numeric_seen"] += 1
                            _reservoir_float(
                                profile["values"], num, profile["numeric_seen"], SAMPLE_CAP
                            )

                profile["counter"][raw] += 1

    # A column with no finite numeric value at all (e.g. entirely blank) is
    # not treated as numeric.
    for profile in profiles.values():
        if profile["numeric_seen"] == 0:
            profile["is_numeric"] = False

    return columns, profiles


def create_file_charts(
csv_path: Path,
out_dir: Path,
Expand All @@ -44,35 +107,25 @@ def create_file_charts(
plt = _ensure_matplotlib()

out_dir.mkdir(parents=True, exist_ok=True)
with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is None:
return []
columns = [str(c) for c in reader.fieldnames]
rows = list(reader)
columns, profiles = _collect_profiles(csv_path)
if not columns:
return []

numeric_cols = [c for c in columns if _is_numeric_column(rows, c)][:max_numeric]
categorical_cols = [c for c in columns if c not in numeric_cols][:max_categorical]
numeric_cols = [c for c in columns if profiles[c]["is_numeric"]][:max_numeric]
categorical_cols = [c for c in columns if not profiles[c]["is_numeric"]][:max_categorical]

artifacts: list[str] = []
stem = _safe_stem(csv_path)

# template 1: numeric histogram + boxplot
for col in numeric_cols:
values = []
missing = 0
for row in rows:
raw = (row.get(col) or "").strip()
if raw:
values.append(float(raw))
else:
missing += 1
values: list[float] = profiles[col]["values"]
missing = profiles[col]["missing"]
if not values:
continue

fig = plt.figure(figsize=(7, 4))
plt.hist(values, bins=20)
plt.title(f"{stem} - {col} histogram")
plt.title(f"{stem} - {col} histogram(sample)")
plt.xlabel(col)
plt.ylabel("count")
plt.tight_layout()
Expand All @@ -83,35 +136,27 @@ def create_file_charts(

fig = plt.figure(figsize=(5, 4))
plt.boxplot(values, vert=True)
plt.title(f"{stem} - {col} boxplot")
plt.title(f"{stem} - {col} boxplot(sample)")
plt.ylabel(col)
plt.tight_layout()
out = out_dir / f"{stem}_{col}_box.png"
fig.savefig(out)
plt.close(fig)
artifacts.append(str(out))

# template 2: numeric missing ratio mini chart
total = len(values) + missing
total = profiles[col]["seen"] + missing
if total > 0:
fig = plt.figure(figsize=(5, 3))
plt.bar(["non_missing", "missing"], [len(values), missing])
plt.bar(["non_missing", "missing"], [profiles[col]["seen"], missing])
plt.title(f"{stem} - {col} missing overview")
plt.tight_layout()
out = out_dir / f"{stem}_{col}_missing.png"
fig.savefig(out)
plt.close(fig)
artifacts.append(str(out))

# template 3: categorical top-value bar
for col in categorical_cols:
counter: dict[str, int] = {}
for row in rows:
raw = (row.get(col) or "").strip()
if not raw:
continue
counter[raw] = counter.get(raw, 0) + 1
items = sorted(counter.items(), key=lambda x: x[1], reverse=True)[:10]
items = profiles[col]["counter"].most_common(TOP_K)
if not items:
continue

Expand All @@ -127,22 +172,30 @@ def create_file_charts(
plt.close(fig)
artifacts.append(str(out))

# template 4: scatter for first 2 numeric columns
if len(numeric_cols) >= 2:
x_col, y_col = numeric_cols[0], numeric_cols[1]
xs: list[float] = []
ys: list[float] = []
for row in rows:
x_raw = (row.get(x_col) or "").strip()
y_raw = (row.get(y_col) or "").strip()
if not x_raw or not y_raw:
continue
xs.append(float(x_raw))
ys.append(float(y_raw))
seen = 0
with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is not None:
for row in reader:
x_raw = (row.get(x_col) or "").strip()
y_raw = (row.get(y_col) or "").strip()
if not x_raw or not y_raw:
continue
try:
x, y = float(x_raw), float(y_raw)
except ValueError:
continue
seen += 1
_reservoir_pair(xs, ys, x, y, seen, SAMPLE_CAP)

if xs and ys:
fig = plt.figure(figsize=(6, 5))
plt.scatter(xs, ys, alpha=0.6, s=12)
plt.title(f"{stem} - {x_col} vs {y_col}")
plt.title(f"{stem} - {x_col} vs {y_col} scatter(sample)")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.tight_layout()
Expand All @@ -154,10 +207,7 @@ def create_file_charts(
return artifacts


def create_multi_charts(
csv_paths: list[Path],
out_dir: Path,
) -> dict[str, Any]:
def create_multi_charts(csv_paths: list[Path], out_dir: Path) -> dict[str, Any]:
results: dict[str, Any] = {}
for p in csv_paths:
results[str(p)] = create_file_charts(p, out_dir)
Expand Down