Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,26 @@

## 0) 현재 완성도 빠른 진단

현 시점 기준 기능 완성도(실사용 관점): **약 95%**
현 시점 기준 기능 완성도(실사용 관점): **약 96%**

- 완료
- CSV 기초 요약(행/열/결측/숫자 통계)
- BitNet용 프롬프트 자동 생성
- 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`)
- 컬럼별 결측/고유/상위값 비율 산출
- 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성)
- 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar, matplotlib 설치 시)
- 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar/scatter/missing-bar, matplotlib 설치 시)
- 브라우저 UI(`bitnet-analyze ui`)
- **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)**
- 남은 과제
- 대시보드형 시각화 UI 고도화(필터/드릴다운)
- 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화
- 수십 MB 이상 다중 파일에서 차트 생성 최적화(샘플링/청크화)

### 처리 규모 가이드

- 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정적으로 처리하도록 개선됨
- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 파일을 메모리에 적재해 그리므로 더 큰 파일에서는 샘플링 전략 권장

### 파일 붙여넣기 분석 가능 범위

Expand Down
173 changes: 122 additions & 51 deletions bitnet_tools/multi_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import csv
import json
import math
import random
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any

from .analysis import _to_float, summarize_reader
from .analysis import _to_float


def _quantile(sorted_values: list[float], q: float) -> float:
Expand Down Expand Up @@ -39,14 +40,22 @@ def _outlier_ratio(values: list[float]) -> float:
return round(outliers / len(sorted_values), 6)


def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: str) -> dict[str, Any]:
table: dict[str, Counter[str]] = defaultdict(Counter)
for row in rows:
g = (row.get(group_col) or "").strip()
t = (row.get(target_col) or "").strip()
if g and t:
table[g][t] += 1
def _reservoir_sample(values: list[float], new_value: float, seen: int, cap: int) -> None:
    """Fold ``new_value`` into ``values``, a size-bounded sample kept in place.

    Args:
        values: The sample list; it is mutated directly.
        new_value: The observation being offered to the sample.
        seen: Upper bound (exclusive) for the random slot that is drawn once
            the sample is full. NOTE(review): the caller appears to pass the
            running count that already includes ``new_value``, which is what
            the usual reservoir-sampling update expects - confirm.
        cap: Maximum number of values to retain. A value of zero or less
            disables sampling entirely.

    Returns:
        None. The only effect is the in-place change to ``values``.
    """
    if cap <= 0:
        return

    if len(values) >= cap:
        # Sample is full: draw a slot in [0, seen - 1] and overwrite only
        # when the slot lands inside the retained window.
        slot = random.randint(0, seen - 1)
        if slot < cap:
            values[slot] = new_value
    else:
        # Still filling up: every value is kept.
        values.append(new_value)


def _finalize_group_ratio_table(
table: dict[str, Counter[str]],
group_col: str,
target_col: str,
) -> dict[str, Any]:
ratio_table: dict[str, Any] = {}
for g, counter in table.items():
total = sum(counter.values())
Expand All @@ -65,46 +74,98 @@ def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: s
}


def _profile_rows(
rows: list[dict[str, str]],
columns: list[str],
def _profile_csv_stream(
path: Path,
group_column: str | None = None,
target_column: str | None = None,
outlier_sample_cap: int = 20000,
) -> dict[str, Any]:
row_count = len(rows)
missing = {c: 0 for c in columns}
non_missing = {c: 0 for c in columns}
uniques: dict[str, set[str]] = {c: set() for c in columns}
value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns}

numeric_positive = {c: 0 for c in columns}
numeric_zero = {c: 0 for c in columns}
numeric_negative = {c: 0 for c in columns}
numeric_values: dict[str, list[float]] = {c: [] for c in columns}

for row in rows:
for col in columns:
raw = (row.get(col) or "").strip()
if not raw:
missing[col] += 1
continue
non_missing[col] += 1
uniques[col].add(raw)
value_counts[col][raw] += 1

num = _to_float(raw)
if num is not None:
numeric_values[col].append(num)
with path.open("r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is None:
raise ValueError(f"CSV header not found: {path}")
columns = [str(c) for c in reader.fieldnames]

missing = {c: 0 for c in columns}
non_missing = {c: 0 for c in columns}
unique_sets: dict[str, set[str]] = {c: set() for c in columns}
value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns}

numeric_positive = {c: 0 for c in columns}
numeric_zero = {c: 0 for c in columns}
numeric_negative = {c: 0 for c in columns}
numeric_counts = {c: 0 for c in columns}
numeric_sums = {c: 0.0 for c in columns}
numeric_mins: dict[str, float] = {}
numeric_maxs: dict[str, float] = {}
text_seen = {c: False for c in columns}

numeric_outlier_samples: dict[str, list[float]] = {c: [] for c in columns}

group_target_counter: dict[str, Counter[str]] = defaultdict(Counter)
row_count = 0

for row in reader:
row_count += 1
if group_column and target_column and group_column in columns and target_column in columns:
g = (row.get(group_column) or "").strip()
t = (row.get(target_column) or "").strip()
if g and t:
group_target_counter[g][t] += 1

for col in columns:
raw = (row.get(col) or "").strip()
if raw == "":
missing[col] += 1
continue

non_missing[col] += 1
unique_sets[col].add(raw)
value_counts[col][raw] += 1

num = _to_float(raw)
if num is None:
text_seen[col] = True
continue

numeric_counts[col] += 1
numeric_sums[col] += num
if col not in numeric_mins or num < numeric_mins[col]:
numeric_mins[col] = num
if col not in numeric_maxs or num > numeric_maxs[col]:
numeric_maxs[col] = num

if num > 0:
numeric_positive[col] += 1
elif num < 0:
numeric_negative[col] += 1
else:
numeric_zero[col] += 1

summary = summarize_reader(rows, columns)
_reservoir_sample(
numeric_outlier_samples[col],
num,
numeric_counts[col],
outlier_sample_cap,
)

dtypes: dict[str, str] = {}
numeric_stats: dict[str, dict[str, float]] = {}
profiles: dict[str, Any] = {}

for col in columns:
count = numeric_counts[col]
if count > 0 and not text_seen[col]:
dtypes[col] = "float"
numeric_stats[col] = {
"count": float(count),
"mean": float(numeric_sums[col] / count),
"min": float(numeric_mins[col]),
"max": float(numeric_maxs[col]),
}
else:
dtypes[col] = "string"

nn = non_missing[col]
top = value_counts[col].most_common(5)
top_values = [
Expand All @@ -123,28 +184,41 @@ def _profile_rows(
"positive_ratio": round(numeric_positive[col] / numeric_total, 6),
"zero_ratio": round(numeric_zero[col] / numeric_total, 6),
"negative_ratio": round(numeric_negative[col] / numeric_total, 6),
"outlier_ratio": _outlier_ratio(numeric_values[col]),
"outlier_ratio": _outlier_ratio(numeric_outlier_samples[col]),
}

dominant_value_ratio = top_values[0]["ratio"] if top_values else 0.0
profiles[col] = {
"missing_count": missing[col],
"missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0,
"non_missing_count": nn,
"unique_count": len(uniques[col]),
"unique_ratio": round(len(uniques[col]) / nn, 6) if nn else 0.0,
"unique_count": len(unique_sets[col]),
"unique_ratio": round(len(unique_sets[col]) / nn, 6) if nn else 0.0,
"dominant_value_ratio": dominant_value_ratio,
"top_values": top_values,
"numeric_distribution": numeric_distribution,
"dtype": summary.dtypes[col],
"dtype": dtypes[col],
}

summary = {
"row_count": row_count,
"column_count": len(columns),
"columns": columns,
"dtypes": dtypes,
"missing_counts": missing,
"numeric_stats": numeric_stats,
}

group_target_ratio: dict[str, Any] | None = None
if group_column and target_column and group_column in columns and target_column in columns:
group_target_ratio = _group_ratio_table(rows, group_column, target_column)
group_target_ratio = _finalize_group_ratio_table(
group_target_counter,
group_column,
target_column,
)

return {
"summary": summary.to_dict(),
"summary": summary,
"column_profiles": profiles,
"group_target_ratio": group_target_ratio,
}
Expand Down Expand Up @@ -189,16 +263,13 @@ def analyze_multiple_csv(
if not path.exists():
raise FileNotFoundError(f"CSV file not found: {path}")

with path.open("r", encoding="utf-8-sig", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is None:
raise ValueError(f"CSV header not found: {path}")
columns = [str(c) for c in reader.fieldnames]
rows = list(reader)

profiled = _profile_rows(rows, columns, group_column=group_column, target_column=target_column)
profiled = _profile_csv_stream(
path,
group_column=group_column,
target_column=target_column,
)
total_rows += profiled["summary"]["row_count"]
all_columns.append(set(columns))
all_columns.append(set(profiled["summary"]["columns"]))

files.append(
{
Expand Down
43 changes: 43 additions & 0 deletions bitnet_tools/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ def _safe_stem(path: Path) -> str:
def _ensure_matplotlib():
    """Return the ``matplotlib.pyplot`` module with the "Agg" backend selected.

    The imports are done lazily inside the function, and the backend is set
    before ``pyplot`` is imported.

    Raises:
        RuntimeError: If importing matplotlib, selecting the backend, or
            importing pyplot fails for any reason. The original exception is
            attached as the cause.
    """
    try:
        import matplotlib

        matplotlib.use("Agg")
        from matplotlib import pyplot
    except Exception as exc:
        raise RuntimeError("matplotlib is required for chart generation") from exc
    return pyplot
Expand Down Expand Up @@ -55,12 +57,16 @@ def create_file_charts(
artifacts: list[str] = []
stem = _safe_stem(csv_path)

# template 1: numeric histogram + boxplot
for col in numeric_cols:
values = []
missing = 0
for row in rows:
raw = (row.get(col) or "").strip()
if raw:
values.append(float(raw))
else:
missing += 1
if not values:
continue

Expand All @@ -85,6 +91,19 @@ def create_file_charts(
plt.close(fig)
artifacts.append(str(out))

# template 2: numeric missing ratio mini chart
total = len(values) + missing
if total > 0:
fig = plt.figure(figsize=(5, 3))
plt.bar(["non_missing", "missing"], [len(values), missing])
plt.title(f"{stem} - {col} missing overview")
plt.tight_layout()
out = out_dir / f"{stem}_{col}_missing.png"
fig.savefig(out)
plt.close(fig)
artifacts.append(str(out))

# template 3: categorical top-value bar
for col in categorical_cols:
counter: dict[str, int] = {}
for row in rows:
Expand All @@ -108,6 +127,30 @@ def create_file_charts(
plt.close(fig)
artifacts.append(str(out))

# template 4: scatter for first 2 numeric columns
if len(numeric_cols) >= 2:
x_col, y_col = numeric_cols[0], numeric_cols[1]
xs: list[float] = []
ys: list[float] = []
for row in rows:
x_raw = (row.get(x_col) or "").strip()
y_raw = (row.get(y_col) or "").strip()
if not x_raw or not y_raw:
continue
xs.append(float(x_raw))
ys.append(float(y_raw))
if xs and ys:
fig = plt.figure(figsize=(6, 5))
plt.scatter(xs, ys, alpha=0.6, s=12)
plt.title(f"{stem} - {x_col} vs {y_col}")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.tight_layout()
out = out_dir / f"{stem}_{x_col}_{y_col}_scatter.png"
fig.savefig(out)
plt.close(fig)
artifacts.append(str(out))

return artifacts


Expand Down
13 changes: 13 additions & 0 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,16 @@ def test_multi_csv_schema_drift_and_group_ratio(tmp_path):
assert "val" in result["schema_drift"]
assert result["schema_drift"]["val"]["mean_range"] > 0
assert result["files"][0]["group_target_ratio"] is not None


def test_multi_csv_large_row_count(tmp_path):
    """A 5000-row CSV is counted in full, both overall and per file."""
    expected_rows = 5000
    csv_path = tmp_path / "big.csv"

    # One header line followed by ``expected_rows`` data lines; ``val``
    # cycles through 0..99.
    data_lines = [f"seoul,{n % 100},A" for n in range(expected_rows)]
    csv_path.write_text(
        "\n".join(["city,val,type", *data_lines]) + "\n",
        encoding="utf-8",
    )

    result = analyze_multiple_csv([csv_path], "대용량")

    assert result["total_row_count"] == expected_rows
    assert result["files"][0]["summary"]["row_count"] == expected_rows