diff --git a/README.md b/README.md index f5cf342..045f3db 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ## 0) 현재 완성도 빠른 진단 -현 시점 기준 기능 완성도(실사용 관점): **약 95%** +현 시점 기준 기능 완성도(실사용 관점): **약 96%** - 완료 - CSV 기초 요약(행/열/결측/숫자 통계) @@ -15,12 +15,18 @@ - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`) - 컬럼별 결측/고유/상위값 비율 산출 - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성) - - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar, matplotlib 설치 시) + - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar/scatter/missing-bar, matplotlib 설치 시) - 브라우저 UI(`bitnet-analyze ui`) - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)** - 남은 과제 - 대시보드형 시각화 UI 고도화(필터/드릴다운) - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화 + - 수십 MB 이상 다중 파일에서 차트 생성 최적화(샘플링/청크화) + +### 처리 규모 가이드 + +- 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정 처리하도록 개선됨 +- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 파일을 메모리에 적재해 그리므로 더 큰 파일에서는 샘플링 전략 권장 ### 파일 붙여넣기 분석 가능 범위 diff --git a/bitnet_tools/multi_csv.py b/bitnet_tools/multi_csv.py index fef1bc2..729c78a 100644 --- a/bitnet_tools/multi_csv.py +++ b/bitnet_tools/multi_csv.py @@ -3,11 +3,12 @@ import csv import json import math +import random from collections import Counter, defaultdict from pathlib import Path from typing import Any -from .analysis import _to_float, summarize_reader +from .analysis import _to_float def _quantile(sorted_values: list[float], q: float) -> float: @@ -39,14 +40,22 @@ def _outlier_ratio(values: list[float]) -> float: return round(outliers / len(sorted_values), 6) -def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: str) -> dict[str, Any]: - table: dict[str, Counter[str]] = defaultdict(Counter) - for row in rows: - g = (row.get(group_col) or "").strip() - t = (row.get(target_col) or "").strip() - if g and t: - table[g][t] += 1 +def _reservoir_sample(values: list[float], new_value: float, seen: int, cap: int) -> None: + if cap <= 0: + return + if len(values) < cap: + values.append(new_value) + return + idx = random.randint(0, seen - 1) + if idx < cap: + values[idx] = new_value + +def _finalize_group_ratio_table( + table: dict[str, Counter[str]], + group_col: str, + target_col: str, +) -> dict[str, Any]: ratio_table: dict[str, Any] = {} for g, counter in table.items(): total = sum(counter.values()) @@ -65,36 +74,67 @@ def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: s } -def _profile_rows( - rows: list[dict[str, str]], - columns: list[str], +def _profile_csv_stream( + path: Path, group_column: str | None = None, target_column: str | None = None, + outlier_sample_cap: int = 20000, ) -> dict[str, Any]: - row_count = len(rows) - missing = {c: 0 for c in columns} - non_missing = {c: 0 for c in columns} - uniques: dict[str, set[str]] = {c: set() for c in columns} - value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns} - - numeric_positive = {c: 0 for c in columns} - numeric_zero = {c: 0 for c in columns} - numeric_negative = {c: 0 for c in columns} - numeric_values: dict[str, list[float]] = {c: [] for c in columns} - - for row in rows: - for col in columns: - raw = (row.get(col) or "").strip() - if not raw: - missing[col] += 1 - continue - non_missing[col] += 1 - uniques[col].add(raw) - value_counts[col][raw] += 1 - - num = _to_float(raw) - if num is not None: - numeric_values[col].append(num) + with path.open("r", encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError(f"CSV header not found: {path}") + columns = [str(c) for c in reader.fieldnames] + + missing = {c: 0 for c in columns} + non_missing = {c: 0 for c in columns} + unique_sets: dict[str, set[str]] = {c: set() for c in columns} + value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns} + + numeric_positive = {c: 0 for c in columns} + numeric_zero = {c: 0 for c in columns} + numeric_negative = {c: 0 for c in columns} + numeric_counts = {c: 0 for c in columns} + numeric_sums = {c: 0.0 for c in columns} + numeric_mins: dict[str, float] = {} + numeric_maxs: dict[str, float] = {} + text_seen = {c: False for c in columns} + + numeric_outlier_samples: dict[str, list[float]] = {c: [] for c in columns} + + group_target_counter: dict[str, Counter[str]] = defaultdict(Counter) + row_count = 0 + + for row in reader: + row_count += 1 + if group_column and target_column and group_column in columns and target_column in columns: + g = (row.get(group_column) or "").strip() + t = (row.get(target_column) or "").strip() + if g and t: + group_target_counter[g][t] += 1 + + for col in columns: + raw = (row.get(col) or "").strip() + if raw == "": + missing[col] += 1 + continue + + non_missing[col] += 1 + unique_sets[col].add(raw) + value_counts[col][raw] += 1 + + num = _to_float(raw) + if num is None: + text_seen[col] = True + continue + + numeric_counts[col] += 1 + numeric_sums[col] += num + if col not in numeric_mins or num < numeric_mins[col]: + numeric_mins[col] = num + if col not in numeric_maxs or num > numeric_maxs[col]: + numeric_maxs[col] = num + if num > 0: numeric_positive[col] += 1 elif num < 0: @@ -102,9 +142,30 @@ def _profile_rows( else: numeric_zero[col] += 1 - summary = summarize_reader(rows, columns) + _reservoir_sample( + numeric_outlier_samples[col], + num, + numeric_counts[col], + outlier_sample_cap, + ) + + dtypes: dict[str, str] = {} + numeric_stats: dict[str, dict[str, float]] = {} profiles: dict[str, Any] = {} + for col in columns: + count = numeric_counts[col] + if count > 0 and not text_seen[col]: + dtypes[col] = "float" + numeric_stats[col] = { + "count": float(count), + "mean": float(numeric_sums[col] / count), + "min": float(numeric_mins[col]), + "max": float(numeric_maxs[col]), + } + else: + dtypes[col] = "string" + nn = non_missing[col] top = value_counts[col].most_common(5) top_values = [ @@ -123,7 +184,7 @@ def _profile_rows( "positive_ratio": round(numeric_positive[col] / numeric_total, 6), "zero_ratio": round(numeric_zero[col] / numeric_total, 6), "negative_ratio": round(numeric_negative[col] / numeric_total, 6), - "outlier_ratio": _outlier_ratio(numeric_values[col]), + "outlier_ratio": _outlier_ratio(numeric_outlier_samples[col]), } dominant_value_ratio = top_values[0]["ratio"] if top_values else 0.0 @@ -131,20 +192,33 @@ def _profile_rows( "missing_count": missing[col], "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0, "non_missing_count": nn, - "unique_count": len(uniques[col]), - "unique_ratio": round(len(uniques[col]) / nn, 6) if nn else 0.0, + "unique_count": len(unique_sets[col]), + "unique_ratio": round(len(unique_sets[col]) / nn, 6) if nn else 0.0, "dominant_value_ratio": dominant_value_ratio, "top_values": top_values, "numeric_distribution": numeric_distribution, - "dtype": summary.dtypes[col], + "dtype": dtypes[col], } + summary = { + "row_count": row_count, + "column_count": len(columns), + "columns": columns, + "dtypes": dtypes, + "missing_counts": missing, + "numeric_stats": numeric_stats, + } + group_target_ratio: dict[str, Any] | None = None if group_column and target_column and group_column in columns and target_column in columns: - group_target_ratio = _group_ratio_table(rows, group_column, target_column) + group_target_ratio = _finalize_group_ratio_table( + group_target_counter, + group_column, + target_column, + ) return { - "summary": summary.to_dict(), + "summary": summary, "column_profiles": profiles, "group_target_ratio": group_target_ratio, } @@ -189,16 +263,13 @@ def analyze_multiple_csv( if not path.exists(): raise FileNotFoundError(f"CSV file not found: {path}") - with path.open("r", encoding="utf-8-sig", newline="") as f: - reader = csv.DictReader(f) - if reader.fieldnames is None: - raise ValueError(f"CSV header not found: {path}") - columns = [str(c) for c in reader.fieldnames] - rows = list(reader) - - profiled = _profile_rows(rows, columns, group_column=group_column, target_column=target_column) + profiled = _profile_csv_stream( + path, + group_column=group_column, + target_column=target_column, + ) total_rows += profiled["summary"]["row_count"] - all_columns.append(set(columns)) + all_columns.append(set(profiled["summary"]["columns"])) files.append( { diff --git a/bitnet_tools/visualize.py b/bitnet_tools/visualize.py index 4c3b101..c3b34ff 100644 --- a/bitnet_tools/visualize.py +++ b/bitnet_tools/visualize.py @@ -26,8 +26,10 @@ def _safe_stem(path: Path) -> str: def _ensure_matplotlib(): try: import matplotlib + matplotlib.use("Agg") import matplotlib.pyplot as plt + return plt except Exception as exc: raise RuntimeError("matplotlib is required for chart generation") from exc @@ -55,12 +57,16 @@ def create_file_charts( artifacts: list[str] = [] stem = _safe_stem(csv_path) + # template 1: numeric histogram + boxplot for col in numeric_cols: values = [] + missing = 0 for row in rows: raw = (row.get(col) or "").strip() if raw: values.append(float(raw)) + else: + missing += 1 if not values: continue @@ -85,6 +91,19 @@ def create_file_charts( plt.close(fig) artifacts.append(str(out)) + # template 2: numeric missing ratio mini chart + total = len(values) + missing + if total > 0: + fig = plt.figure(figsize=(5, 3)) + plt.bar(["non_missing", "missing"], [len(values), missing]) + plt.title(f"{stem} - {col} missing overview") + plt.tight_layout() + out = out_dir / f"{stem}_{col}_missing.png" + fig.savefig(out) + plt.close(fig) + artifacts.append(str(out)) + + # template 3: categorical top-value bar for col in categorical_cols: counter: dict[str, int] = {} for row in rows: @@ -108,6 +127,30 @@ def create_file_charts( plt.close(fig) artifacts.append(str(out)) + # template 4: scatter for first 2 numeric columns + if len(numeric_cols) >= 2: + x_col, y_col = numeric_cols[0], numeric_cols[1] + xs: list[float] = [] + ys: list[float] = [] + for row in rows: + x_raw = (row.get(x_col) or "").strip() + y_raw = (row.get(y_col) or "").strip() + if not x_raw or not y_raw: + continue + xs.append(float(x_raw)) + ys.append(float(y_raw)) + if xs and ys: + fig = plt.figure(figsize=(6, 5)) + plt.scatter(xs, ys, alpha=0.6, s=12) + plt.title(f"{stem} - {x_col} vs {y_col}") + plt.xlabel(x_col) + plt.ylabel(y_col) + plt.tight_layout() + out = out_dir / f"{stem}_{x_col}_{y_col}_scatter.png" + fig.savefig(out) + plt.close(fig) + artifacts.append(str(out)) + return artifacts diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 41de94b..5fa0806 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -89,3 +89,16 @@ def test_multi_csv_schema_drift_and_group_ratio(tmp_path): assert "val" in result["schema_drift"] assert result["schema_drift"]["val"]["mean_range"] > 0 assert result["files"][0]["group_target_ratio"] is not None + + +def test_multi_csv_large_row_count(tmp_path): + p = tmp_path / "big.csv" + lines = ["city,val,type"] + for i in range(5000): + lines.append(f"seoul,{i % 100},A") + p.write_text("\n".join(lines) + "\n", encoding="utf-8") + + result = analyze_multiple_csv([p], "대용량") + + assert result["total_row_count"] == 5000 + assert result["files"][0]["summary"]["row_count"] == 5000