rad1092 · rad1092 · Feb 14, 2026 · Feb 14, 2026 · Feb 14, 2026
diff --git a/README.md b/README.md
@@ -7,20 +7,26 @@
 
 ## 0) 현재 완성도 빠른 진단
 
-현 시점 기준 기능 완성도(실사용 관점): **약 95%**
+현 시점 기준 기능 완성도(실사용 관점): **약 96%**
 
 - 완료
   - CSV 기초 요약(행/열/결측/숫자 통계)
   - BitNet용 프롬프트 자동 생성
   - 단일 CSV + 다중 CSV CLI 분석(`report`, `multi-analyze`)
   - 컬럼별 결측/고유/상위값 비율 산출
   - 다중 CSV 분석용 코드 가이드(판다스 예시 코드 자동 생성)
-  - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar, matplotlib 설치 시)
+  - 다중 CSV 자동 시각화 차트 생성(histogram/boxplot/top bar/scatter/missing-bar, matplotlib 설치 시)
   - 브라우저 UI(`bitnet-analyze ui`)
   - **윈도우 데스크톱 UI(`bitnet-analyze desktop`, `BitNet_Desktop_Start.bat`)**
 - 남은 과제
   - 대시보드형 시각화 UI 고도화(필터/드릴다운)
   - 데이터 전처리 규칙(날짜/카테고리 자동 인식) 고도화
+  - 수십 MB 이상 다중 파일에서 차트 생성 최적화(샘플링/청크화)
+
+### 처리 규모 가이드
+
+- 단일/다중 CSV 분석(`analyze`, `multi-analyze`)은 스트리밍 누적 통계를 사용해 수십 MB 수준까지 안정적으로 처리하도록 개선됨
+- 차트 생성(`--charts-dir`)은 matplotlib 기반이며 파일을 메모리에 적재해 그리므로 더 큰 파일에서는 샘플링 전략 권장
 
 ### 파일 붙여넣기 분석 가능 범위
 

diff --git a/bitnet_tools/multi_csv.py b/bitnet_tools/multi_csv.py
@@ -3,11 +3,12 @@
 import csv
 import json
 import math
+import random
 from collections import Counter, defaultdict
 from pathlib import Path
 from typing import Any
 
-from .analysis import _to_float, summarize_reader
+from .analysis import _to_float
 
 
 def _quantile(sorted_values: list[float], q: float) -> float:
@@ -39,14 +40,22 @@ def _outlier_ratio(values: list[float]) -> float:
     return round(outliers / len(sorted_values), 6)
 
 
-def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: str) -> dict[str, Any]:
-    table: dict[str, Counter[str]] = defaultdict(Counter)
-    for row in rows:
-        g = (row.get(group_col) or "").strip()
-        t = (row.get(target_col) or "").strip()
-        if g and t:
-            table[g][t] += 1
+def _reservoir_sample(values: list[float], new_value: float, seen: int, cap: int) -> None:
+    """Offer ``new_value`` to the bounded sample ``values``, mutating it in place.
+
+    While ``values`` holds fewer than ``cap`` items the value is appended.
+    Once it is full, a random index in ``[0, seen)`` is drawn and the value
+    overwrites that slot only when the index is below ``cap``; otherwise the
+    value is dropped. A non-positive ``cap`` turns sampling off entirely.
+
+    Args:
+        values: Current sample; modified in place.
+        new_value: Value being offered to the sample.
+        seen: Number of values offered so far; the caller passes the running
+            count after incrementing it for ``new_value``.
+        cap: Maximum number of values to retain.
+    """
+    if cap <= 0:
+        return
+    if len(values) >= cap:
+        # Draw over everything offered so far, so the chance of a new value
+        # entering the sample shrinks as the stream grows.
+        slot = random.randrange(0, seen)
+        if slot < cap:
+            values[slot] = new_value
+    else:
+        values.append(new_value)
 
+
+def _finalize_group_ratio_table(
+    table: dict[str, Counter[str]],
+    group_col: str,
+    target_col: str,
+) -> dict[str, Any]:
     ratio_table: dict[str, Any] = {}
     for g, counter in table.items():
         total = sum(counter.values())
@@ -65,46 +74,98 @@ def _group_ratio_table(rows: list[dict[str, str]], group_col: str, target_col: s
     }
 
 
-def _profile_rows(
-    rows: list[dict[str, str]],
-    columns: list[str],
+def _profile_csv_stream(
+    path: Path,
     group_column: str | None = None,
     target_column: str | None = None,
+    outlier_sample_cap: int = 20000,
 ) -> dict[str, Any]:
-    row_count = len(rows)
-    missing = {c: 0 for c in columns}
-    non_missing = {c: 0 for c in columns}
-    uniques: dict[str, set[str]] = {c: set() for c in columns}
-    value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns}
-
-    numeric_positive = {c: 0 for c in columns}
-    numeric_zero = {c: 0 for c in columns}
-    numeric_negative = {c: 0 for c in columns}
-    numeric_values: dict[str, list[float]] = {c: [] for c in columns}
-
-    for row in rows:
-        for col in columns:
-            raw = (row.get(col) or "").strip()
-            if not raw:
-                missing[col] += 1
-                continue
-            non_missing[col] += 1
-            uniques[col].add(raw)
-            value_counts[col][raw] += 1
-
-            num = _to_float(raw)
-            if num is not None:
-                numeric_values[col].append(num)
+    with path.open("r", encoding="utf-8-sig", newline="") as f:
+        reader = csv.DictReader(f)
+        if reader.fieldnames is None:
+            raise ValueError(f"CSV header not found: {path}")
+        columns = [str(c) for c in reader.fieldnames]
+
+        missing = {c: 0 for c in columns}
+        non_missing = {c: 0 for c in columns}
+        unique_sets: dict[str, set[str]] = {c: set() for c in columns}
+        value_counts: dict[str, Counter[str]] = {c: Counter() for c in columns}
+
+        numeric_positive = {c: 0 for c in columns}
+        numeric_zero = {c: 0 for c in columns}
+        numeric_negative = {c: 0 for c in columns}
+        numeric_counts = {c: 0 for c in columns}
+        numeric_sums = {c: 0.0 for c in columns}
+        numeric_mins: dict[str, float] = {}
+        numeric_maxs: dict[str, float] = {}
+        text_seen = {c: False for c in columns}
+
+        numeric_outlier_samples: dict[str, list[float]] = {c: [] for c in columns}
+
+        group_target_counter: dict[str, Counter[str]] = defaultdict(Counter)
+        row_count = 0
+
+        for row in reader:
+            row_count += 1
+            if group_column and target_column and group_column in columns and target_column in columns:
+                g = (row.get(group_column) or "").strip()
+                t = (row.get(target_column) or "").strip()
+                if g and t:
+                    group_target_counter[g][t] += 1
+
+            for col in columns:
+                raw = (row.get(col) or "").strip()
+                if raw == "":
+                    missing[col] += 1
+                    continue
+
+                non_missing[col] += 1
+                unique_sets[col].add(raw)
+                value_counts[col][raw] += 1
+
+                num = _to_float(raw)
+                if num is None:
+                    text_seen[col] = True
+                    continue
+
+                numeric_counts[col] += 1
+                numeric_sums[col] += num
+                if col not in numeric_mins or num < numeric_mins[col]:
+                    numeric_mins[col] = num
+                if col not in numeric_maxs or num > numeric_maxs[col]:
+                    numeric_maxs[col] = num
+
                 if num > 0:
                     numeric_positive[col] += 1
                 elif num < 0:
                     numeric_negative[col] += 1
                 else:
                     numeric_zero[col] += 1
 
-    summary = summarize_reader(rows, columns)
+                _reservoir_sample(
+                    numeric_outlier_samples[col],
+                    num,
+                    numeric_counts[col],
+                    outlier_sample_cap,
+                )
+
+    dtypes: dict[str, str] = {}
+    numeric_stats: dict[str, dict[str, float]] = {}
     profiles: dict[str, Any] = {}
+
     for col in columns:
+        count = numeric_counts[col]
+        if count > 0 and not text_seen[col]:
+            dtypes[col] = "float"
+            numeric_stats[col] = {
+                "count": float(count),
+                "mean": float(numeric_sums[col] / count),
+                "min": float(numeric_mins[col]),
+                "max": float(numeric_maxs[col]),
+            }
+        else:
+            dtypes[col] = "string"
+
         nn = non_missing[col]
         top = value_counts[col].most_common(5)
         top_values = [
@@ -123,28 +184,41 @@ def _profile_rows(
                 "positive_ratio": round(numeric_positive[col] / numeric_total, 6),
                 "zero_ratio": round(numeric_zero[col] / numeric_total, 6),
                 "negative_ratio": round(numeric_negative[col] / numeric_total, 6),
-                "outlier_ratio": _outlier_ratio(numeric_values[col]),
+                "outlier_ratio": _outlier_ratio(numeric_outlier_samples[col]),
             }
 
         dominant_value_ratio = top_values[0]["ratio"] if top_values else 0.0
         profiles[col] = {
             "missing_count": missing[col],
             "missing_ratio": round(missing[col] / row_count, 6) if row_count else 0.0,
             "non_missing_count": nn,
-            "unique_count": len(uniques[col]),
-            "unique_ratio": round(len(uniques[col]) / nn, 6) if nn else 0.0,
+            "unique_count": len(unique_sets[col]),
+            "unique_ratio": round(len(unique_sets[col]) / nn, 6) if nn else 0.0,
             "dominant_value_ratio": dominant_value_ratio,
             "top_values": top_values,
             "numeric_distribution": numeric_distribution,
-            "dtype": summary.dtypes[col],
+            "dtype": dtypes[col],
         }
 
+    summary = {
+        "row_count": row_count,
+        "column_count": len(columns),
+        "columns": columns,
+        "dtypes": dtypes,
+        "missing_counts": missing,
+        "numeric_stats": numeric_stats,
+    }
+
     group_target_ratio: dict[str, Any] | None = None
     if group_column and target_column and group_column in columns and target_column in columns:
-        group_target_ratio = _group_ratio_table(rows, group_column, target_column)
+        group_target_ratio = _finalize_group_ratio_table(
+            group_target_counter,
+            group_column,
+            target_column,
+        )
 
     return {
-        "summary": summary.to_dict(),
+        "summary": summary,
         "column_profiles": profiles,
         "group_target_ratio": group_target_ratio,
     }
@@ -189,16 +263,13 @@ def analyze_multiple_csv(
         if not path.exists():
             raise FileNotFoundError(f"CSV file not found: {path}")
 
-        with path.open("r", encoding="utf-8-sig", newline="") as f:
-            reader = csv.DictReader(f)
-            if reader.fieldnames is None:
-                raise ValueError(f"CSV header not found: {path}")
-            columns = [str(c) for c in reader.fieldnames]
-            rows = list(reader)
-
-        profiled = _profile_rows(rows, columns, group_column=group_column, target_column=target_column)
+        profiled = _profile_csv_stream(
+            path,
+            group_column=group_column,
+            target_column=target_column,
+        )
         total_rows += profiled["summary"]["row_count"]
-        all_columns.append(set(columns))
+        all_columns.append(set(profiled["summary"]["columns"]))
 
         files.append(
             {

diff --git a/bitnet_tools/visualize.py b/bitnet_tools/visualize.py
@@ -26,8 +26,10 @@ def _safe_stem(path: Path) -> str:
 def _ensure_matplotlib():
     try:
         import matplotlib
+
         matplotlib.use("Agg")
         import matplotlib.pyplot as plt
+
         return plt
     except Exception as exc:
         raise RuntimeError("matplotlib is required for chart generation") from exc
@@ -55,12 +57,16 @@ def create_file_charts(
     artifacts: list[str] = []
     stem = _safe_stem(csv_path)
 
+    # template 1: numeric histogram + boxplot
     for col in numeric_cols:
         values = []
+        missing = 0
         for row in rows:
             raw = (row.get(col) or "").strip()
             if raw:
                 values.append(float(raw))
+            else:
+                missing += 1
         if not values:
             continue
 
@@ -85,6 +91,19 @@ def create_file_charts(
         plt.close(fig)
         artifacts.append(str(out))
 
+        # template 2: numeric missing ratio mini chart
+        total = len(values) + missing
+        if total > 0:
+            fig = plt.figure(figsize=(5, 3))
+            plt.bar(["non_missing", "missing"], [len(values), missing])
+            plt.title(f"{stem} - {col} missing overview")
+            plt.tight_layout()
+            out = out_dir / f"{stem}_{col}_missing.png"
+            fig.savefig(out)
+            plt.close(fig)
+            artifacts.append(str(out))
+
+    # template 3: categorical top-value bar
     for col in categorical_cols:
         counter: dict[str, int] = {}
         for row in rows:
@@ -108,6 +127,30 @@ def create_file_charts(
         plt.close(fig)
         artifacts.append(str(out))
 
+    # template 4: scatter for first 2 numeric columns
+    if len(numeric_cols) >= 2:
+        x_col, y_col = numeric_cols[0], numeric_cols[1]
+        xs: list[float] = []
+        ys: list[float] = []
+        for row in rows:
+            x_raw = (row.get(x_col) or "").strip()
+            y_raw = (row.get(y_col) or "").strip()
+            if not x_raw or not y_raw:
+                continue
+            xs.append(float(x_raw))
+            ys.append(float(y_raw))
+        if xs and ys:
+            fig = plt.figure(figsize=(6, 5))
+            plt.scatter(xs, ys, alpha=0.6, s=12)
+            plt.title(f"{stem} - {x_col} vs {y_col}")
+            plt.xlabel(x_col)
+            plt.ylabel(y_col)
+            plt.tight_layout()
+            out = out_dir / f"{stem}_{x_col}_{y_col}_scatter.png"
+            fig.savefig(out)
+            plt.close(fig)
+            artifacts.append(str(out))
+
     return artifacts
 
 

diff --git a/tests/test_analysis.py b/tests/test_analysis.py
@@ -89,3 +89,16 @@ def test_multi_csv_schema_drift_and_group_ratio(tmp_path):
     assert "val" in result["schema_drift"]
     assert result["schema_drift"]["val"]["mean_range"] > 0
     assert result["files"][0]["group_target_ratio"] is not None
+
+
+def test_multi_csv_large_row_count(tmp_path):
+    """A 5000-row CSV is fully counted by ``analyze_multiple_csv``."""
+    expected_rows = 5000
+    data_rows = "".join(f"seoul,{n % 100},A\n" for n in range(expected_rows))
+    csv_file = tmp_path / "big.csv"
+    csv_file.write_text("city,val,type\n" + data_rows, encoding="utf-8")
+
+    report = analyze_multiple_csv([csv_file], "대용량")
+
+    # Both the aggregate and the per-file summary must report every data row.
+    assert report["total_row_count"] == expected_rows
+    assert report["files"][0]["summary"]["row_count"] == expected_rows