-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_load.py
More file actions
53 lines (44 loc) · 1.81 KB
/
data_load.py
File metadata and controls
53 lines (44 loc) · 1.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""Load ELSA core wave .dta files and normalise column names."""
from __future__ import annotations

from pathlib import Path
from typing import Iterable, Optional, Union

import pandas as pd

import config
def find_wave_path(stata_dir: Union[str, Path], wave: int) -> Path:
    """Return path to core wave file matching wave_*_elsa_data*.dta.

    Args:
        stata_dir: Directory holding the wave .dta files. A plain string is
            accepted as well as a Path and is converted before use.
        wave: Wave number substituted into the file-name pattern.

    Returns:
        The first match in sorted order when several files fit the pattern.

    Raises:
        FileNotFoundError: If stata_dir is not a directory, or if no file in
            it matches the pattern.
    """
    # Coerce up front so string paths (e.g. read from an environment
    # variable) work instead of failing on a missing .is_dir() attribute.
    root = Path(stata_dir)
    if not root.is_dir():
        raise FileNotFoundError(
            f"Stata directory not found: {root}. "
            "Set ELSA_STATA_DIR or place data under data/stata13_se (local)."
        )

    pattern = f"wave_{wave}_elsa_data*.dta"
    candidates = sorted(root.glob(pattern))
    if not candidates:
        raise FileNotFoundError(f"No file matching {pattern} in {root}")

    # Sorting keeps the choice deterministic when several files match.
    return candidates[0]
def load_wave(
    wave: int,
    stata_dir: Optional[Path] = None,
    columns: Optional[Iterable[str]] = None,
    lowercase: bool = True,
) -> pd.DataFrame:
    """
    Read a single wave .dta file into a DataFrame.

    Categorical conversion is switched off, so coded values (including
    survey missing codes) stay numeric for recode.py.

    Args:
        wave: Wave number used to locate the file via find_wave_path.
        stata_dir: Directory to search; config.STATA_DIR is used when this
            is None (or otherwise falsy).
        columns: Optional subset of columns to read. None or an empty
            collection means no subset is requested.
        lowercase: When True, column names are converted to lower case
            after reading.

    Returns:
        The loaded DataFrame.
    """
    source_dir = stata_dir if stata_dir else config.STATA_DIR
    wave_file = find_wave_path(source_dir, wave)

    # Materialise the iterable only when a non-empty selection was given.
    selected = list(columns) if columns else None
    frame = pd.read_stata(
        wave_file,
        columns=selected,
        convert_categoricals=False,
    )

    if not lowercase:
        return frame

    # str() keeps this safe even if a label is not already a string.
    frame.columns = [str(label).lower() for label in frame.columns]
    return frame
def normalise_id_column(df: pd.DataFrame, id_col: str = config.ID_COL) -> pd.DataFrame:
    """Ensure ID column exists; try common alternatives if renamed.

    Column labels are matched case-insensitively. The requested id_col is
    tried first, then the aliases "idauniq", "idauniq_w", "id" and "pid" in
    that order; the first label found is renamed to id_col.

    Args:
        df: Frame whose columns are searched.
        id_col: Name the identifier column should carry in the result.

    Returns:
        The result of df.rename with the matched column renamed to id_col.

    Raises:
        KeyError: If neither id_col nor any alias is present.
    """
    # str() tolerates non-string labels (e.g. integer column names), which
    # would otherwise raise AttributeError on .lower().
    by_lower = {str(col).lower(): col for col in df.columns}

    # The requested name takes priority over the fallback aliases.
    candidates = (id_col.lower(), "idauniq", "idauniq_w", "id", "pid")
    for name in candidates:
        if name in by_lower:
            return df.rename(columns={by_lower[name]: id_col})

    raise KeyError(f"No ID column found (tried {id_col} and common aliases).")