-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrecode.py
More file actions
49 lines (39 loc) · 1.51 KB
/
recode.py
File metadata and controls
49 lines (39 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""Survey missing codes, CES-D sum, binary outcome."""
from __future__ import annotations
from typing import Iterable, Optional, Sequence, Union
import numpy as np
import pandas as pd
import config
def apply_missing_codes(
    df: pd.DataFrame,
    columns: Sequence[str],
    codes: Iterable[Union[int, float]] = config.MISSING_CODES,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` in which every value equal to one of ``codes``
    is replaced by NaN, restricted to the requested ``columns``.

    Column names in ``columns`` that are not present in ``df`` are ignored.
    The input frame is never modified.
    """
    result = df.copy()
    # Only touch requested columns that actually exist in the frame.
    targets = [name for name in columns if name in result.columns]
    if targets:
        for missing_code in codes:
            cleaned = result[targets].replace(missing_code, np.nan)
            result[targets] = cleaned
    return result
def cesd_sum_score(df: pd.DataFrame, items: Optional[Sequence[str]] = None) -> pd.Series:
    """
    Sum of CES-D item scores (count of depressive-symptom endorsements).

    ELSA often codes items as 1 = symptom present, 2 = absent; we map 2 -> 0 before summing.
    If your extract uses 0/1 already, set CESD_MAP_12_TO_01 = False in config.

    ``items`` lists the item columns to use; when it is None or empty,
    ``config.CESD_ITEMS`` is used. Any sequence of names is accepted,
    including a ``pd.Index`` or numpy array. Only the items that exist as
    columns in ``df`` are summed. Values are coerced to numeric first, so
    non-numeric entries count as missing.

    Rows with any missing item become NaN in the sum. If none of the items
    is present in ``df``, an all-NaN Series indexed like ``df`` is returned.
    """
    # Materialise before testing for emptiness: `items or default` raises
    # ValueError ("truth value is ambiguous") for pd.Index / ndarray inputs.
    requested = [] if items is None else list(items)
    if not requested:
        requested = list(config.CESD_ITEMS)
    present = [c for c in requested if c in df.columns]
    if not present:
        return pd.Series(np.nan, index=df.index)
    sub = df[present].apply(pd.to_numeric, errors="coerce")
    if config.CESD_MAP_12_TO_01:
        sub = sub.replace({2: 0, 1: 1})
    # min_count == number of items: any NaN in a row makes that row's sum NaN.
    return sub.sum(axis=1, min_count=len(present))
def cesd_binary(
    sum_score: pd.Series,
    threshold: int = config.CESD_BINARY_THRESHOLD,
) -> pd.Series:
    """
    Dichotomise a CES-D sum score: 1 if ``sum_score >= threshold``, else 0.

    Missing scores stay missing. ``NaN >= threshold`` evaluates to False, so
    without the explicit mask below a row with an unknown score would be
    coded 0 (below the cut-off) instead of NaN.

    Returns an integer Series when ``sum_score`` has no missing values, and
    a float Series (0.0 / 1.0 / NaN) when it does.
    """
    flags = (sum_score >= threshold).astype(int)
    missing = sum_score.isna()
    if missing.any():
        # Integers cannot hold NaN; switch to float and blank out unknowns.
        flags = flags.astype(float).mask(missing)
    return flags