-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_acs_data.py
More file actions
71 lines (57 loc) · 3 KB
/
clean_acs_data.py
File metadata and controls
71 lines (57 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
ACS Data Cleaning Script
Cleans the circus-like ACS data and creates random samples by income tertile
"""
import pandas as pd
import numpy as np
def clean_acs_data(filepath):
# Load and transpose the data
df = pd.read_csv(filepath)
clean_df = df.T.reset_index()
clean_df.columns = ['Full_Label', 'Value']
# Split labels and pivot
clean_df[['County', 'Metric']] = clean_df['Full_Label'].str.split('!!', expand=True)
clean_df = clean_df.pivot(index='County', columns='Metric', values='Value').reset_index()
# Clean up and remove NaN column
clean_df = clean_df[['County', 'Estimate', 'Margin of Error']]
clean_df.columns = ['County', 'Income', 'MOE']
clean_df = clean_df[~clean_df['County'].str.contains('Label|United States')]
# Convert income to numeric
clean_df['Income'] = clean_df['Income'].str.replace(',', '').str.replace('±', '')
clean_df['Income'] = pd.to_numeric(clean_df['Income'], errors='coerce')
clean_df = clean_df.dropna(subset=['Income'])
return clean_df
def create_random_sample(clean_df, total_samples=100, random_seed=42):
# Create tertiles and random sample
clean_df['Tertile'] = pd.qcut(clean_df['Income'], q=3, labels=['Low', 'Medium', 'High'])
np.random.seed(random_seed)
n_low = 33 if (clean_df['Tertile'] == 'Low').sum() >= 33 else (clean_df['Tertile'] == 'Low').sum()
n_medium = 33 if (clean_df['Tertile'] == 'Medium').sum() >= 33 else (clean_df['Tertile'] == 'Medium').sum()
n_high = 34 if (clean_df['Tertile'] == 'High').sum() >= 34 else (clean_df['Tertile'] == 'High').sum()
low_sample = clean_df[clean_df['Tertile'] == 'Low'].sample(n=n_low, random_state=random_seed)
medium_sample = clean_df[clean_df['Tertile'] == 'Medium'].sample(n=n_medium, random_state=random_seed)
high_sample = clean_df[clean_df['Tertile'] == 'High'].sample(n=n_high, random_state=random_seed)
return pd.concat([low_sample, medium_sample, high_sample])
# Example usage
if __name__ == "__main__":
cleaned_data = clean_acs_data('/Users/phinnmarkson/Desktop/ACS_cleaned_.csv')
random_sample = create_random_sample(cleaned_data)
random_sample.to_csv('random_county_sample.csv', index=False)
print("Done! Saved random_county_sample.csv")
# Optional: save cleaned data for transparency
cleaned_data.to_csv('cleaned_income_data.csv', index=False)
print("Saved cleaned_income_data.csv")
# Reload merged file to double-check contents
import os
merged_path = os.path.expanduser("~/Desktop/acs-data/random_county_sample_edit.csv")
if os.path.exists(merged_path):
merged_df = pd.read_csv(merged_path)
print("Merged sample preview:")
print(merged_df.head())
else:
print("Merged dataset not found at", merged_path)
# Auto git commit if file exists
os.system("git add cleaned_income_data.csv")
os.system("git add random_county_sample.csv")
os.system('git commit -m "Add income tertile sample and cleaned data export"')
print("Git commit created (if repo present)")