-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDatasetImprover.R
More file actions
96 lines (79 loc) · 3.9 KB
/
DatasetImprover.R
File metadata and controls
96 lines (79 loc) · 3.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
## This script loads the nationwide pollution data and adds information that is useful to me:
library(dplyr)
library(tidyr)
library(ggmap)
library(censusapi)
df = read.csv("pollution_us_2000_2016.csv", header = TRUE)
mycensuskey = "8d1c885da552038d22b3e2f10785a819bf822261"
# Get U.S. census data to extract populations for each city of interest:
popestimate <- getCensus(name = "pep/population",
key = mycensuskey,
vintage = 2016,
vars = c("POP", "GEONAME", "DATE_DESC"),
region = "place:*")
popestimate$City_State = gsub(' city', '', popestimate$GEONAME)
# Remove any incomplete entries:
df = df %>% filter(complete.cases(.))
# Remove locations "not in a city":
df = df %>% filter(!(City == 'Not in a city'))
df$Date.Local = as.xts(df$Date.Local)
# Average all data down to a city level:
df = df %>%
group_by(City, Date.Local) %>%
summarise(
State = first(State), County = first(County),
NO2.Units = first(NO2.Units), NO2.Mean = mean(NO2.Mean), NO2.AQI = mean(NO2.AQI),
O3.Units = first(O3.Units), O3.Mean = mean(O3.Mean), O3.AQI = mean(O3.AQI),
SO2.Units = first(SO2.Units), SO2.Mean = mean(SO2.Mean), SO2.AQI = mean(SO2.AQI),
CO.Units = first(CO.Units), CO.Mean = mean(CO.Mean), CO.AQI = mean(CO.AQI)
)
# Assign each date a month
df <- df %>%
mutate(measurementyear = year(Date.Local),
month = factor(months(Date.Local),
levels = c('January','February','March','April','May','June','July',
'August','September','October','November','December')
)
)
# Fetch location (lat, long) information for every city:
# To limit queries, reduce list into unique set of cities:
df$City_State = paste(as.character(df$City),as.character(df$State), sep = ', ')
allCityStates = as.character(distinct(df,City_State)[["City_State"]])
citycoords = geocode(allCityStates, source = "dsk")
citycoords$City_State = as.character(distinct(df,City_State)[["City_State"]])
# Join the location into main dataset on City_State
df = inner_join(df, citycoords, by = 'City_State')
# Join population
df = inner_join(df, popestimate, by = "City_State")
# Find number of measurements for each location:
df %>% group_by(City) %>%
summarize(NumberOfMeasurements = n())
# Convert measurements to a common unit:
# Assign AQI labels to numeric values:
df$NO2.AQItext = cut(df$NO2.AQI,
breaks = c(0,50,100,150,200,300,500),
labels = c('Good','Moderate',
'Unhealthy for Sensitive Groups',
'Unhealthy','Very Unhealthy','Hazardous'))
df$CO.AQItext = cut(df$CO.AQI,
breaks = c(0,50,100,150,200,300,500),
labels = c('Good','Moderate',
'Unhealthy for Sensitive Groups',
'Unhealthy','Very Unhealthy','Hazardous'))
df$SO2.AQItext = cut(df$SO2.AQI,
breaks = c(0,50,100,150,200,300,500),
labels = c('Good','Moderate',
'Unhealthy for Sensitive Groups',
'Unhealthy','Very Unhealthy','Hazardous'))
df$O3.AQItext = cut(df$O3.AQI,
breaks = c(0,50,100,150,200,300,500),
labels = c('Good','Moderate',
'Unhealthy for Sensitive Groups',
'Unhealthy','Very Unhealthy','Hazardous'))
# Remove strange data before 2001:
# azdata <- azdata %>% filter(year(Date.Local) > 2001)
# Treat each pollutant measurement as a different observation:
# df <- df %>% gather(., "Pollutant", "Concentration", c('NO2.Mean', 'CO.Mean', 'SO2.Mean', 'O3.Mean'))
# df <- df %>% gather(., "Pollutant", "AQI", c('NO2.AQI', 'CO.AQI', 'SO2.AQI', 'O3.AQI'))
# Save the file at the very end:
write.csv(df, file = "PreppedPollutionData.csv")