-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchapter_eleven_learning.R
More file actions
116 lines (91 loc) · 4.19 KB
/
chapter_eleven_learning.R
File metadata and controls
116 lines (91 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#install.packages('tidyverse')
#install.packages('pscl')
library("dplyr")
library("pscl")
# Open the full `presidentialElections` data frame in the data viewer.
View(presidentialElections)

# select() picks columns: keep only `year` and `demVote` (the percentage of
# votes won by the Democrat).
votes <- select(
  presidentialElections,
  year,
  demVote
)
View(votes)

# A range of adjacent columns can be selected with `:` -- here `state`
# through `year`.
View(select(presidentialElections, state:year))

# A leading `-` drops a column: keep every column except `south`.
View(select(presidentialElections, -south))
# filter() picks rows: keep only the rows whose `year` is 2008.
votes_2008 <- filter(presidentialElections, year == 2008)
View(votes_2008)

# When several conditions are given, filter() keeps only the rows that
# satisfy ALL of them.
votes_Colorado_2008 <- filter(
  presidentialElections,
  year == 2008,
  state == "Colorado"
)
View(votes_Colorado_2008)
# Note: dplyr does not keep row names. If they are needed, turn them into a
# regular column (feature) of the data, e.g. with mutate().
#
# mutate() adds columns. Here it adds:
#   * `other_parties_vote`  - percentage of the votes that went to other parties
#   * `abs_vote_difference` - absolute difference between the two percentages
presidentialElections <- mutate(
  presidentialElections,
  other_parties_vote = 100 - demVote,
  abs_vote_difference = abs(demVote - other_parties_vote)
)

# Note: mutate() does not modify the data frame it is given; it returns a new
# one, which is why the result is assigned back to `presidentialElections`.
presidentialElections
# arrange() sorts the rows of a data frame by one or more columns.
# Sort by `year` in decreasing order, then by `demVote` (increasing) within
# each year. desc() is used instead of negating the column because it states
# the intent directly and is not limited to numeric columns.
presidentialElections <- arrange(presidentialElections, desc(year), demVote)

# Like mutate(), arrange() returns a new data frame rather than changing the
# one it was given, so the result is assigned back before viewing it.
View(presidentialElections)
# summarize() collapses a data frame into summary statistics. Compute the
# mean Democratic and the mean other-party vote percentage over every row of
# `presidentialElections`.
average_votes <- summarize(
  presidentialElections,
  mean_dem_vote = mean(demVote),
  mean_other_parties = mean(other_parties_vote)
)
average_votes
# Goal: find the state with the highest `demVote` percentage in 2008.
# Approach 1: one named intermediate result per step.

# Step 1: keep only the 2008 rows.
votes_2008 <- filter(presidentialElections, year == 2008)

# Step 2: keep the row(s) whose `demVote` equals the 2008 maximum.
most_dem_votes <- filter(votes_2008, demVote == max(demVote))

# Step 3: keep only the `state` column.
most_dem_state <- select(most_dem_votes, state)
most_dem_state
# Approach 2: the same three steps written as nested function calls, so there
# are no intermediate variables to keep up to date when the steps change.
most_dem_state <- select(
  filter(
    filter(presidentialElections, year == 2008),
    demVote == max(demVote)
  ),
  state
)
# The nested calls above rely on "anonymous variables": result values that are
# never assigned to a name and are instead used immediately as arguments to
# other functions.
# Approach 3: dplyr's pipe operator, written `%>%`, takes the result of the
# expression on its left and passes it to the function on its right as that
# function's first argument.
most_dem_state <- presidentialElections %>%
  filter(year == 2008) %>%
  filter(demVote == max(demVote)) %>%
  select(state)
most_dem_state
# dplyr functions can also be applied to groups of rows within a data set.
# group_by() creates associations among rows that share a value, which makes
# aggregations easy to perform. Here the rows are grouped by `state`.
grouped <- presidentialElections %>%
  group_by(state)
grouped
# group_by() lets you apply operations to groups of data without breaking the
# data into separate variables: in essence, it splits the rows into groups
# that correspond to the different values in a column.
# Example: summary statistics by state, i.e. the average percentages across
# the years for each state.
state_voting_summary <- presidentialElections %>%
  group_by(state) %>%
  summarize(
    mean_dem_vote = mean(demVote),
    mean_other_parties = mean(other_parties_vote)
  )
View(state_voting_summary)
# Joins
# Sometimes you need data that lives in more than one data frame. A join
# combines the frames by matching rows on a column (or columns) that both
# tables share.
# Those shared columns act as identifiers that determine which rows correspond to one another.
# left_join() looks for matching values in those columns and returns a new data frame consisting
# of the first (`left`) data frame plus the extra columns from the second (`right`) data frame.
# You can specify which columns to match on via the `by` argument.