-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathImdb_R_Interstellar.R
More file actions
141 lines (102 loc) · 3.79 KB
/
Imdb_R_Interstellar.R
File metadata and controls
141 lines (102 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#IMDB reviews - Interstellar Movie
library(rvest)
library(XML)
library(magrittr)
library(tm)
# IMDB user-review page for Interstellar
imdb_aurl <- "https://www.imdb.com/title/tt0816692/reviews?ref_=tturv_ql_3"

## Web scraping - "Page source"
# The review text is selected through the ".text" CSS class.
murl <- read_html(imdb_aurl)
rev <- murl %>% html_nodes(".text") %>% html_text()

# Fail fast: every later step (corpus, term matrix, plots) needs at least
# one review, and an empty result would only surface later as an obscure
# indexing or plotting error.
if (length(rev) == 0L) {
  stop(
    "No review text found at ", imdb_aurl,
    " (selector '.text' matched nothing)",
    call. = FALSE
  )
}

imdb_reviews <- rev
# Working copy of the scraped text used by the rest of the script
txt_imdb <- imdb_reviews
# Corpus: one document per scraped review
x <- Corpus(VectorSource(txt_imdb))
# Normalise the encoding; bytes that cannot be converted are substituted
# (sub = "byte") instead of turning the whole string into NA.
# Plain R functions are wrapped in content_transformer() so that tm_map()
# hands back documents (required for a VCorpus, harmless for a SimpleCorpus).
x <- tm_map(
  x,
  content_transformer(function(doc) iconv(enc2utf8(doc), sub = "byte"))
)

# Data cleansing ----
# Text into lower case
x1 <- tm_map(x, content_transformer(tolower))
# Remove punctuation
x1 <- tm_map(x1, removePunctuation)
# Remove numbers
x1 <- tm_map(x1, removeNumbers)
# Remove English stop words (a, an, the, pronouns, ...).
# The earlier `stopwords <- readLines(file.choose())` was dropped: its result
# was never used (the call below always resolved to tm's stopwords()
# function) and it only forced an interactive file prompt while shadowing
# the function name. "english" is the spelling documented by tm.
x1 <- tm_map(x1, removeWords, stopwords("english"))
# Collapse the runs of white space left behind by the removals
x1 <- tm_map(x1, stripWhitespace)
inspect(x1[1])
# Term document matrix
# Converts the unstructured text into a structured terms x documents table.
tdm <- TermDocumentMatrix(x1)
tdm
# Documents x terms orientation of the same counts
dtm <- DocumentTermMatrix(x1)
# Drop very sparse terms (sparsity threshold 0.99); kept for reference,
# the plots below work on the full matrix.
corpus.dtm.frequent <- removeSparseTerms(tdm, 0.99)
tdm <- as.matrix(tdm)
dim(tdm)
# Show every term for at most the first 25 documents. The bounds come from
# the matrix itself: a hard-coded row/column range breaks with
# "subscript out of bounds" as soon as the scraped page changes.
tdm[, seq_len(min(25L, ncol(tdm))), drop = FALSE]
# Total occurrences of each term across all reviews
w <- rowSums(tdm)
w
# Bar plot of the word frequencies, restricted to terms seen >= 10 times
w_sub <- subset(w, w >= 10)
w_sub
barplot(w_sub, las = 2, col = rainbow(10))
# "movie","cinema","film","say","us","interstellar" repeats maximum number of times
# Remove frequent but uninformative, domain-specific words, then tidy up
# the white space the removal leaves behind.
x1 <- tm_map(
  x1,
  removeWords,
  c(
    "movie", "just", "will", "like", "can", "cinema", "film", "say", "us",
    "interstellar", "way", "see", "nolan"
  )
)
x1 <- tm_map(x1, stripWhitespace)
# Rebuild the term document matrix after the extra word removal
tdm <- TermDocumentMatrix(x1)
dim(tdm)
tdm <- as.matrix(tdm)
# Show every term for at most the first 25 documents; bounds are taken from
# the matrix so the script does not fail when the vocabulary size changes.
tdm[, seq_len(min(25L, ncol(tdm))), drop = FALSE]
# Total occurrences of each remaining term
w <- rowSums(tdm)
w
# Terms seen at least 10 times
w_sub <- subset(w, w >= 10)
w_sub
barplot(w_sub, las = 2, col = rainbow(10))
# Word clouds ----
library(wordcloud)
# Basic cloud from the per-term totals
wordcloud(words = names(w), freq = w)
# Same totals, ordered from most to least frequent
w_sub1 <- sort(rowSums(tdm), decreasing = TRUE)
head(w_sub1)
wordcloud(words = names(w_sub1), freq = w_sub1) # all words are considered
# Better visualisation: coloured, scaled, partly rotated, and with
# random.order disabled (spelled FALSE, since F can be reassigned).
wordcloud(
  words = names(w_sub1),
  freq = w_sub1,
  random.order = FALSE,
  colors = rainbow(30),
  scale = c(3, 0.5),
  rot.per = 0.3
)
# Wordcloud2 - shapes such as circle, triangle, star
# Install the package only when it is missing. The previous
# installed.packages("wordcloud2") call treated the string as a library
# path to list and never installed anything.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}
library(wordcloud2)
# wordcloud2() is given a two-column data frame: the word and its frequency
w1 <- data.frame(names(w_sub), w_sub)
colnames(w1) <- c("word", "freq")
# Star-shaped word cloud
wordcloud2(w1, size = 0.3, shape = "star")
# Sentiment word clouds ----
# Loading positive and negative words (one file prompt each)
pos.words <- readLines(file.choose()) # read-in positive-words.txt
neg.words <- readLines(file.choose()) # read-in negative-words.txt

# Positive word cloud - IMDB Interstellar
# Flag the corpus terms that appear in the positive word list
pos.matches <- names(w) %in% pos.words
freq_pos <- w[pos.matches]
# dev.new() opens a fresh graphics device on any platform; windows() is
# only available on Windows builds of R.
dev.new()
wordcloud(
  names(freq_pos), freq_pos,
  scale = c(4, 1), colors = brewer.pal(8, "Dark2")
)

# Negative word cloud - IMDB Interstellar
# Flag the corpus terms that appear in the negative word list
neg.matches <- names(w) %in% neg.words
freq_neg <- w[neg.matches]
dev.new()
wordcloud(
  names(freq_neg), freq_neg,
  scale = c(4, .5), colors = brewer.pal(8, "Dark2")
)
# Bigram analysis on the Interstellar reviews
library(RWeka)
library(wordcloud)
# Bigrams seen fewer times than this are left out of the cloud
minfreq_bigram <- 2
# Tokenise the cleaned review text itself. NGramTokenizer() coerces its
# input with as.character(), so handing it the corpus object would tokenise
# the corpus's internal list structure rather than the documents.
review_text <- unlist(lapply(x1, as.character), use.names = FALSE)
bitoken <- NGramTokenizer(review_text, Weka_control(min = 2, max = 2))
# Count each distinct bigram and order from most to least frequent
two_word <- data.frame(table(bitoken))
sort_two <- two_word[order(two_word$Freq, decreasing = TRUE), ]
wordcloud(
  as.character(sort_two$bitoken),
  sort_two$Freq,
  random.order = FALSE,
  scale = c(2, 0.35),
  min.freq = minfreq_bigram,
  colors = brewer.pal(8, "Dark2"),
  max.words = 150
)