codebusters/PRE_PROCESSING_FINAL.R at master · salbrec/codebusters · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#################Clean one########################


#Library Declaration
library(readr)
library(textmineR)
library(wordcloud)
library(caret)
library(pROC)
library(glmnet)

#Data Input and preliminary stuff
set.seed(123)
data_path="C:/Users/diior/Desktop/NewsDatasetLabeled.csv"
data <- read_csv(data_path) #importing data
data$Date<-as.Date(data$Date)
data=data[data$Date>as.Date("2018-12-31"),] #Removing unlabelled observation
Relevant=data[data$class==1,] #Relevant News data
Not_Relevant=data[data$class==0,] #Not relevant News Data

N_Rel=dim(Relevant)[1]
N_Not=dim(Not_Relevant)[1]
N=dim(data)[1]

Ratio=round(N_Rel/N,2) #Percentage of relevant on the whole data set.

Test_size=200 #Test data size


#Sample from relevant news to obtain relevant part of test set.
sample_rel=sample(1:N_Rel,size=Ratio*Test_size,replace=F)
test_data_Rel=Relevant[sample_rel,]
train_data_Rel=Relevant[-sample_rel,]

#Sample from relevant news to obtain not relevant part of test set.

sample_not_rel=sample(1:N_Not,size=((1-Ratio)*Test_size),replace=F)
test_data_not_Rel=Not_Relevant[sample_not_rel,]
train_data_not_Rel=Not_Relevant[-sample_not_rel,]

#Merge the two data to obtain one train data set.

test_data=rbind(test_data_Rel,test_data_not_Rel) #this test set has the same ratio of relevant case of the original set
train_data=rbind(train_data_Rel,train_data_not_Rel)

N_Rel_train=dim(train_data_Rel)[1]
N_Not_train=dim(train_data_not_Rel)[1]
N_train=dim(train_data)[1]

fulldata=rbind(train_data,test_data)

#Preprocessing data

dtm <- CreateDtm(doc_vec = fulldata$Text, # character vector of documents
                 doc_names = fulldata$NumArt, # document names
                 ngram_window = c(1, 2), # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
                                  stopwords::stopwords(source = "smart")), # this is the default value
                 lower = TRUE, # lowercase
                 remove_punctuation = TRUE, # punctuation
                 stem_lemma_function = function(x) SnowballC::wordStem(x, "porter"),#some stemming
                 remove_numbers = TRUE, # numbers
                 verbose = FALSE, # Turn off status bar
                 cpus=2) # parallelizing


Tf=TermDocFreq(dtm) #creating term frequency and tf-idf
tfidf <- t(dtm[ , Tf$term ]) * Tf$idf
tfidf <- t(tfidf)

summary(colSums(tfidf)) #some descriptives about cumulative distribution of tfidf

# 300 400 600
to_keep=which(colSums(tfidf)>300) #Keep columns with a cumulative tfidf bigger than 300 in order to reduce features.
#it can be cross-validated or chosen in a more smart way. This treshold is strictly data dependent!!!!!!!!!!

tfidf <- tfidf[,to_keep]

dim(tfidf)

#Make the tfidf matrix a full matrix (not a sparse one).
M_tfidf=matrix(0,dim(tfidf)[1],dim(tfidf)[2]) #memory allocation
idx=summary(tfidf)
M_tfidf[cbind(idx$i,idx$j)]<-idx$x
colnames(M_tfidf)<-Tf$term[to_keep]
Full=data.frame(fulldata$class,M_tfidf)
colnames(Full)[1]<-"class"

train_data_full=Full[1:N_train,]
test_data_full=Full[-(1:N_train),]

#####################AUGMENTING DATA##########################

synt=c()  #allocating some memory

#Following function creates synthetic relevant cases sampling with replacement
#from empirical distribution of words of relevant news. We are not using relevant cases that are in the test set,
#in order to avoid bias.

synt=apply(train_data_full[train_data_full$class==1,],2,
           function(x) sample(x,size=(N_Not_train-N_Rel_train),replace=T)) #fast way to act on matrices

train_data_full=rbind(train_data_full,synt) #augmented train data

write.table(train_data_full,"train_data_full_685.csv",row.names=FALSE)
write.table(test_data_full,"test_data_full_685.csv",row.names=FALSE)