-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextAnalysis(Tweets).R
54 lines (54 loc) · 1.76 KB
/
TextAnalysis(Tweets).R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Text analytics ----
# Load the tweets and derive the binary outcome used by the models below.

# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# Wiping the caller's global environment is a destructive side effect, and
# every object this script relies on is created by the script itself.
# Restart R if a clean session is needed.

# Location of the input data (machine-specific; adjust as needed).
tweets_path <- "C:/Users/Zahid/Downloads/tweets.csv"

# Keep text columns as character vectors rather than factors.
tweets <- read.csv(tweets_path, stringsAsFactors = FALSE)
str(tweets)

# Outcome label: TRUE when the value in the `Avg` column is <= -1.
tweets$Negative <- as.factor(tweets$Avg <= -1)
table(tweets$Negative)
# Corpus construction and text clean-up ----

# Install the text-processing packages only when they are missing, instead
# of re-downloading them on every run of the script.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
if (!requireNamespace("SnowballC", quietly = TRUE)) {
  install.packages("SnowballC")
}
library(tm)         # text mining
library(SnowballC)  # word stemmer

# One document per entry of the `Tweet` column.
corpus <- Corpus(VectorSource(tweets$Tweet))
corpus
corpus[[1]]

# tolower() is a plain base function, not a tm transformation. It has to be
# wrapped in content_transformer() so the documents keep their corpus
# structure; passing it bare breaks DocumentTermMatrix() on a VCorpus.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)

# Preview of the English stop-word list.
stopwords("english")[1:100]

# Drop the English stop words plus the word "apple"
# (presumably because it occurs in almost every tweet -- confirm).
corpus <- tm_map(corpus, removeWords, c("apple", stopwords("english")))

# Reduce the remaining words to their stems.
corpus <- tm_map(corpus, stemDocument)
# Document-term matrix and modelling data frame ----

# Rows are the documents of `corpus`, columns are the terms found in them.
frequencies <- DocumentTermMatrix(corpus)
frequencies

# Peek at a small window of the matrix. The requested window is clamped to
# the actual dimensions so a smaller corpus does not stop the script with a
# "subscript out of bounds" error.
preview_rows <- intersect(1000:1005, seq_len(nrow(frequencies)))
preview_cols <- intersect(505:515, seq_len(ncol(frequencies)))
if (length(preview_rows) > 0 && length(preview_cols) > 0) {
  inspect(frequencies[preview_rows, preview_cols])
}

# Terms that occur at least 20 times.
findFreqTerms(frequencies, lowfreq = 20)

# Remove terms whose sparsity exceeds 0.995 (see ?removeSparseTerms).
sparse <- removeSparseTerms(frequencies, 0.995)
sparse

# Dense data frame of term counts. Only its dimensions are reported here;
# printing the whole frame floods the console.
tweetsSparse <- as.data.frame(as.matrix(sparse))
dim(tweetsSparse)

# Terms must be syntactically valid and unique column names, because they
# are used as variables in the model formulas.
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse), unique = TRUE)
colnames(tweetsSparse)

# Attach the outcome. The row counts must match exactly; otherwise the
# assignment could silently recycle labels.
stopifnot(nrow(tweetsSparse) == nrow(tweets))
tweetsSparse$Negative <- tweets$Negative
str(tweets$Negative)
str(tweetsSparse$Negative)
# Train / test split ----
library(caTools)

# Fixed seed so the random split is reproducible.
set.seed(123)

# Logical mask: TRUE marks rows assigned to the training set (ratio 0.7).
split <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)

# Index with `[` instead of subset(). subset() evaluates its condition
# inside the data frame first, so a term column that happens to be called
# "split" would silently replace the mask above. which() keeps subset()'s
# behaviour of ignoring NA entries in the mask.
trainSparse <- tweetsSparse[which(split), , drop = FALSE]
testSparse <- tweetsSparse[which(!split), , drop = FALSE]
# Classification tree (CART) ----
library(rpart)
library(rpart.plot)

# Fit a classification tree for Negative using every other column of
# trainSparse as a predictor, then draw the fitted tree.
tweetCART <- rpart(
  Negative ~ .,
  data = trainSparse,
  method = "class"
)
prp(tweetCART)

# Show the training data.
print(trainSparse)

# Predicted classes for the test rows, cross-tabulated against the actual
# labels (rows = actual, columns = predicted).
predictCART <- predict(
  tweetCART,
  newdata = testSparse,
  type = "class"
)
table(testSparse$Negative, predictCART)
# Random forest ----
library(randomForest)

# Fixed seed so the fitted forest is reproducible.
set.seed(123)

# Fit on the training rows with every other column as a predictor.
tweetRF <- randomForest(
  Negative ~ .,
  data = trainSparse
)

# Predictions for the test rows, cross-tabulated against the actual labels
# (rows = actual, columns = predicted).
predictRF <- predict(
  tweetRF,
  newdata = testSparse
)
table(testSparse$Negative, predictRF)
# Logistic regression ----

# Binomial GLM for Negative on every other column of trainSparse.
tweetLog <- glm(
  Negative ~ .,
  data = trainSparse,
  family = binomial
)

# Predictions on the response scale for the test rows.
predictions <- predict(
  tweetLog,
  newdata = testSparse,
  type = "response"
)

# Cross-tabulate actual labels against predictions thresholded at 0.5.
table(testSparse$Negative, predictions >= 0.5)