-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_pull.R
64 lines (52 loc) · 1.83 KB
/
data_pull.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# install.packages("rjson")
library("rjson")

# Pull the "top" listing of r/ApplyingToCollege as JSON and collect every
# post's body text (`txt`) and title (`ttl`) into two parallel character
# vectors, one element per post.
file <- "https://www.reddit.com/r/ApplyingToCollege/top.json"
# data <- fromJSON(paste(readLines(file), collapse = ""))
data <- fromJSON(file = file)
num_posts <- length(data$data$children)

# Extract a single string field from one post. A missing (NULL) field is
# mapped to NA instead of being dropped, so `txt` and `ttl` stay aligned
# post-for-post.
post_field <- function(post, field) {
  value <- post$data[[field]]
  if (is.null(value)) NA_character_ else as.character(value)
}

# vapply() over the list replaces the index loop: nothing is grown with
# append(), and an empty listing yields character(0) instead of failing on
# `1:0`.
txt <- vapply(
  data$data$children, post_field, character(1),
  field = "selftext", USE.NAMES = FALSE
)
ttl <- vapply(
  data$data$children, post_field, character(1),
  field = "title", USE.NAMES = FALSE
)
# Alternative pull: the "hot" listing instead of "top". This overwrites
# `file`, `data`, `num_posts` and `txt` from any earlier pull.
# `limit` sets how many posts one request returns; Reddit caps it at 100, so
# getting more than that requires paging rather than a larger limit.
file <- "https://www.reddit.com/r/ApplyingToCollege/hot/.json?limit=100"
data <- fromJSON(file = file)
num_posts <- length(data$data$children)

# One body text per post. A missing (NULL) selftext becomes NA rather than
# being silently dropped, and an empty listing yields character(0) instead of
# failing on `1:0`.
txt <- vapply(
  data$data$children,
  function(post) {
    body <- post$data[["selftext"]]
    if (is.null(body)) NA_character_ else as.character(body)
  },
  character(1),
  USE.NAMES = FALSE
)
# For reading from PDF: boring, old-school, slow.
# Reads every PDF in the working directory, keeps the text of the first one,
# then builds a term-document matrix over all of them and prints a summary.
# install.packages("pdftools")
library(pdftools)
library(tm)

# NOTE(review): hard-coded, machine-specific directory. Kept so the script's
# working-directory side effect is unchanged; consider passing the folder to
# list.files() instead of calling setwd().
setwd("/Users/hk/Desktop")

# Match a literal ".pdf" extension. The previous pattern "pdf$" also matched
# any file whose name merely ended in the letters "pdf".
files <- list.files(pattern = "\\.pdf$")
if (length(files) == 0) {
  # Fail with a clear message rather than a subscript error further down.
  stop("No PDF files found in ", getwd(), call. = FALSE)
}

# Extract the text of each PDF and keep the first document's text.
opinions <- lapply(files, pdf_text)
opinion_str <- opinions[[1]]

# Build a corpus from the same files using tm's PDF reader.
corp <- Corpus(URISource(files), readerControl = list(reader = readPDF))

# Only numbers are stripped; punctuation, stop words, case and word endings
# are left untouched. The global bounds keep every term that occurs in at
# least one document.
opinions.tdm <- TermDocumentMatrix(
  corp,
  control = list(
    removePunctuation = FALSE,
    stopwords = FALSE,
    tolower = FALSE,
    stemming = FALSE,
    removeNumbers = TRUE,
    bounds = list(global = c(1, Inf))
  )
)

# Show at most the first 10 terms; a matrix with fewer than 10 terms would
# make a fixed 1:10 row index go out of bounds.
inspect(opinions.tdm[seq_len(min(10, nrow(opinions.tdm))), ])

# Terms that occur at least 100 times across the corpus.
findFreqTerms(opinions.tdm, lowfreq = 100, highfreq = Inf)
# For reading from iMessage archives (work in progress).
# Loads an interactively chosen archive file into `data`, then steps `date`
# one calendar day at a time from 2019-01-01 up to 2019-11-10.

# Days per month for 2019. February has 28 days because 2019 is not a leap
# year. Retained for reference: the loop below uses Date arithmetic, which
# handles month lengths itself.
days_in_month <- c(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
year <- "2019"
month <- "01"
month_ind <- 1
day <- "01"

# readChar() has no default for `nchars`; pass the file size so the whole
# file is read as a single string.
archive_path <- file.choose()
data <- readChar(archive_path, nchars = file.size(archive_path))

date <- paste(year, month, day, sep = "-")
while (date != "2019-11-10") {
  # TODO: per-day processing of `data` for `date` goes here.

  # Advance to the next calendar day. Without this step `date` never changes
  # and the loop cannot terminate.
  next_day <- as.Date(date) + 1
  year <- format(next_day, "%Y")
  month <- format(next_day, "%m")
  day <- format(next_day, "%d")
  month_ind <- as.numeric(month)
  date <- paste(year, month, day, sep = "-")
}