-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjd-analysis.R
193 lines (167 loc) · 7.31 KB
/
jd-analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# (1) Load sample dataset and assemble corpus -------------------------------------------
# Load relevant libraries
library(extrafont)
# Import relevant functions from associated functions file
source("jd-functions.R")
# Obtain working color scheme
# NOTE(review): `colors` shadows grDevices::colors(); the name is kept because
# helpers sourced from jd-functions.R may read it as a global -- confirm
# before renaming.
colors <- getColors(2)

# Read the two sample corpora; readLines() yields one element per line
filePaths <- c("responsibilities.txt", "qualifications.txt")
text.resp <- readLines(filePaths[1])
text.qual <- readLines(filePaths[2])

# Visualize years of experience mentioned in the qualifications text
subtitle.qual <- "Based on sample \"qualifications\" corpus."
plotExperience(text.qual, subtitle.qual)

# Create document-term matrices for responsibilities and qualifications
dtm.resp <- preprocessCorpus(text.resp)
dtm.qual <- preprocessCorpus(text.qual)

# Combine respective DTMs into one collective matrix
# (responsibilities rows first, qualifications rows after)
dtm <- c(dtm.resp, dtm.qual)

# Create entire dataset of documents (rows) and terms (predictor columns)
dataset <- as.data.frame(as.matrix(dtm))

# A corpus term literally called "label" would otherwise be overwritten in
# place by the class label below, leaving the label somewhere other than the
# last column; rename such a term so the label is always appended last.
names(dataset)[names(dataset) == "label"] <- "label.term"

# Label all responsibilities as 1, qualifications as 0.
# rep() with explicit counts stays correct when a corpus is empty, unlike
# indexing with 1:n (which expands to c(1, 0) when n == 0).
n.resp <- dtm.resp$nrow
dataset$label <- factor(
  rep(c(1, 0), times = c(n.resp, nrow(dataset) - n.resp)),
  levels = c(0, 1)
)

# Construct visualizations based on term frequencies
# (A) Wordclouds for each corpus
plotWordcloud(dtm.resp)
plotWordcloud(dtm.qual)
# (B) Dendrogram for entire corpora
plotDendrogram(dtm)
# (2) Train a (bag-of-words) random forest classifier on sample dataset ----------------------
# Then use trained classifier to classify all future test "documents"
# for further analysis.
# Each model is stored under its own name: re-using a single `classifier`
# variable left the xgboost model in it, so the variable-importance step
# further down was handed the wrong model type.

# Split overall dataset into training and test sets
library(caTools)
set.seed(123)
# Split ratio is 80-20% (Train-Test)
split <- sample.split(dataset$label, SplitRatio = 0.8)
# Index with the logical vector directly; subset() would look up `split`
# inside `dataset` first and pick up a term column of that name if one exists.
train <- dataset[split, , drop = FALSE]
test <- dataset[!split, , drop = FALSE]
# Position of the outcome column, located by name so the predictors are
# selected correctly wherever that column sits
label.col <- match("label", names(dataset))

# Fitting Random Forest Classifier to the Training set
# Retain predictor importances
library(randomForest)
classifier.rf <- randomForest(x = train[-label.col],
                              y = train$label,
                              importance = TRUE,
                              ntree = 500)
# Predict classifier performance on test set results
y_pred.rf <- predict(classifier.rf, newdata = test[-label.col])

# ADDITIONAL (4/21/19): Fitting Logistic Regression Classifier to the Training set
classifier.glm <- glm(formula = label ~ .,
                      family = binomial,
                      data = train)
# Predict class probabilities on the test set, then threshold at 0.5.
# Levels are fixed to those of the true labels so both classes are always
# represented, even when only one class is predicted.
prob_pred.glm <- predict(classifier.glm, type = "response",
                         newdata = test[-label.col])
y_pred.glm <- factor(ifelse(prob_pred.glm > 0.5, 1, 0),
                     levels = levels(test$label))

# ADDITIONAL (4/21/19): Fitting xgboost classifier to the Training set
library(xgboost)
classifier.xgb <- xgboost(data = as.matrix(train[-label.col]),
                          # factor codes are 1/2; shift to the 0/1 that
                          # binary:logistic expects
                          label = as.numeric(train$label) - 1,
                          objective = "binary:logistic",
                          nrounds = 100)
# Predicting classifier performance on test set results
prob_pred.xgb <- predict(classifier.xgb,
                         newdata = as.matrix(test[-label.col]))
y_pred.xgb <- factor(as.integer(prob_pred.xgb >= 0.5),
                     levels = levels(test$label))

# Construct the Confusion Matrix (for the xgboost predictions, the last
# model fitted above)
library(caret)
cm <- confusionMatrix(y_pred.xgb, test$label)
cm

# Applying k-Fold Cross Validation to calculate
# and confirm mean predictive accuracy of the random forest over k=10 folds
folds <- createFolds(train$label, k = 10)
cv <- vapply(folds, function(x) {
  train_fold <- train[-x, ]
  test_fold <- train[x, ]
  fold.rf <- randomForest(x = train_fold[-label.col],
                          y = train_fold$label,
                          ntree = 500)
  fold_pred <- predict(fold.rf, newdata = test_fold[-label.col])
  # Accuracy = share of held-out documents classified correctly
  mean(fold_pred == test_fold$label)
}, numeric(1))
# Mean cross-validation accuracy
mean(cv)
# Standard deviation of cross-validation accuracy
sd(cv)

# Produce variable importance plot from the random forest fitted with
# importance = TRUE.
# Identifies important words that affect the accuracy of the model.
varImp <- importance(classifier.rf)
plotImportance(varImp)

# Find most frequent terms in responsibilities and qualifications text files.
# INDEX = rep(1, n) pools every document of a corpus into a single group.
# Most frequent terms in responsibilities
findMostFreqTerms(dtm.resp, n = 10L, INDEX = rep(1, dtm.resp$nrow))
# Most frequent terms in qualifications
findMostFreqTerms(dtm.qual, n = 10L, INDEX = rep(1, dtm.qual$nrow))
# (3) Web scraping from Indeed to obtain job summaries------------------------------------------------
# Give option to load pre-existing dataset or scrape new data
have.data <- TRUE
if (have.data) {
  # Read in prescraped dataset
  df <- read.csv("summaries.csv")
  # Files written by earlier runs of this script include the row names as a
  # leading "X" column (write.csv default). Drop it when it is exactly the
  # row index so the loaded frame matches a freshly scraped one.
  if (identical(names(df)[1], "X") &&
      isTRUE(all.equal(df[["X"]], seq_len(nrow(df))))) {
    df[["X"]] <- NULL
  }
} else {
  # Obtain dataset from web scraping
  df <- scrapeSummary()
  # Write scraped dataframe to .csv file (in case); row names are omitted so
  # that reading the file back yields the same columns as the scrape
  write.csv(df, file = "summaries.csv", row.names = FALSE)
}

# Visualize years of experience gleaned from job summaries.
# unlist() flattens every column of df into a single vector for the helper.
subtitle.summary <- "Based on the Indeed job summary corpus"
plotExperience(unlist(df), subtitle.summary)

# Perform necessary operations to preprocess job summaries.
# Stopwords are removed before punctuation so that contractions in the
# stopword list (e.g. "don't") still match.
library(tm)
df$Description <- tolower(df$Description)
df$Description <- removeWords(df$Description, stopwords("english"))
df$Description <- removePunctuation(df$Description)
df$Description <- removeNumbers(df$Description)
# (4) Word Vectorization of Job Summaries using Global Vector (GloVe) algorithm-------------------------------------
library(text2vec)

# Token iterator over the cleaned job summaries (lowercased, split on words)
iter <- itoken(
  df$Description,
  preprocessor = tolower,
  tokenizer = word_tokenizer
)

# Vocabulary of all unigrams and bigrams found in the summaries
vocab <- create_vocabulary(iter, ngram = c(1L, 2L))

# Prune rare terms: keep those that occur at least 10 times and appear in
# at least 0.1% of the documents
vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_min = 0.001
)

# Vectorizer restricted to the filtered vocabulary
vectorizer <- vocab_vectorizer(vocab)

# Term co-occurrence matrix with a context window of 10 words
# (captures most document lengths)
tcm <- create_tcm(iter, vectorizer, skip_grams_window = 10L)

# Initialize and train GloVe embeddings model (seeded for reproducibility)
set.seed(123)
glove <- GlobalVectors$new(
  word_vectors_size = 100,
  vocabulary = vocab,
  x_max = 50
)
wv_main <- glove$fit_transform(tcm, n_iter = 100)
wv_context <- glove$components

# Sum main and context vectors; the main vectors are transposed to match
# the orientation of the context vectors, so `wv` has one column per term
wv <- t(wv_main) + wv_context

# Compare prominent/interesting terms in corpus
plotSimilarTerms("data_science", "machine_learning", wv)
plotSimilarTerms("degree", "experience", wv)
plotSimilarTerms("ai", "quantitative", wv)
plotSimilarTerms("clients", "customers", wv)
plotSimilarTerms("python", "r", wv)
# (5) Apply multidimensional scaling (MDS) to visualize vectors in 2 dimensions--------------------------
# One row per term, one column per embedding dimension; computed once and
# shared by both distance measures below
word.vectors <- t(wv)

# Label each point with corresponding word and color according to frequency.
# Counts are looked up by term name rather than by position, so each word is
# paired with its own frequency regardless of the row order of `vocab`.
words <- colnames(wv)
LogFrequency <- log10(vocab$term_count[match(words, vocab$term)])

# (A) Euclidean distance (on standardized embedding dimensions)
set.seed(123)
vectordata <- dist(scale(word.vectors))
mds.euc <- data.frame(cmdscale(vectordata))
head(mds.euc)
plotGloveMDS(mds.euc, words, LogFrequency, "Euclidean Distance")

# (B) Cosine Distance (1 - cosine similarity between every pair of terms)
set.seed(123)
cosdata <- 1 - sim2(word.vectors, word.vectors, method = "cosine", norm = "l2")
mds.cos <- data.frame(cmdscale(cosdata))
head(mds.cos)
plotGloveMDS(mds.cos, words, LogFrequency, "Cosine Distance")