diff --git a/NAMESPACE b/NAMESPACE index 816efe4..e2f823c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ S3method(predict,word2vec) S3method(predict,word2vec_trained) S3method(summary,word2vec) S3method(summary,word2vec_trained) -S3method(word2vec,character) S3method(word2vec,list) export(doc2vec) export(read.word2vec) diff --git a/R/RcppExports.R b/R/RcppExports.R index 279c425..d6c90b8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,16 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) -} - -w2v_load_model <- function(file, normalize = TRUE) { - .Call('_word2vec_w2v_load_model', PACKAGE = 'word2vec', file, normalize) -} - -w2v_save_model <- function(ptr, file) { - .Call('_word2vec_w2v_save_model', PACKAGE = 'word2vec', ptr, file) +w2v_train <- function(texts_, types_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, 
iterations, alpha, withSG, verbose, normalize) } w2v_dictionary <- function(ptr) { @@ -33,11 +25,3 @@ w2v_read_binary <- function(modelFile, normalize, n) { .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) } -d2vec <- function(ptr, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec', PACKAGE = 'word2vec', ptr, x, wordDelimiterChars) -} - -d2vec_nearest <- function(ptr_w2v, ptr_d2v, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec_nearest', PACKAGE = 'word2vec', ptr_w2v, ptr_d2v, x, wordDelimiterChars) -} - diff --git a/R/word2vec.R b/R/word2vec.R index 2376f01..6cfbcea 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -126,75 +126,6 @@ word2vec <- function(x, UseMethod("word2vec") } -#' @inherit word2vec title description params details seealso return references examples -#' @param split a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x} -#' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -#' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -#' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector. -#' @param useBytes logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}. 
-#' @export -word2vec.character <- function(x, - type = c("cbow", "skip-gram"), - dim = 50, window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - ".\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ...){ - type <- match.arg(type) - stopw <- stopwords - model <- file.path(tempdir(), "w2v.bin") - if(length(stopw) == 0){ - stopw <- "" - } - file_stopwords <- tempfile() - filehandle_stopwords <- file(file_stopwords, open = "wt", encoding = encoding) - writeLines(stopw, con = filehandle_stopwords, useBytes = useBytes) - close(filehandle_stopwords) - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - }) - if(length(x) == 1){ - file_train <- x - }else{ - file_train <- tempfile(pattern = "textspace_", fileext = ".txt") - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - if (file.exists(file_train)) file.remove(file_train) - }) - filehandle_train <- file(file_train, open = "wt", encoding = encoding) - writeLines(text = x, con = filehandle_train, useBytes = useBytes) - close(filehandle_train) - } - #expTableSize <- 1000L - #expValueMax <- 6L - #expTableSize <- as.integer(expTableSize) - #expValueMax <- as.integer(expValueMax) - min_count <- as.integer(min_count) - dim <- as.integer(dim) - window <- as.integer(window) - iter <- as.integer(iter) - sample <- as.numeric(sample) - hs <- as.logical(hs) - negative <- as.integer(negative) - threads <- as.integer(threads) - iter <- as.integer(iter) - lr <- as.numeric(lr) - skipgram <- as.logical(type %in% "skip-gram") - split <- as.character(split) - model <- w2v_train(list(), character(), - trainFile = file_train, modelFile = model, stopWordsFile = file_stopwords, - minWordFreq = min_count, - size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, - sample = sample, withHS = hs, 
negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = split[1], endOfSentenceChars = split[2], ...) - model$data$stopwords <- stopwords - model -} - #' @inherit word2vec title description params details seealso return references #' @export #' @examples @@ -229,12 +160,12 @@ word2vec.list <- function(x, type = c("cbow", "skip-gram"), dim = 50, window = ifelse(type == "cbow", 5L, 10L), iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ...){ - x <- lapply(x, as.character) + #x <- lapply(x, as.character) type <- match.arg(type) - stopwords <- as.character(stopwords) + stopwords <- as.integer(stopwords) model <- file.path(tempdir(), "w2v.bin") #expTableSize <- 1000L #expValueMax <- 6L @@ -253,11 +184,11 @@ word2vec.list <- function(x, skipgram <- as.logical(type %in% "skip-gram") encoding <- "UTF-8" model <- w2v_train(x, stopwords, - trainFile = "", modelFile = model, stopWordsFile = "", + modelFile = model, minWordFreq = min_count, size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = "", endOfSentenceChars = "", ...) + alpha = lr, withSG = skipgram, ...) 
model$data$stopwords <- stopwords model } diff --git a/man/word2vec.character.Rd b/man/word2vec.character.Rd deleted file mode 100644 index 6a4aaa9..0000000 --- a/man/word2vec.character.Rd +++ /dev/null @@ -1,167 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/word2vec.R -\name{word2vec.character} -\alias{word2vec.character} -\title{Train a word2vec model on text} -\usage{ -\method{word2vec}{character}( - x, - type = c("cbow", "skip-gram"), - dim = 50, - window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, - lr = 0.05, - hs = FALSE, - negative = 5L, - sample = 0.001, - min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \\n,.-!?:;/\\"#$\%&'()*+<=>@[]\\\\^_`{|}~\\t\\v\\f\\r", ".\\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ... -) -} -\arguments{ -\item{x}{a character vector with text or the path to the file on disk containing training data or a list of tokens. See the examples.} - -\item{type}{the type of algorithm to use, either 'cbow' or 'skip-gram'. Defaults to 'cbow'} - -\item{dim}{dimension of the word vectors. Defaults to 50.} - -\item{window}{skip length between words. Defaults to 5.} - -\item{iter}{number of training iterations. Defaults to 5.} - -\item{lr}{initial learning rate also known as alpha. Defaults to 0.05} - -\item{hs}{logical indicating to use hierarchical softmax instead of negative sampling. Defaults to FALSE indicating to do negative sampling.} - -\item{negative}{integer with the number of negative samples. Only used in case hs is set to FALSE} - -\item{sample}{threshold for occurrence of words. Defaults to 0.001} - -\item{min_count}{integer indicating the number of time a word should occur to be considered as part of the training vocabulary. Defaults to 5.} - -\item{stopwords}{a character vector of stopwords to exclude from training} - -\item{threads}{number of CPU threads to use. 
Defaults to 1.} - -\item{split}{a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x}} - -\item{encoding}{the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.} - -\item{useBytes}{logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}.} - -\item{...}{further arguments passed on to the methods \code{\link{word2vec.character}}, \code{\link{word2vec.list}} as well as the C++ function \code{w2v_train} - for expert use only} -} -\value{ -an object of class \code{w2v_trained} which is a list with elements -\itemize{ -\item{model: a Rcpp pointer to the model} -\item{data: a list with elements file: the training data used, stopwords: the character vector of stopwords, n} -\item{vocabulary: the number of words in the vocabulary} -\item{success: logical indicating if training succeeded} -\item{error_log: the error log in case training failed} -\item{control: as list of the training arguments used, namely min_count, dim, window, iter, lr, skipgram, hs, negative, sample, split_words, split_sents, expTableSize and expValueMax} -} -} -\description{ -Construct a word2vec model on text. The algorithm is explained at \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\details{ -Some advice on the optimal set of parameters to use for training as defined by Mikolov et al. 
-\itemize{ -\item{argument type: skip-gram (slower, better for infrequent words) vs cbow (fast)} -\item{argument hs: the training algorithm: hierarchical softmax (better for infrequent words) vs negative sampling (better for frequent words, better with low dimensional vectors)} -\item{argument dim: dimensionality of the word vectors: usually more is better, but not always} -\item{argument window: for skip-gram usually around 10, for cbow around 5} -\item{argument sample: sub-sampling of frequent words: can improve both accuracy and speed for large data sets (useful values are in range 0.001 to 0.00001)} -} -} -\examples{ -\dontshow{if(require(udpipe))\{} -library(udpipe) -## Take data and standardise it a bit -data(brussels_reviews, package = "udpipe") -x <- subset(brussels_reviews, language == "nl") -x <- tolower(x$feedback) - -## Build the model get word embeddings and nearest neighbours -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -head(emb) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## Get vocabulary -vocab <- summary(model, type = "vocabulary") - -# Do some calculations with the vectors and find similar terms to these -emb <- as.matrix(model) -vector <- emb["buurt", ] - emb["rustige", ] + emb["restaurants", ] -predict(model, vector, type = "nearest", top_n = 10) - -vector <- emb["gastvrouw", ] - emb["gastvrij", ] -predict(model, vector, type = "nearest", top_n = 5) - -vectors <- emb[c("gastheer", "gastvrouw"), ] -vectors <- rbind(vectors, avg = colMeans(vectors)) -predict(model, vectors, type = "nearest", top_n = 10) - -## Save the model to hard disk -path <- "mymodel.bin" -\dontshow{ -path <- tempfile(pattern = "w2v", fileext = ".bin") -} -write.word2vec(model, file = path) -model <- read.word2vec(path) - -\dontshow{ -file.remove(path) -} -## -## Example of word2vec with a list of tokens -## -toks <- strsplit(x, 
split = "[[:space:][:punct:]]+") -model <- word2vec(x = toks, dim = 15, iter = 20) -emb <- as.matrix(model) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## -## Example getting word embeddings -## which are different depending on the parts of speech tag -## Look to the help of the udpipe R package -## to get parts of speech tags on text -## -library(udpipe) -data(brussels_reviews_anno, package = "udpipe") -x <- subset(brussels_reviews_anno, language == "fr") -x <- subset(x, grepl(xpos, pattern = paste(LETTERS, collapse = "|"))) -x$text <- sprintf("\%s/\%s", x$lemma, x$xpos) -x <- subset(x, !is.na(lemma)) -x <- split(x$text, list(x$doc_id, x$sentence_id)) - -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -nn <- predict(model, c("cuisine/NN", "rencontrer/VB"), type = "nearest") -nn -nn <- predict(model, c("accueillir/VBN", "accueillir/VBG"), type = "nearest") -nn - -\dontshow{\} # End of main if statement running only if the required packages are installed} -} -\references{ -\url{https://github.com/maxoodf/word2vec}, \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\seealso{ -\code{\link{predict.word2vec}}, \code{\link{as.matrix.word2vec}}, \code{\link{word2vec}}, \code{\link{word2vec.character}}, \code{\link{word2vec.list}} -} diff --git a/man/word2vec.list.Rd b/man/word2vec.list.Rd index c5d93e3..b92d8f8 100644 --- a/man/word2vec.list.Rd +++ b/man/word2vec.list.Rd @@ -15,7 +15,7 @@ negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ... 
) diff --git a/src/Makevars b/src/Makevars index 9f2426e..620ba58 100644 --- a/src/Makevars +++ b/src/Makevars @@ -6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ RcppExports.cpp diff --git a/src/Makevars.win b/src/Makevars.win index 0affdf1..459c5a1 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/win/mman.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 9b55f46..ebd9209 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,17 +5,20 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string trainFile, std::string modelFile, std::string stopWordsFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP trainFileSEXP, SEXP modelFileSEXP, SEXP stopWordsFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List 
w2v_train(Rcpp::List texts_, Rcpp::CharacterVector types_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP types_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); - Rcpp::traits::input_parameter< std::string >::type trainFile(trainFileSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type types_(types_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< std::string >::type stopWordsFile(stopWordsFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); Rcpp::traits::input_parameter< uint8_t >::type window(windowSEXP); @@ -28,35 +31,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< uint8_t >::type iterations(iterationsSEXP); Rcpp::traits::input_parameter< float >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< bool >::type withSG(withSGSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - Rcpp::traits::input_parameter< std::string >::type endOfSentenceChars(endOfSentenceCharsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type 
normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_load_model -Rcpp::List w2v_load_model(std::string file, bool normalize); -RcppExport SEXP _word2vec_w2v_load_model(SEXP fileSEXP, SEXP normalizeSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_load_model(file, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_save_model -bool w2v_save_model(SEXP ptr, std::string file); -RcppExport SEXP _word2vec_w2v_save_model(SEXP ptrSEXP, SEXP fileSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_save_model(ptr, file)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -124,45 +101,14 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// d2vec -Rcpp::List d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec(SEXP ptrSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type 
wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec(ptr, x, wordDelimiterChars)); - return rcpp_result_gen; -END_RCPP -} -// d2vec_nearest -Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec_nearest(SEXP ptr_w2vSEXP, SEXP ptr_d2vSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr_w2v(ptr_w2vSEXP); - Rcpp::traits::input_parameter< SEXP >::type ptr_d2v(ptr_d2vSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec_nearest(ptr_w2v, ptr_d2v, x, wordDelimiterChars)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { - {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 21}, - {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, - {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, + {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, - {"_word2vec_d2vec", (DL_FUNC) &_word2vec_d2vec, 3}, - {"_word2vec_d2vec_nearest", (DL_FUNC) &_word2vec_d2vec_nearest, 4}, {NULL, NULL, 0} }; diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 8be203d..dc22480 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,10 +11,8 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, - 
Rcpp::CharacterVector stopWords_, - std::string trainFile, // NOTE: remove - std::string modelFile, - std::string stopWordsFile, // NOTE: remove + Rcpp::CharacterVector types_, + std::string modelFile = "", uint16_t minWordFreq = 5, uint16_t size = 100, uint8_t window = 5, @@ -27,8 +25,6 @@ Rcpp::List w2v_train(Rcpp::List texts_, uint8_t iterations = 5, float alpha = 0.05, bool withSG = false, - std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - std::string endOfSentenceChars = ".\n?!", bool verbose = false, bool normalize = true) { @@ -51,9 +47,17 @@ Rcpp::List w2v_train(Rcpp::List texts_, */ texts_t texts = Rcpp::as(texts_); - words_t stopWords = Rcpp::as(stopWords_); - w2v::corpus_t corpus(texts, stopWords); - + types_t types = Rcpp::as(types_); + + w2v::corpus_t corpus(texts, types); + corpus.setWordFreq(); + + // Rcpp::List out2 = Rcpp::List::create( + // Rcpp::Named("frequency") = corpus.frequency + // ); + // + // return out2; + w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; trainSettings.size = size; @@ -67,40 +71,40 @@ Rcpp::List w2v_train(Rcpp::List texts_, trainSettings.iterations = iterations; trainSettings.alpha = alpha; trainSettings.withSG = withSG; - trainSettings.wordDelimiterChars = wordDelimiterChars; - trainSettings.endOfSentenceChars = endOfSentenceChars; + //trainSettings.wordDelimiterChars = wordDelimiterChars; + //trainSettings.endOfSentenceChars = endOfSentenceChars; Rcpp::XPtr model(new w2v::w2vModel_t(), true); bool trained; std::size_t vocWords; std::size_t trainWords; std::size_t totalWords; - if (verbose) { + if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove - [&p] (float _percent) { - p.update(_percent/2); - /* - std::cout << "\rParsing train data... 
" - << std::fixed << std::setprecision(2) - << _percent << "%" << std::flush; - */ - }, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: " << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << "Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + //trainFile, stopWordsFile, // NOTE: remove + // [&p] (float _percent) { + // p.update(_percent / 2); + // /* + // std::cout << "\rParsing train data... " + // << std::fixed << std::setprecision(2) + // << _percent << "%" << std::flush; + // */ + // }, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, [&p] (float _alpha, float _percent) { /* std::cout << '\r' @@ -112,30 +116,32 @@ Rcpp::List w2v_train(Rcpp::List texts_, << _percent << "%" << std::flush; */ - p.update(50+(_percent/2)); + p.update(_percent); } ); //std::cout << std::endl; } else { trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove - nullptr, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: " << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << 
"Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + //trainFile, stopWordsFile, // NOTE: remove + // nullptr, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, nullptr); } + Rcpp::Rcout << "Training done\n"; + //return Rcpp::List::create(); bool success = true; if (!trained) { Rcpp::Rcout << "Training failed: " << model->errMsg() << std::endl; @@ -144,7 +150,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, // NORMALISE UPFRONT - DIFFERENT THAN ORIGINAL CODE // - original code dumps data to disk, next imports it and during import normalisation happens after which we can do nearest calculations // - the R wrapper only writes to disk at request so we need to normalise upfront in order to do directly nearest calculations - if(normalize){ + if (normalize) { //Rcpp::Rcout << "Finished training: finalising with embedding normalisation" << std::endl; model->normalize(); } @@ -153,8 +159,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::List out = Rcpp::List::create( Rcpp::Named("model") = model, Rcpp::Named("data") = Rcpp::List::create( - Rcpp::Named("file") = trainFile, - Rcpp::Named("stopwords") = stopWordsFile, + //Rcpp::Named("file") = trainFile, + //Rcpp::Named("stopwords") = stopWordsFile, Rcpp::Named("n") = totalWords, Rcpp::Named("n_vocabulary") = trainWords ), @@ -172,16 +178,16 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::Named("negative") = negative, 
Rcpp::Named("sample") = sample, Rcpp::Named("expTableSize") = expTableSize, - Rcpp::Named("expValueMax") = expValueMax, - Rcpp::Named("split_words") = wordDelimiterChars, - Rcpp::Named("split_sents") = endOfSentenceChars + Rcpp::Named("expValueMax") = expValueMax + //Rcpp::Named("split_words") = wordDelimiterChars, + //Rcpp::Named("split_sents") = endOfSentenceChars ) ); out.attr("class") = "word2vec_trained"; return out; } - +/* // [[Rcpp::export]] Rcpp::List w2v_load_model(std::string file, bool normalize = true) { bool normalise = normalize; @@ -205,6 +211,7 @@ bool w2v_save_model(SEXP ptr, std::string file) { bool success = model->save(file); return success; } +*/ // [[Rcpp::export]] std::vector w2v_dictionary(SEXP ptr) { @@ -409,6 +416,7 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, return embedding_default; } +/* NOTE: temporarily disabled // [[Rcpp::export]] @@ -465,3 +473,5 @@ Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, ); return out; } + + */ diff --git a/src/word2vec/lib/CMakeLists.txt b/src/word2vec/lib/CMakeLists.txt index c15279d..ad6c414 100644 --- a/src/word2vec/lib/CMakeLists.txt +++ b/src/word2vec/lib/CMakeLists.txt @@ -18,8 +18,8 @@ set(PRJ_SRCS # ${PROJECT_INCLUDE_DIR}/word2vec.h # ${PROJECT_SOURCE_DIR}/c_binding.cpp ${PROJECT_SOURCE_DIR}/mapper.cpp - ${PROJECT_SOURCE_DIR}/vocabulary.hpp - ${PROJECT_SOURCE_DIR}/vocabulary.cpp +# ${PROJECT_SOURCE_DIR}/vocabulary.hpp +# ${PROJECT_SOURCE_DIR}/vocabulary.cpp ${PROJECT_SOURCE_DIR}/huffmanTree.hpp ${PROJECT_SOURCE_DIR}/huffmanTree.cpp ${PROJECT_SOURCE_DIR}/nsDistribution.hpp diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index c89e222..ad52cc5 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -13,24 +13,19 @@ namespace w2v { m_sharedData(_sharedData), m_randomDevice(), m_randomGenerator(m_randomDevice()), m_rndWindowShift(0, 
static_cast((m_sharedData.trainSettings->window - 1))), m_downSampling(), m_nsDistribution(), m_hiddenLayerVals(), m_hiddenLayerErrors(), - m_wordReader(), m_thread() { + m_thread() { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); } - if (!m_sharedData.vocabulary) { - throw std::runtime_error("vocabulary object is not initialized"); - } if (m_sharedData.trainSettings->sample > 0.0f) { m_downSampling.reset(new downSampling_t(m_sharedData.trainSettings->sample, - m_sharedData.vocabulary->trainWords())); + m_sharedData.corpus->trainWords)); } if (m_sharedData.trainSettings->negative > 0) { - std::vector frequencies; - m_sharedData.vocabulary->frequencies(frequencies); - m_nsDistribution.reset(new nsDistribution_t(frequencies)); + m_nsDistribution.reset(new nsDistribution_t(m_sharedData.corpus->frequency)); } if (m_sharedData.trainSettings->withHS && !m_sharedData.huffmanTree) { @@ -42,43 +37,31 @@ namespace w2v { m_hiddenLayerVals.reset(new std::vector(m_sharedData.trainSettings->size)); } - if (!m_sharedData.corpus && !m_sharedData.fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); - } - if (m_sharedData.fileMapper) { - auto shift = m_sharedData.fileMapper->size() / m_sharedData.trainSettings->threads; - auto startFrom = shift * _id; - auto stopAt = (_id == m_sharedData.trainSettings->threads - 1) - ? 
(m_sharedData.fileMapper->size() - 1) : (shift * (_id + 1)); - m_wordReader.reset(new wordReader_t(*m_sharedData.fileMapper, - m_sharedData.trainSettings->wordDelimiterChars, - m_sharedData.trainSettings->endOfSentenceChars, - startFrom, stopAt)); - } else { - // NOTE: specify range for workers - auto n = m_sharedData.corpus->texts.size(); - auto threads = m_sharedData.trainSettings->threads; - range = std::make_pair(floor((n / threads) * _id), - floor((n / threads) * (_id + 1)) - 1); + if (!m_sharedData.corpus) { + throw std::runtime_error("corpus object is not initialized"); } + + // NOTE: specify range for workers + auto n = m_sharedData.corpus->texts.size(); + auto threads = m_sharedData.trainSettings->threads; + range = std::make_pair(floor((n / threads) * _id), + floor((n / threads) * (_id + 1)) - 1); + } void trainThread_t::worker(std::vector &_trainMatrix) noexcept { for (auto g = m_sharedData.trainSettings->iterations; g > 0; --g) { - //Rcpp::Rcout << "g: " << (int)g << "\n"; - bool exitFlag = false; + std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - if (m_sharedData.fileMapper) - m_wordReader->reset(); - - std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations - * m_sharedData.vocabulary->trainWords(); + * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; - while (!exitFlag) { + + for (std::size_t h = range.first; h <= range.second; ++h) { + // calc alpha if (threadProcessedWords - prvThreadProcessedWords > wordsPerAlpha) { // next 0.01% processed *m_sharedData.processedWords += threadProcessedWords - prvThreadProcessedWords; @@ -98,79 +81,42 @@ namespace w2v { } } + text_t text = m_sharedData.corpus->texts[h]; + // read sentence - std::vector sentence; - if (m_sharedData.fileMapper) { - while (true) { - std::string word; - if (!m_wordReader->nextWord(word)) { - exitFlag = true; // EOF or end of requested region - break; - } 
- if (word.empty()) { - break; // end of sentence - } - - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } - } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); - } - - } else { - // Rcpp::Rcout << "h: " << h << "\n"; - if (h > range.second) { - exitFlag = true; // EOF or end of requested region - break; + std::vector sentence; + sentence.reserve(text.size()); + for (size_t i = 0; i < text.size(); ++i) { + + auto &word = text[i]; + if (word == 0) { // padding + continue; } - text_t text = m_sharedData.corpus->texts[h]; - - for (size_t i = 0; i < text.size(); i++) { - std::string word = text[i]; - if (word.empty()) { - continue; // padding - } - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... 
- if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } + threadProcessedWords++; + + if (m_sharedData.trainSettings->sample > 0.0f) { + if ((*m_downSampling)(m_sharedData.corpus->frequency[word - 1], m_randomGenerator)) { + continue; // skip this word } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); } + sentence.push_back(word - 1); // zero-based index of words } + if (m_sharedData.trainSettings->withSG) { skipGram(sentence, _trainMatrix); } else { cbow(sentence, _trainMatrix); } - h++; // move to next text } } } - inline void trainThread_t::cbow(const std::vector &_sentence, + inline void trainThread_t::cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { // hidden layers initialized with 0 values std::memset(m_hiddenLayerVals->data(), 0, m_hiddenLayerVals->size() * sizeof(float)); @@ -188,7 +134,7 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow]->index + (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size]; } cw++; @@ -199,13 +145,13 @@ namespace w2v { for (std::size_t j = 0; j < m_sharedData.trainSettings->size; j++) { (*m_hiddenLayerVals)[j] /= cw; } - + if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + hierarchicalSoftmax(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } else { - negativeSampling(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + negativeSampling(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } - + // hidden -> in for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { if (j == 
m_sharedData.trainSettings->window) { @@ -217,15 +163,17 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - _trainMatrix[k + _sentence[posRndWindow]->index * m_sharedData.trainSettings->size] + _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size] += (*m_hiddenLayerErrors)[k]; } } } } - inline void trainThread_t::skipGram(const std::vector &_sentence, + inline void trainThread_t::skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { auto rndShift = m_rndWindowShift(m_randomGenerator); for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { @@ -238,15 +186,15 @@ namespace w2v { continue; } // shift to the selected word vector in the matrix - auto shift = _sentence[posRndWindow]->index * m_sharedData.trainSettings->size; + auto shift = _sentence[posRndWindow] * m_sharedData.trainSettings->size; // hidden layer initialized with 0 values std::memset(m_hiddenLayerErrors->data(), 0, m_hiddenLayerErrors->size() * sizeof(float)); if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + hierarchicalSoftmax(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } else { - negativeSampling(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + negativeSampling(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 3e3b0ac..f7af72a 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -18,8 +18,8 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "wordReader.hpp" +//#include "vocabulary.hpp" #include "huffmanTree.hpp" 
#include "nsDistribution.hpp" #include "downSampling.hpp" @@ -41,9 +41,9 @@ namespace w2v { */ struct sharedData_t final { std::shared_ptr trainSettings; ///< trainSettings structure - std::shared_ptr vocabulary; ///< words data + //std::shared_ptr vocabulary; ///< words data std::shared_ptr corpus; ///< train data - std::shared_ptr fileMapper; /// NOTE: remove + //std::shared_ptr fileMapper; /// NOTE: remove std::shared_ptr> bpWeights; ///< back propagation weights std::shared_ptr> expTable; ///< exp(x) / (exp(x) + 1) values lookup table std::shared_ptr huffmanTree; ///< Huffman tree used by hierarchical softmax @@ -65,7 +65,6 @@ namespace w2v { std::unique_ptr m_nsDistribution; std::unique_ptr> m_hiddenLayerVals; std::unique_ptr> m_hiddenLayerErrors; - std::unique_ptr> m_wordReader; std::unique_ptr m_thread; public: @@ -91,9 +90,9 @@ namespace w2v { private: void worker(std::vector &_trainMatrix) noexcept; - inline void cbow(const std::vector &_sentence, + inline void cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; - inline void skipGram(const std::vector &_sentence, + inline void skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; inline void hierarchicalSoftmax(std::size_t _index, std::vector &_hiddenLayer, diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 22f6216..550e729 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -11,9 +11,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { trainThread_t::sharedData_t sharedData; @@ -22,18 +20,12 @@ namespace w2v { } sharedData.trainSettings = _trainSettings; - if (!_vocabulary) { - throw std::runtime_error("vocabulary object is not initialized"); - } - sharedData.vocabulary = _vocabulary; - - if (!_corpus && 
!_fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); + if (!_corpus) { + throw std::runtime_error("corpus object is not initialized"); } sharedData.corpus = _corpus; - sharedData.fileMapper = _fileMapper; - sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _vocabulary->size(), 0.0f)); + sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _corpus->types.size(), 0.0f)); sharedData.expTable.reset(new std::vector(_trainSettings->expTableSize)); for (uint16_t i = 0; i < _trainSettings->expTableSize; ++i) { // Precompute the exp() table @@ -45,9 +37,7 @@ } if (_trainSettings->withHS) { - std::vector frequencies; - _vocabulary->frequencies(frequencies); - sharedData.huffmanTree.reset(new huffmanTree_t(frequencies));; + sharedData.huffmanTree.reset(new huffmanTree_t(_corpus->frequency)); } if (_progressCallback != nullptr) { @@ -57,21 +47,12 @@ sharedData.processedWords.reset(new std::atomic(0)); sharedData.alpha.reset(new std::atomic(_trainSettings->alpha)); - // if (_corpus) { - // // NOTE : corpus has no sentence delimiter - // m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size() + 100; - // } else { - m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size(); - //} - //Rcpp::Rcout << "corpus->texts.size(): " << sharedData.corpus->texts.size() << "\n"; - //Rcpp::Rcout << "vocabulary->size(): " << sharedData.vocabulary->size() << "\n"; - //Rcpp::Rcout << "_trainSettings->threads: " << (int)_trainSettings->threads << "\n"; + // NOTE: consider setting size elsewhere + m_matrixSize = sharedData.trainSettings->size * sharedData.corpus->types.size(); + for (uint8_t i = 0; i < _trainSettings->threads; ++i) { - // trainThread_t t(i, sharedData); - // Rcpp::Rcout << "thread: " << (int)i << " from " << t.range.first << " to " << t.range.second << "\n"; m_threads.emplace_back(new trainThread_t(i, sharedData)); } - //throw
std::runtime_error("m_threads.emplace_back()"); } void trainer_t::operator()(std::vector &_trainMatrix) noexcept { @@ -83,13 +64,13 @@ namespace w2v { std::generate(_trainMatrix.begin(), _trainMatrix.end(), [&]() { float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); return v; - //return rndMatrixInitializer(randomGenerator); + //return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? }); - + for (auto &i:m_threads) { i->launch(_trainMatrix); } - + for (auto &i:m_threads) { i->join(); } diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index 19acd0b..30bc26f 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -15,8 +15,6 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" -#include "vocabulary.hpp" #include "trainThread.hpp" namespace w2v { @@ -40,9 +38,7 @@ namespace w2v { * @param _progressCallback callback function to be called on each new 0.01% processed train data */ trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback); /** diff --git a/src/word2vec/lib/vocabulary.cpp b/src/word2vec/lib/vocabulary.cpp deleted file mode 100644 index 7c4a471..0000000 --- a/src/word2vec/lib/vocabulary.cpp +++ /dev/null @@ -1,211 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#include "vocabulary.hpp" -#include "wordReader.hpp" - -namespace w2v { - vocabulary_t::vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t 
_statsCallback) noexcept: m_words() { - // load stop-words - std::vector stopWords; - if (_stopWordsMapper) { - wordReader_t wordReader(*_stopWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - stopWords.push_back(word); - } - } - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - off_t progressOffset = 0; - if (_trainWordsMapper) { - wordReader_t wordReader(*_trainWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - if (word.empty()) { - word = ""; - } - auto &i = tmpWords[word]; - if (i.frequency == 0) { - i.word = word; - } - i.frequency++; - m_totalWords++; - - if (_progressCallback != nullptr) { - if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - _progressCallback(static_cast(wordReader.offset()) - / _trainWordsMapper->size() * 100.0f); - progressOffset = wordReader.offset(); - } - } - } - } - - // remove stop words from the words set - for (auto &i:stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - { - std::string word = ""; - auto i = tmpWords.find(word); - if (i != tmpWords.end()) { - m_totalWords -= i->second.frequency; - tmpWords.erase(i); - } - } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - //Rcpp::Rcout << i.first << ": " << i.second.frequency << "\n"; - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const 
std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - }); - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } - - vocabulary_t::vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept: m_words() { - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - std::string word; - //off_t progressOffset = 0; - - for (auto &text:_corpus->texts) { - for (auto &word:text) { - // padding - if (word.empty()) { - continue; - } - auto &tmpWordData = tmpWords[word]; - if (tmpWordData.frequency == 0) { - tmpWordData.word = word; - } - tmpWordData.frequency++; - m_totalWords++; - - // if (_progressCallback != nullptr) { - // if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - // _progressCallback(static_cast(wordReader.offset()) - // / _trainWordsMapper->size() * 100.0f); - // progressOffset = wordReader.offset(); - // } - // } - } - } - - // remove stop words from the words set - for (auto &i:_corpus->stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - // { - // std::string word = ""; - // auto i = tmpWords.find(word); - // if (i != 
tmpWords.end()) { - // m_totalWords -= i->second.frequency; - // tmpWords.erase(i); - // } - // } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - - }); - // NOTE: should the index 0 be non word? - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - //wordsFreq.emplace(wordsFreq.begin(), 0, std::pair("", 0U)); // NOTE: insert dummy - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } -} diff --git a/src/word2vec/lib/vocabulary.hpp b/src/word2vec/lib/vocabulary.hpp deleted file mode 100644 index 05793fe..0000000 --- a/src/word2vec/lib/vocabulary.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ -#ifndef WORD2VEC_VOCABULARY_H -#define 
WORD2VEC_VOCABULARY_H - -#include -#include -#include -#include -#include -#include - -#include "word2vec.hpp" -#include "mapper.hpp" - -namespace w2v { - /** - * @brief vocabulary class - implements fast access to a words storage with their data - index and frequency. - * - * Vocabulary contains parsed words with minimum defined frequency, excluding stop words defined in a text file. - * Base word storage is the std::unordered_map object. - * - */ - class vocabulary_t final { - public: - /** - * @brief wordData structure is a stored word parameters - index and frequency - */ - struct wordData_t final { - std::size_t index; ///< word index (more frequent words have lower index value) - std::size_t frequency; ///< word frequency in a train data set - - /// Constructs an empty wordData object - wordData_t() noexcept: index(0), frequency(0) {} - /// Constructs a wordObject with the specified parameters - wordData_t(std::size_t _index, std::size_t _frequency) noexcept: - index(_index), frequency(_frequency) {} - }; - - private: - // word (key) with its index and frequency - using wordMap_t = std::unordered_map; - - std::size_t m_trainWords = 0; - std::size_t m_totalWords = 0; - - wordMap_t m_words; - - public: - /** - * Constructs a vocabulary object from the specified files and parameters - * @param _trainWordsMapper smart pointer to fileMapper object related to a train data set file - * @param _stopWordsMapper smart pointer to fileMapper object related to a file with stop-words. - * In case of unititialized pointer, _stopWordsMapper will be ignored. - * @param _minFreq minimum word frequency to include into vocabulary - * @param _progressCallback callback function to be called on each new 0.01% processed train data - * @param _statsCallback callback function to be called on train data loaded event to pass vocabulary size, - * train words and total words amounts. 
- */ - vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - /** - * Requests a data (index, frequency, word) associated with the _word - * @param[in] _word key value - * @return pointer to a wordData object or nullptr if the word is not a member of vocabulary - */ - inline const wordData_t *data(const std::string &_word) const noexcept { - auto i = m_words.find(_word); - if (i != m_words.end()) { - return &(i->second); - } else { - return nullptr; - } - } - - /// @retrns vocabulary size - inline std::size_t size() const noexcept { - return m_words.size(); - } - - /// @returns total words amount parsed from a train data set - inline std::size_t totalWords() const noexcept { - return m_totalWords; - } - - /// @returns train words amount (totalWords - amount(stop words) - amount(words with low frequency)) - inline std::size_t trainWords() const noexcept { - return m_trainWords; - } - - /** - * Requests word frequencies - * @param[out] _output - vector of word frequencies where vector indexes are word indexes and vector values - * are word frequencies - */ - inline void frequencies(std::vector &_output) const noexcept { - _output.resize(m_words.size()); - for (auto const &i:m_words) { - _output[i.second.index] = i.second.frequency; - } - } - - /** - * Requests words descending sorted by their frequencies - * @param[out] _words vector of word descending sorted by their frequencies - */ - inline void words(std::vector &_words) const noexcept { - _words.clear(); - std::vector> indexedWords; - for (auto const 
&i:m_words) { - indexedWords.emplace_back(std::pair(i.second.index, i.first)); - } - std::sort(indexedWords.begin(), indexedWords.end(), [](const std::pair &_what, - const std::pair &_with) { - return _what.first < _with.first; - }); - for (auto const &i:indexedWords) { - _words.push_back(i.second); - } - } - }; -} - -#endif // WORD2VEC_VOCABULARY_H diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index ea717a3..0699812 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -8,65 +8,30 @@ #include #include "word2vec.hpp" #include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "vocabulary.hpp" #include "trainer.hpp" namespace w2v { bool w2vModel_t::train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { try { // store tokens std::shared_ptr corpus(new corpus_t(_corpus)); - // map train data set file to memory - std::shared_ptr trainWordsMapper; - if (!_trainFile.empty()) { - trainWordsMapper.reset(new fileMapper_t(_trainFile)); - } - // map stop-words file to memory - std::shared_ptr stopWordsMapper; - if (!_stopWordsFile.empty()) { - stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); - } - - // build vocabulary, skip stop-words and words with frequency < minWordFreq - std::shared_ptr vocabulary; - if (!_trainFile.empty()) { - vocabulary.reset(new vocabulary_t(trainWordsMapper, - stopWordsMapper, - _trainSettings.wordDelimiterChars, - _trainSettings.endOfSentenceChars, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); - } else { - vocabulary.reset(new vocabulary_t(corpus, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); - } - // key 
words descending ordered by their indexes - std::vector words; - vocabulary->words(words); m_vectorSize = _trainSettings.size; - m_mapSize = vocabulary->size(); - + m_mapSize = corpus->types.size(); + // train model std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), - vocabulary, corpus, - trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); - //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; - + + // NOTE: directly make matrix from _trainMatrix std::size_t wordIndex = 0; - for (auto const &i:words) { + for (auto const &i : corpus->types) { + //Rcpp::Rcout << i << "\n"; auto &v = m_map[i]; v.resize(m_vectorSize); std::copy(&_trainMatrix[wordIndex * m_vectorSize], @@ -84,7 +49,7 @@ namespace w2v { return false; } - + /* bool w2vModel_t::save(const std::string &_modelFile) const noexcept { try { // save trained data in original word2vec format @@ -334,4 +299,5 @@ namespace w2v { i /= med; } } + */ } diff --git a/src/word2vec/include/word2vec.hpp b/src/word2vec/lib/word2vec.hpp similarity index 88% rename from src/word2vec/include/word2vec.hpp rename to src/word2vec/lib/word2vec.hpp index fd9f1d2..3485e3c 100644 --- a/src/word2vec/include/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,9 +19,13 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; +typedef std::vector types_t; +// typedef std::vector words_t; +// typedef std::vector text_t; +typedef std::vector words_t; +typedef std::vector text_t; typedef std::vector texts_t; +typedef std::vector frequency_t; namespace w2v { @@ -31,12 +35,44 @@ namespace w2v { class corpus_t final { public: texts_t texts; - words_t stopWords; + types_t types; + frequency_t frequency; + size_t totalWords; + size_t trainWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, words_t _stopWords): texts(_texts), stopWords(_stopWords) {} - + corpus_t(texts_t _texts, types_t _types): + texts(_texts), types(_types) {} + + void 
setWordFreq() { + + frequency = frequency_t(types.size(), 0); + totalWords = 0; + trainWords = 0; + for (size_t h = 0; h < texts.size(); h++) { + text_t text = texts[h]; + for (size_t i = 0; i < text.size(); i++) { + totalWords++; + auto &word = text[i]; + //Rcpp::Rcout << i << ": " << word << "\n"; + if (word < 0 || types.size() < word) + throw std::range_error("setWordFreq: invalid types"); + if (word == 0) // padding + continue; + // if (types[word - 1].empty()) { + // word = 0; // remove and pad + // continue; + // } + frequency[word - 1]++; + trainWords++; + } + } + Rcpp::Rcout << "trainWords: " << trainWords << "\n"; + Rcpp::Rcout << "totalWords: " << totalWords << "\n"; + Rcpp::Rcout << "frequency.size(): " << frequency.size() << "\n"; + Rcpp::Rcout << "types.size(): " << types.size() << "\n"; + } }; /** @@ -55,6 +91,7 @@ namespace w2v { uint8_t iterations = 5; ///< train iterations float alpha = 0.05f; ///< starting learn rate bool withSG = false; ///< use Skip-Gram instead of CBOW + // TODO: remove std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r"; std::string endOfSentenceChars = ".\n?!"; trainSettings_t() = default; @@ -169,11 +206,6 @@ namespace w2v { /// Direct access to the word-vector map const map_t &map() {return m_map;} - /// pure virtual method to save model of a derived class - virtual bool save(const std::string &_modelFile) const noexcept = 0; - /// pure virtual method to load model of a derived class - virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; - /** * Vector access by key value * @param _key key value uniquely identifying vector in model @@ -291,16 +323,8 @@ namespace w2v { */ bool train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, 
trainProgressCallback_t _trainProgressCallback) noexcept; - /// saves word vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; - /// loads word vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; /** * Normalise vectors */ @@ -354,9 +378,9 @@ namespace w2v { m_mapSize = m_map.size(); } /// saves document vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; + //bool save(const std::string &_modelFile) const noexcept override; /// loads document vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; + //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; }; /** diff --git a/tests/test.R b/tests/test.R new file mode 100644 index 0000000..e081c84 --- /dev/null +++ b/tests/test.R @@ -0,0 +1,45 @@ +library(quanteda) +library(word2vec) +library(LSX) + +data_corpus_guardian <- readRDS("/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds") +corp <- data_corpus_guardian %>% +#corp <- data_corpus_inaugural %>% + corpus_reshape() +toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% + tokens_remove(stopwords(), padding = TRUE) %>% + tokens_tolower() +ndoc(toks) + +mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, + iterations = 5, minWordFreq = 5, threads = 6) +dim(as.matrix(mod)) +predict(mod, c("people", "american"), type = "nearest") + +dfmt <- dfm(toks, remove_padding = TRUE) %>% + dfm_trim(min_termfreq = 5) +lss <- textmodel_lss(dfmt, c("good" = 1, "bad" = -1), cache = TRUE) +head(coef(lss)) +tail(coef(lss)) + +lss2 <- as.textmodel_lss(t(as.matrix(mod)), c("good" = 1, "bad" = -1)) +head(coef(lss2)) +tail(coef(lss2)) + +lis <- as.list(toks) +mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = TRUE, threads = 4) +emb_lis <- 
as.matrix(mod_lis) +dim(emb_lis) +pred_lis <- predict(mod_lis, c("people", "American"), type = "nearest") + +#saveRDS(mod_lis, "tests/word2vec_v04.RDS") + +microbenchmark::microbenchmark( + "lis" = word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = FALSE, threads = 10), + "txt" = word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, + verbose = FALSE, threads = 10), + times = 10 +) +