From 3c09c711799808020b55d335c7e9d569ca2d12e6 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:39:50 +0900 Subject: [PATCH 01/26] Add test file --- tests/train.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/train.R diff --git a/tests/train.R b/tests/train.R new file mode 100644 index 0000000..030fc2b --- /dev/null +++ b/tests/train.R @@ -0,0 +1,30 @@ +library(quanteda) +library(word2vec) + +corp <- data_corpus_inaugural %>% + corpus_reshape() +toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) +lis <- as.list(toks) +txt <- stringi::stri_c_list(lis, " ") + +mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = TRUE, threads = 4) +emb_lis <- as.matrix(mod_lis) +dim(emb_lis) +predict(mod_lis, c("people", "American"), type = "nearest") + +mod_txt <- word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, + verbose = TRUE, threads = 4) +emb_txt <- as.matrix(mod_txt) +dim(emb_txt) +predict(mod_txt, c("people", "American"), type = "nearest") + + +microbenchmark::microbenchmark( + "lis" = word2vec(lis, dim = 50, iter = 5, min_count = 5, + verbose = FALSE, threads = 10), + "txt" = word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, + verbose = FALSE, threads = 10), + times = 10 +) + From 8bd538532befe31e33a337448549460dcb490dcc Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:40:19 +0900 Subject: [PATCH 02/26] Change to serialized tokens --- src/rcpp_word2vec.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 8be203d..5161f7a 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -75,7 +75,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, std::size_t vocWords; std::size_t trainWords; std::size_t totalWords; - if (verbose) { + if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, trainFile, stopWordsFile, // NOTE: remove @@ -112,7 +112,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, << _percent << "%" << std::flush; */ - p.update(50+(_percent/2)); + p.update(50 + (_percent / 2)); } ); //std::cout << std::endl; From 18783ac2ee293a9f01ddf05e27471d9d51354a82 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 07:46:22 +0900 Subject: [PATCH 03/26] Move header file --- src/word2vec/{include => lib}/word2vec.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) rename src/word2vec/{include => lib}/word2vec.hpp (97%) diff --git a/src/word2vec/include/word2vec.hpp b/src/word2vec/lib/word2vec.hpp similarity index 97% rename from src/word2vec/include/word2vec.hpp rename to src/word2vec/lib/word2vec.hpp index fd9f1d2..ef2961e 100644 --- a/src/word2vec/include/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,8 +19,8 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; +typedef std::vector words_t; +typedef std::vector text_t; typedef std::vector texts_t; namespace w2v { @@ -31,11 +31,15 @@ namespace w2v { class corpus_t final { public: texts_t texts; + words_t types; words_t stopWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, words_t _stopWords): texts(_texts), stopWords(_stopWords) {} + // corpus_t(texts_t _texts, words_t _types, words_t _stopWords): + // texts(_texts), types(_types), stopWords(_stopWords) {} + corpus_t(texts_t _texts, words_t _stopWords): + texts(_texts), stopWords(_stopWords) {} }; From 693de35c1a058002ea0f3b2f0baaeb34542d9900 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Tue, 21 Nov 2023 18:26:57 +0900 Subject: [PATCH 04/26] Disable parameters for file inputs --- R/RcppExports.R | 4 ++-- src/RcppExports.cpp | 17 ++++++++------ src/rcpp_word2vec.cpp | 14 +++++------ src/word2vec/lib/trainer.cpp | 15 +++++++----- src/word2vec/lib/trainer.hpp | 2 +- src/word2vec/lib/word2vec.cpp | 44 +++++++++++++++++------------------ src/word2vec/lib/word2vec.hpp | 10 ++++---- 7 files changed, 56 insertions(+), 50 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 279c425..c7b440a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) +w2v_train <- function(texts_, stopWords_, modelFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) } w2v_load_model <- function(file, normalize = TRUE) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 9b55f46..b34d216 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,17 +5,20 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string trainFile, std::string modelFile, std::string stopWordsFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP trainFileSEXP, SEXP modelFileSEXP, SEXP stopWordsFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::IntegerVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); - Rcpp::traits::input_parameter< std::string >::type trainFile(trainFileSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type stopWords_(stopWords_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); - Rcpp::traits::input_parameter< std::string >::type stopWordsFile(stopWordsFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); Rcpp::traits::input_parameter< uint8_t >::type window(windowSEXP); @@ -32,7 +35,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< std::string >::type endOfSentenceChars(endOfSentenceCharsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, trainFile, modelFile, stopWordsFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -153,7 +156,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 21}, + {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 19}, {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 5161f7a..115070b 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,10 +11,8 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, - Rcpp::CharacterVector stopWords_, - std::string trainFile, // NOTE: remove + Rcpp::IntegerVector stopWords_, std::string modelFile, - std::string stopWordsFile, // NOTE: remove uint16_t minWordFreq = 5, uint16_t size = 100, uint8_t window = 5, @@ -78,9 +76,9 @@ Rcpp::List w2v_train(Rcpp::List texts_, if (verbose) { // NOTE: consider removing progress bar Progress p(100, true); trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove + //trainFile, stopWordsFile, // NOTE: remove [&p] (float _percent) { - p.update(_percent/2); + p.update(_percent / 2); /* std::cout << "\rParsing train data... " << std::fixed << std::setprecision(2) @@ -118,7 +116,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, //std::cout << std::endl; } else { trained = model->train(trainSettings, corpus, - trainFile, stopWordsFile, // NOTE: remove + //trainFile, stopWordsFile, // NOTE: remove nullptr, [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { /* @@ -153,8 +151,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::List out = Rcpp::List::create( Rcpp::Named("model") = model, Rcpp::Named("data") = Rcpp::List::create( - Rcpp::Named("file") = trainFile, - Rcpp::Named("stopwords") = stopWordsFile, + //Rcpp::Named("file") = trainFile, + //Rcpp::Named("stopwords") = stopWordsFile, Rcpp::Named("n") = totalWords, Rcpp::Named("n_vocabulary") = trainWords ), diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 22f6216..d875bf1 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -13,7 +13,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove + //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { trainThread_t::sharedData_t sharedData; @@ -26,12 +26,15 @@ namespace w2v { throw std::runtime_error("vocabulary object is not initialized"); } sharedData.vocabulary = _vocabulary; - - if (!_corpus && !_fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); + + if (!_corpus) { + throw std::runtime_error("corpus is objects is not initialized"); } - sharedData.corpus = _corpus; - sharedData.fileMapper = _fileMapper; + // if (!_corpus && !_fileMapper) { + // throw std::runtime_error("corpus and file mapper objects are not initialized"); + // } + // sharedData.corpus = _corpus; + // sharedData.fileMapper = _fileMapper; sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _vocabulary->size(), 0.0f)); sharedData.expTable.reset(new std::vector(_trainSettings->expTableSize)); diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index 19acd0b..1506cc7 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -42,7 +42,7 @@ namespace w2v { trainer_t(const std::shared_ptr &_trainSettings, const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - const std::shared_ptr &_fileMapper, // NOTE: remove + //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback); /** diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index ea717a3..dbd8caa 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -14,8 +14,8 @@ namespace w2v { bool w2vModel_t::train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove + //const std::string &_trainFile, // NOTE: remove + //const std::string &_stopWordsFile, // NOTE: remove vocabularyProgressCallback_t _vocabularyProgressCallback, vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { @@ -23,32 +23,32 @@ namespace w2v { // store tokens std::shared_ptr corpus(new corpus_t(_corpus)); // map train data set file to memory - std::shared_ptr trainWordsMapper; - if (!_trainFile.empty()) { - trainWordsMapper.reset(new fileMapper_t(_trainFile)); - } - // map stop-words file to memory - std::shared_ptr stopWordsMapper; - if (!_stopWordsFile.empty()) { - stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); - } + // std::shared_ptr trainWordsMapper; + // if (!_trainFile.empty()) { + // trainWordsMapper.reset(new fileMapper_t(_trainFile)); + // } + // // map stop-words file to memory + // std::shared_ptr stopWordsMapper; + // if (!_stopWordsFile.empty()) { + // stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); + // } // build vocabulary, skip stop-words and words with frequency < minWordFreq std::shared_ptr vocabulary; - if (!_trainFile.empty()) { - vocabulary.reset(new vocabulary_t(trainWordsMapper, - stopWordsMapper, - _trainSettings.wordDelimiterChars, - _trainSettings.endOfSentenceChars, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); - } else { + // if (!_trainFile.empty()) { + // vocabulary.reset(new vocabulary_t(trainWordsMapper, + // stopWordsMapper, + // _trainSettings.wordDelimiterChars, + // _trainSettings.endOfSentenceChars, + // _trainSettings.minWordFreq, + // _vocabularyProgressCallback, + // _vocabularyStatsCallback)); + // } else { vocabulary.reset(new vocabulary_t(corpus, _trainSettings.minWordFreq, _vocabularyProgressCallback, _vocabularyStatsCallback)); - } + //} // key words descending ordered by their indexes std::vector words; @@ -61,7 +61,7 @@ namespace w2v { trainer_t(std::make_shared(_trainSettings), vocabulary, corpus, - trainWordsMapper, // NOTE: remove + //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index ef2961e..fba9f8e 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,8 +19,10 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; +typedef std::vector words_t; +typedef std::vector text_t; +// typedef std::vector words_t; +// typedef std::vector text_t; typedef std::vector texts_t; namespace w2v { @@ -295,8 +297,8 @@ namespace w2v { */ bool train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - const std::string &_trainFile, // NOTE: remove - const std::string &_stopWordsFile, // NOTE: remove + //const std::string &_trainFile, // NOTE: remove + //const std::string &_stopWordsFile, // NOTE: remove vocabularyProgressCallback_t _vocabularyProgressCallback, vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept; From 0664202c49674ce0bdf7570230691c33aab57eb9 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 24 Nov 2023 17:47:38 +0900 Subject: [PATCH 05/26] Remove code for file inputs --- src/rcpp_word2vec.cpp | 19 +++--- src/word2vec/lib/trainThread.cpp | 110 ++++++++++--------------------- src/word2vec/lib/trainThread.hpp | 5 +- src/word2vec/lib/trainer.cpp | 4 +- src/word2vec/lib/trainer.hpp | 2 +- src/word2vec/lib/vocabulary.cpp | 105 +---------------------------- src/word2vec/lib/vocabulary.hpp | 16 ++--- 7 files changed, 60 insertions(+), 201 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 115070b..2eb5021 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,8 +11,8 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, - Rcpp::IntegerVector stopWords_, - std::string modelFile, + Rcpp::CharacterVector stopWords_, + std::string modelFile = "", uint16_t minWordFreq = 5, uint16_t size = 100, uint8_t window = 5, @@ -25,8 +25,6 @@ Rcpp::List w2v_train(Rcpp::List texts_, uint8_t iterations = 5, float alpha = 0.05, bool withSG = false, - std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - std::string endOfSentenceChars = ".\n?!", bool verbose = false, bool normalize = true) { @@ -65,8 +63,8 @@ Rcpp::List w2v_train(Rcpp::List texts_, trainSettings.iterations = iterations; trainSettings.alpha = alpha; trainSettings.withSG = withSG; - trainSettings.wordDelimiterChars = wordDelimiterChars; - trainSettings.endOfSentenceChars = endOfSentenceChars; + //trainSettings.wordDelimiterChars = wordDelimiterChars; + //trainSettings.endOfSentenceChars = endOfSentenceChars; Rcpp::XPtr model(new w2v::w2vModel_t(), true); bool trained; @@ -170,9 +168,9 @@ Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::Named("negative") = negative, Rcpp::Named("sample") = sample, Rcpp::Named("expTableSize") = expTableSize, - Rcpp::Named("expValueMax") = expValueMax, - Rcpp::Named("split_words") = wordDelimiterChars, - Rcpp::Named("split_sents") = endOfSentenceChars + Rcpp::Named("expValueMax") = expValueMax + //Rcpp::Named("split_words") = wordDelimiterChars, + //Rcpp::Named("split_sents") = endOfSentenceChars ) ); out.attr("class") = "word2vec_trained"; @@ -407,6 +405,7 @@ Rcpp::NumericMatrix w2v_read_binary(const std::string modelFile, bool normalize, return embedding_default; } +/* NOTE: temporarily disabled // [[Rcpp::export]] @@ -463,3 +462,5 @@ Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, ); return out; } + + */ diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index c89e222..9edacbb 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -13,7 +13,7 @@ namespace w2v { m_sharedData(_sharedData), m_randomDevice(), m_randomGenerator(m_randomDevice()), m_rndWindowShift(0, static_cast((m_sharedData.trainSettings->window - 1))), m_downSampling(), m_nsDistribution(), m_hiddenLayerVals(), m_hiddenLayerErrors(), - m_wordReader(), m_thread() { + m_thread() { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); @@ -42,25 +42,16 @@ namespace w2v { m_hiddenLayerVals.reset(new std::vector(m_sharedData.trainSettings->size)); } - if (!m_sharedData.corpus && !m_sharedData.fileMapper) { - throw std::runtime_error("corpus and file mapper objects are not initialized"); - } - if (m_sharedData.fileMapper) { - auto shift = m_sharedData.fileMapper->size() / m_sharedData.trainSettings->threads; - auto startFrom = shift * _id; - auto stopAt = (_id == m_sharedData.trainSettings->threads - 1) - ? (m_sharedData.fileMapper->size() - 1) : (shift * (_id + 1)); - m_wordReader.reset(new wordReader_t(*m_sharedData.fileMapper, - m_sharedData.trainSettings->wordDelimiterChars, - m_sharedData.trainSettings->endOfSentenceChars, - startFrom, stopAt)); - } else { - // NOTE: specify range for workers - auto n = m_sharedData.corpus->texts.size(); - auto threads = m_sharedData.trainSettings->threads; - range = std::make_pair(floor((n / threads) * _id), - floor((n / threads) * (_id + 1)) - 1); + if (!m_sharedData.corpus) { + throw std::runtime_error("corpus object is not initialized"); } + + // NOTE: specify range for workers + auto n = m_sharedData.corpus->texts.size(); + auto threads = m_sharedData.trainSettings->threads; + range = std::make_pair(floor((n / threads) * _id), + floor((n / threads) * (_id + 1)) - 1); + } void trainThread_t::worker(std::vector &_trainMatrix) noexcept { @@ -71,9 +62,6 @@ namespace w2v { std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - if (m_sharedData.fileMapper) - m_wordReader->reset(); - std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.vocabulary->trainWords(); @@ -100,65 +88,37 @@ namespace w2v { // read sentence std::vector sentence; - if (m_sharedData.fileMapper) { - while (true) { - std::string word; - if (!m_wordReader->nextWord(word)) { - exitFlag = true; // EOF or end of requested region - break; - } - if (word.empty()) { - break; // end of sentence - } - - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } - } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); + + // Rcpp::Rcout << "h: " << h << "\n"; + if (h > range.second) { + exitFlag = true; // EOF or end of requested region + break; + } + text_t text = m_sharedData.corpus->texts[h]; + + for (size_t i = 0; i < text.size(); i++) { + + std::string word = text[i]; + if (word.empty()) { + continue; // padding } - - } else { - // Rcpp::Rcout << "h: " << h << "\n"; - if (h > range.second) { - exitFlag = true; // EOF or end of requested region - break; + auto wordData = m_sharedData.vocabulary->data(word); + if (wordData == nullptr) { + continue; // no such word } - text_t text = m_sharedData.corpus->texts[h]; - for (size_t i = 0; i < text.size(); i++) { - - std::string word = text[i]; - if (word.empty()) { - continue; // padding - } - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } - - threadProcessedWords++; - - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - continue; // skip this word - } + threadProcessedWords++; + + if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... + if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + continue; // skip this word } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); } + //if (h == 1) + // Rcpp::Rcout << word << ": " << wordData->index << "\n"; + sentence.push_back(wordData); } + if (m_sharedData.trainSettings->withSG) { skipGram(sentence, _trainMatrix); } else { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 3e3b0ac..4cafc36 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -18,7 +18,7 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" #include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" @@ -43,7 +43,7 @@ namespace w2v { std::shared_ptr trainSettings; ///< trainSettings structure std::shared_ptr vocabulary; ///< words data std::shared_ptr corpus; ///< train data - std::shared_ptr fileMapper; /// NOTE: remove + //std::shared_ptr fileMapper; /// NOTE: remove std::shared_ptr> bpWeights; ///< back propagation weights std::shared_ptr> expTable; ///< exp(x) / (exp(x) + 1) values lookup table std::shared_ptr huffmanTree; ///< Huffman tree used by hierarchical softmax @@ -65,7 +65,6 @@ namespace w2v { std::unique_ptr m_nsDistribution; std::unique_ptr> m_hiddenLayerVals; std::unique_ptr> m_hiddenLayerErrors; - std::unique_ptr> m_wordReader; std::unique_ptr m_thread; public: diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index d875bf1..5836c55 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -28,8 +28,10 @@ namespace w2v { sharedData.vocabulary = _vocabulary; if (!_corpus) { - throw std::runtime_error("corpus is objects is not initialized"); + throw std::runtime_error("corpus is object is not initialized"); } + sharedData.corpus = _corpus; + // if (!_corpus && !_fileMapper) { // throw std::runtime_error("corpus and file mapper objects are not initialized"); // } diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index 1506cc7..fcfba3b 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -15,7 +15,7 @@ #include #include "word2vec.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" #include "vocabulary.hpp" #include "trainThread.hpp" diff --git a/src/word2vec/lib/vocabulary.cpp b/src/word2vec/lib/vocabulary.cpp index 7c4a471..a7d61b4 100644 --- a/src/word2vec/lib/vocabulary.cpp +++ b/src/word2vec/lib/vocabulary.cpp @@ -7,112 +7,9 @@ */ #include "vocabulary.hpp" -#include "wordReader.hpp" +//#include "wordReader.hpp" namespace w2v { - vocabulary_t::vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept: m_words() { - // load stop-words - std::vector stopWords; - if (_stopWordsMapper) { - wordReader_t wordReader(*_stopWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - stopWords.push_back(word); - } - } - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - off_t progressOffset = 0; - if (_trainWordsMapper) { - wordReader_t wordReader(*_trainWordsMapper, _wordDelimiterChars, _endOfSentenceChars); - std::string word; - while (wordReader.nextWord(word)) { - if (word.empty()) { - word = ""; - } - auto &i = tmpWords[word]; - if (i.frequency == 0) { - i.word = word; - } - i.frequency++; - m_totalWords++; - - if (_progressCallback != nullptr) { - if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - _progressCallback(static_cast(wordReader.offset()) - / _trainWordsMapper->size() * 100.0f); - progressOffset = wordReader.offset(); - } - } - } - } - - // remove stop words from the words set - for (auto &i:stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - { - std::string word = ""; - auto i = tmpWords.find(word); - if (i != tmpWords.end()) { - m_totalWords -= i->second.frequency; - tmpWords.erase(i); - } - } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - //Rcpp::Rcout << i.first << ": " << i.second.frequency << "\n"; - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - }); - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } vocabulary_t::vocabulary_t(std::shared_ptr &_corpus, uint16_t _minFreq, diff --git a/src/word2vec/lib/vocabulary.hpp b/src/word2vec/lib/vocabulary.hpp index 05793fe..28b0a99 100644 --- a/src/word2vec/lib/vocabulary.hpp +++ b/src/word2vec/lib/vocabulary.hpp @@ -16,7 +16,7 @@ #include #include "word2vec.hpp" -#include "mapper.hpp" +//#include "mapper.hpp" namespace w2v { /** @@ -62,13 +62,13 @@ namespace w2v { * @param _statsCallback callback function to be called on train data loaded event to pass vocabulary size, * train words and total words amounts. */ - vocabulary_t(std::shared_ptr &_trainWordsMapper, - std::shared_ptr &_stopWordsMapper, - const std::string &_wordDelimiterChars, - const std::string &_endOfSentenceChars, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; + // vocabulary_t(std::shared_ptr &_trainWordsMapper, + // std::shared_ptr &_stopWordsMapper, + // const std::string &_wordDelimiterChars, + // const std::string &_endOfSentenceChars, + // uint16_t _minFreq, + // w2vModel_t::vocabularyProgressCallback_t _progressCallback, + // w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; vocabulary_t(std::shared_ptr &_corpus, uint16_t _minFreq, From 1e9ab7d4978844800923e6fa1aedba5c6ea450f9 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Thu, 30 Nov 2023 13:58:03 +0900 Subject: [PATCH 06/26] Remove method for character --- NAMESPACE | 1 - R/RcppExports.R | 12 +-- R/word2vec.R | 79 ++---------------- man/word2vec.character.Rd | 167 -------------------------------------- man/word2vec.list.Rd | 2 +- src/RcppExports.cpp | 41 ++-------- 6 files changed, 13 insertions(+), 289 deletions(-) delete mode 100644 man/word2vec.character.Rd diff --git a/NAMESPACE b/NAMESPACE index 816efe4..e2f823c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ S3method(predict,word2vec) S3method(predict,word2vec_trained) S3method(summary,word2vec) S3method(summary,word2vec_trained) -S3method(word2vec,character) S3method(word2vec,list) export(doc2vec) export(read.word2vec) diff --git a/R/RcppExports.R b/R/RcppExports.R index c7b440a..d1533a5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, modelFile, minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", endOfSentenceChars = ".\n?!", verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize) +w2v_train <- function(texts_, stopWords_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) } w2v_load_model <- function(file, normalize = TRUE) { @@ -33,11 +33,3 @@ w2v_read_binary <- function(modelFile, normalize, n) { .Call('_word2vec_w2v_read_binary', PACKAGE = 'word2vec', modelFile, normalize, n) } -d2vec <- function(ptr, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec', PACKAGE = 'word2vec', ptr, x, wordDelimiterChars) -} - -d2vec_nearest <- function(ptr_w2v, ptr_d2v, x, wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r") { - .Call('_word2vec_d2vec_nearest', PACKAGE = 'word2vec', ptr_w2v, ptr_d2v, x, wordDelimiterChars) -} - diff --git a/R/word2vec.R b/R/word2vec.R index 2376f01..6cfbcea 100644 --- a/R/word2vec.R +++ b/R/word2vec.R @@ -126,75 +126,6 @@ word2vec <- function(x, UseMethod("word2vec") } -#' @inherit word2vec title description params details seealso return references examples -#' @param split a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x} -#' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -#' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -#' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector. -#' @param useBytes logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}. -#' @export -word2vec.character <- function(x, - type = c("cbow", "skip-gram"), - dim = 50, window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r", - ".\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ...){ - type <- match.arg(type) - stopw <- stopwords - model <- file.path(tempdir(), "w2v.bin") - if(length(stopw) == 0){ - stopw <- "" - } - file_stopwords <- tempfile() - filehandle_stopwords <- file(file_stopwords, open = "wt", encoding = encoding) - writeLines(stopw, con = filehandle_stopwords, useBytes = useBytes) - close(filehandle_stopwords) - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - }) - if(length(x) == 1){ - file_train <- x - }else{ - file_train <- tempfile(pattern = "textspace_", fileext = ".txt") - on.exit({ - if (file.exists(file_stopwords)) file.remove(file_stopwords) - if (file.exists(file_train)) file.remove(file_train) - }) - filehandle_train <- file(file_train, open = "wt", encoding = encoding) - writeLines(text = x, con = filehandle_train, useBytes = useBytes) - close(filehandle_train) - } - #expTableSize <- 1000L - #expValueMax <- 6L - #expTableSize <- as.integer(expTableSize) - #expValueMax <- as.integer(expValueMax) - min_count <- as.integer(min_count) - dim <- as.integer(dim) - window <- as.integer(window) - iter <- as.integer(iter) - sample <- as.numeric(sample) - hs <- as.logical(hs) - negative <- as.integer(negative) - threads <- as.integer(threads) - iter <- as.integer(iter) - lr <- as.numeric(lr) - skipgram <- as.logical(type %in% "skip-gram") - split <- as.character(split) - model <- w2v_train(list(), character(), - trainFile = file_train, modelFile = model, stopWordsFile = file_stopwords, - minWordFreq = min_count, - size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, - sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = split[1], endOfSentenceChars = split[2], ...) - model$data$stopwords <- stopwords - model -} - #' @inherit word2vec title description params details seealso return references #' @export #' @examples @@ -229,12 +160,12 @@ word2vec.list <- function(x, type = c("cbow", "skip-gram"), dim = 50, window = ifelse(type == "cbow", 5L, 10L), iter = 5L, lr = 0.05, hs = FALSE, negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ...){ - x <- lapply(x, as.character) + #x <- lapply(x, as.character) type <- match.arg(type) - stopwords <- as.character(stopwords) + stopwords <- as.integer(stopwords) model <- file.path(tempdir(), "w2v.bin") #expTableSize <- 1000L #expValueMax <- 6L @@ -253,11 +184,11 @@ word2vec.list <- function(x, skipgram <- as.logical(type %in% "skip-gram") encoding <- "UTF-8" model <- w2v_train(x, stopwords, - trainFile = "", modelFile = model, stopWordsFile = "", + modelFile = model, minWordFreq = min_count, size = dim, window = window, #expTableSize = expTableSize, expValueMax = expValueMax, sample = sample, withHS = hs, negative = negative, threads = threads, iterations = iter, - alpha = lr, withSG = skipgram, wordDelimiterChars = "", endOfSentenceChars = "", ...) + alpha = lr, withSG = skipgram, ...) model$data$stopwords <- stopwords model } diff --git a/man/word2vec.character.Rd b/man/word2vec.character.Rd deleted file mode 100644 index 6a4aaa9..0000000 --- a/man/word2vec.character.Rd +++ /dev/null @@ -1,167 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/word2vec.R -\name{word2vec.character} -\alias{word2vec.character} -\title{Train a word2vec model on text} -\usage{ -\method{word2vec}{character}( - x, - type = c("cbow", "skip-gram"), - dim = 50, - window = ifelse(type == "cbow", 5L, 10L), - iter = 5L, - lr = 0.05, - hs = FALSE, - negative = 5L, - sample = 0.001, - min_count = 5L, - stopwords = character(), - threads = 1L, - split = c(" \\n,.-!?:;/\\"#$\%&'()*+<=>@[]\\\\^_`{|}~\\t\\v\\f\\r", ".\\n?!"), - encoding = "UTF-8", - useBytes = TRUE, - ... -) -} -\arguments{ -\item{x}{a character vector with text or the path to the file on disk containing training data or a list of tokens. See the examples.} - -\item{type}{the type of algorithm to use, either 'cbow' or 'skip-gram'. Defaults to 'cbow'} - -\item{dim}{dimension of the word vectors. Defaults to 50.} - -\item{window}{skip length between words. Defaults to 5.} - -\item{iter}{number of training iterations. Defaults to 5.} - -\item{lr}{initial learning rate also known as alpha. Defaults to 0.05} - -\item{hs}{logical indicating to use hierarchical softmax instead of negative sampling. Defaults to FALSE indicating to do negative sampling.} - -\item{negative}{integer with the number of negative samples. Only used in case hs is set to FALSE} - -\item{sample}{threshold for occurrence of words. Defaults to 0.001} - -\item{min_count}{integer indicating the number of time a word should occur to be considered as part of the training vocabulary. Defaults to 5.} - -\item{stopwords}{a character vector of stopwords to exclude from training} - -\item{threads}{number of CPU threads to use. Defaults to 1.} - -\item{split}{a character vector of length 2 where the first element indicates how to split words and the second element indicates how to split sentences in \code{x}} - -\item{encoding}{the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'. -Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument -is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.} - -\item{useBytes}{logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}.} - -\item{...}{further arguments passed on to the methods \code{\link{word2vec.character}}, \code{\link{word2vec.list}} as well as the C++ function \code{w2v_train} - for expert use only} -} -\value{ -an object of class \code{w2v_trained} which is a list with elements -\itemize{ -\item{model: a Rcpp pointer to the model} -\item{data: a list with elements file: the training data used, stopwords: the character vector of stopwords, n} -\item{vocabulary: the number of words in the vocabulary} -\item{success: logical indicating if training succeeded} -\item{error_log: the error log in case training failed} -\item{control: as list of the training arguments used, namely min_count, dim, window, iter, lr, skipgram, hs, negative, sample, split_words, split_sents, expTableSize and expValueMax} -} -} -\description{ -Construct a word2vec model on text. The algorithm is explained at \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\details{ -Some advice on the optimal set of parameters to use for training as defined by Mikolov et al. -\itemize{ -\item{argument type: skip-gram (slower, better for infrequent words) vs cbow (fast)} -\item{argument hs: the training algorithm: hierarchical softmax (better for infrequent words) vs negative sampling (better for frequent words, better with low dimensional vectors)} -\item{argument dim: dimensionality of the word vectors: usually more is better, but not always} -\item{argument window: for skip-gram usually around 10, for cbow around 5} -\item{argument sample: sub-sampling of frequent words: can improve both accuracy and speed for large data sets (useful values are in range 0.001 to 0.00001)} -} -} -\examples{ -\dontshow{if(require(udpipe))\{} -library(udpipe) -## Take data and standardise it a bit -data(brussels_reviews, package = "udpipe") -x <- subset(brussels_reviews, language == "nl") -x <- tolower(x$feedback) - -## Build the model get word embeddings and nearest neighbours -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -head(emb) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## Get vocabulary -vocab <- summary(model, type = "vocabulary") - -# Do some calculations with the vectors and find similar terms to these -emb <- as.matrix(model) -vector <- emb["buurt", ] - emb["rustige", ] + emb["restaurants", ] -predict(model, vector, type = "nearest", top_n = 10) - -vector <- emb["gastvrouw", ] - emb["gastvrij", ] -predict(model, vector, type = "nearest", top_n = 5) - -vectors <- emb[c("gastheer", "gastvrouw"), ] -vectors <- rbind(vectors, avg = colMeans(vectors)) -predict(model, vectors, type = "nearest", top_n = 10) - -## Save the model to hard disk -path <- "mymodel.bin" -\dontshow{ -path <- tempfile(pattern = "w2v", fileext = ".bin") -} -write.word2vec(model, file = path) -model <- read.word2vec(path) - -\dontshow{ -file.remove(path) -} -## -## Example of word2vec with a list of tokens -## -toks <- strsplit(x, split = "[[:space:][:punct:]]+") -model <- word2vec(x = toks, dim = 15, iter = 20) -emb <- as.matrix(model) -emb <- predict(model, c("bus", "toilet", "unknownword"), type = "embedding") -emb -nn <- predict(model, c("bus", "toilet"), type = "nearest", top_n = 5) -nn - -## -## Example getting word embeddings -## which are different depending on the parts of speech tag -## Look to the help of the udpipe R package -## to get parts of speech tags on text -## -library(udpipe) -data(brussels_reviews_anno, package = "udpipe") -x <- subset(brussels_reviews_anno, language == "fr") -x <- subset(x, grepl(xpos, pattern = paste(LETTERS, collapse = "|"))) -x$text <- sprintf("\%s/\%s", x$lemma, x$xpos) -x <- subset(x, !is.na(lemma)) -x <- split(x$text, list(x$doc_id, x$sentence_id)) - -model <- word2vec(x = x, dim = 15, iter = 20) -emb <- as.matrix(model) -nn <- predict(model, c("cuisine/NN", "rencontrer/VB"), type = "nearest") -nn -nn <- predict(model, c("accueillir/VBN", "accueillir/VBG"), type = "nearest") -nn - -\dontshow{\} # End of main if statement running only if the required packages are installed} -} -\references{ -\url{https://github.com/maxoodf/word2vec}, \url{https://arxiv.org/pdf/1310.4546.pdf} -} -\seealso{ -\code{\link{predict.word2vec}}, \code{\link{as.matrix.word2vec}}, \code{\link{word2vec}}, \code{\link{word2vec.character}}, \code{\link{word2vec.list}} -} diff --git a/man/word2vec.list.Rd b/man/word2vec.list.Rd index c5d93e3..b92d8f8 100644 --- a/man/word2vec.list.Rd +++ b/man/word2vec.list.Rd @@ -15,7 +15,7 @@ negative = 5L, sample = 0.001, min_count = 5L, - stopwords = character(), + stopwords = integer(), threads = 1L, ... ) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b34d216..c8bfcda 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,13 +11,13 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::IntegerVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, std::string wordDelimiterChars, std::string endOfSentenceChars, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP wordDelimiterCharsSEXP, SEXP endOfSentenceCharsSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type stopWords_(stopWords_SEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); @@ -31,11 +31,9 @@ BEGIN_RCPP Rcpp::traits::input_parameter< uint8_t >::type iterations(iterationsSEXP); Rcpp::traits::input_parameter< float >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< bool >::type withSG(withSGSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - Rcpp::traits::input_parameter< std::string >::type endOfSentenceChars(endOfSentenceCharsSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, wordDelimiterChars, endOfSentenceChars, verbose, normalize)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -127,36 +125,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// d2vec -Rcpp::List d2vec(SEXP ptr, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec(SEXP ptrSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec(ptr, x, wordDelimiterChars)); - return rcpp_result_gen; -END_RCPP -} -// d2vec_nearest -Rcpp::DataFrame d2vec_nearest(SEXP ptr_w2v, SEXP ptr_d2v, Rcpp::StringVector x, std::string wordDelimiterChars); -RcppExport SEXP _word2vec_d2vec_nearest(SEXP ptr_w2vSEXP, SEXP ptr_d2vSEXP, SEXP xSEXP, SEXP wordDelimiterCharsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr_w2v(ptr_w2vSEXP); - Rcpp::traits::input_parameter< SEXP >::type ptr_d2v(ptr_d2vSEXP); - Rcpp::traits::input_parameter< Rcpp::StringVector >::type x(xSEXP); - Rcpp::traits::input_parameter< std::string >::type wordDelimiterChars(wordDelimiterCharsSEXP); - rcpp_result_gen = Rcpp::wrap(d2vec_nearest(ptr_w2v, ptr_d2v, x, wordDelimiterChars)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { - {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 19}, + {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, @@ -164,8 +135,6 @@ static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, {"_word2vec_w2v_nearest_vector", (DL_FUNC) &_word2vec_w2v_nearest_vector, 4}, {"_word2vec_w2v_read_binary", (DL_FUNC) &_word2vec_w2v_read_binary, 3}, - {"_word2vec_d2vec", (DL_FUNC) &_word2vec_d2vec, 3}, - {"_word2vec_d2vec_nearest", (DL_FUNC) &_word2vec_d2vec_nearest, 4}, {NULL, NULL, 0} }; From ebebf341c38ebcbf4df01d91c27015e6783e2844 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Thu, 30 Nov 2023 14:51:05 +0900 Subject: [PATCH 07/26] Change tokens from string to int --- src/rcpp_word2vec.cpp | 10 ++++++--- src/word2vec/lib/trainThread.cpp | 37 ++++++++++++++++---------------- src/word2vec/lib/trainThread.hpp | 4 ++-- src/word2vec/lib/word2vec.hpp | 32 ++++++++++++++++++--------- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 2eb5021..c89179b 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -11,6 +11,7 @@ // [[Rcpp::depends(RcppProgress)]] // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, + Rcpp::CharacterVector types_, Rcpp::CharacterVector stopWords_, std::string modelFile = "", uint16_t minWordFreq = 5, @@ -47,9 +48,12 @@ Rcpp::List w2v_train(Rcpp::List texts_, */ texts_t texts = Rcpp::as(texts_); - words_t stopWords = Rcpp::as(stopWords_); - w2v::corpus_t corpus(texts, stopWords); - + types_t types = Rcpp::as(types_); + types_t stopWords = Rcpp::as(stopWords_); + + w2v::corpus_t corpus(texts, types, stopWords); + corpus.setWordFreq(); + w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; trainSettings.size = size; diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 9edacbb..1844ad4 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -87,7 +87,7 @@ namespace w2v { } // read sentence - std::vector sentence; + std::vector sentence; // Rcpp::Rcout << "h: " << h << "\n"; if (h > range.second) { @@ -98,25 +98,26 @@ namespace w2v { for (size_t i = 0; i < text.size(); i++) { - std::string word = text[i]; - if (word.empty()) { + unsigned int word = text[i]; + if (word == 0) { continue; // padding } - auto wordData = m_sharedData.vocabulary->data(word); - if (wordData == nullptr) { - continue; // no such word - } + // auto wordData = m_sharedData.vocabulary->data(word); + // if (wordData == nullptr) { + // continue; // no such word + // } threadProcessedWords++; if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + //if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { + if ((*m_downSampling)(m_sharedData.corpus->frequency[word], m_randomGenerator)) { continue; // skip this word } } //if (h == 1) // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(wordData); + sentence.push_back(word); } if (m_sharedData.trainSettings->withSG) { @@ -129,7 +130,7 @@ namespace w2v { } } - inline void trainThread_t::cbow(const std::vector &_sentence, + inline void trainThread_t::cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { for (std::size_t i = 0; i < _sentence.size(); ++i) { // hidden layers initialized with 0 values @@ -148,7 +149,7 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow]->index + (*m_hiddenLayerVals)[k] += _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size]; } cw++; @@ -161,9 +162,9 @@ namespace w2v { } if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + hierarchicalSoftmax(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } else { - negativeSampling(_sentence[i]->index, *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); + negativeSampling(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } // hidden -> in @@ -177,14 +178,14 @@ namespace w2v { continue; } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { - _trainMatrix[k + _sentence[posRndWindow]->index * m_sharedData.trainSettings->size] + _trainMatrix[k + _sentence[posRndWindow] * m_sharedData.trainSettings->size] += (*m_hiddenLayerErrors)[k]; } } } } - inline void trainThread_t::skipGram(const std::vector &_sentence, + inline void trainThread_t::skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { for (std::size_t i = 0; i < _sentence.size(); ++i) { auto rndShift = m_rndWindowShift(m_randomGenerator); @@ -198,15 +199,15 @@ namespace w2v { continue; } // shift to the selected word vector in the matrix - auto shift = _sentence[posRndWindow]->index * m_sharedData.trainSettings->size; + auto shift = _sentence[posRndWindow] * m_sharedData.trainSettings->size; // hidden layer initialized with 0 values std::memset(m_hiddenLayerErrors->data(), 0, m_hiddenLayerErrors->size() * sizeof(float)); if (m_sharedData.trainSettings->withHS) { - hierarchicalSoftmax(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + hierarchicalSoftmax(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } else { - negativeSampling(_sentence[i]->index, (*m_hiddenLayerErrors), _trainMatrix, shift); + negativeSampling(_sentence[i], (*m_hiddenLayerErrors), _trainMatrix, shift); } for (std::size_t k = 0; k < m_sharedData.trainSettings->size; ++k) { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 4cafc36..63f5f17 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -90,9 +90,9 @@ namespace w2v { private: void worker(std::vector &_trainMatrix) noexcept; - inline void cbow(const std::vector &_sentence, + inline void cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; - inline void skipGram(const std::vector &_sentence, + inline void skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept; inline void hierarchicalSoftmax(std::size_t _index, std::vector &_hiddenLayer, diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index fba9f8e..aa47de1 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -19,11 +19,12 @@ #include #include -typedef std::vector words_t; -typedef std::vector text_t; -// typedef std::vector words_t; -// typedef std::vector text_t; +typedef std::vector types_t; +// typedef std::vector words_t; +// typedef std::vector text_t; +typedef std::vector text_t; typedef std::vector texts_t; +typedef std::vector frequency_t; namespace w2v { @@ -33,16 +34,27 @@ namespace w2v { class corpus_t final { public: texts_t texts; - words_t types; - words_t stopWords; + types_t types; + types_t stopWords; + frequency_t frequency; // Constructors corpus_t(): texts() {} - // corpus_t(texts_t _texts, words_t _types, words_t _stopWords): - // texts(_texts), types(_types), stopWords(_stopWords) {} - corpus_t(texts_t _texts, words_t _stopWords): - texts(_texts), stopWords(_stopWords) {} + corpus_t(texts_t _texts, types_t _types, types_t _stopWords): + texts(_texts), types(_types), stopWords(_stopWords) {} + //corpus_t(texts_t _texts, words_t _stopWords): + // texts(_texts), stopWords(_stopWords) {} + void setWordFreq() { + frequency = frequency_t(types.size(), 0); + for (size_t h = 0; h < texts.size(); h++) { + text_t text = texts[h]; + for (size_t i = 0; i < text.size(); i++) { + unsigned int word = text[i]; + frequency[word - 1]++; + } + } + } }; /** From 511eeb7d6620993bc6445281168d6faa10b45204 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Wed, 6 Dec 2023 15:01:00 +0900 Subject: [PATCH 08/26] Remove vocabulary --- src/Makevars | 1 - src/Makevars.win | 1 - src/word2vec/lib/vocabulary.cpp | 108 ------------------------ src/word2vec/lib/vocabulary.hpp | 140 -------------------------------- 4 files changed, 250 deletions(-) delete mode 100644 src/word2vec/lib/vocabulary.cpp delete mode 100644 src/word2vec/lib/vocabulary.hpp diff --git a/src/Makevars b/src/Makevars index 9f2426e..620ba58 100644 --- a/src/Makevars +++ b/src/Makevars @@ -6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ RcppExports.cpp diff --git a/src/Makevars.win b/src/Makevars.win index 0affdf1..459c5a1 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -6,7 +6,6 @@ SOURCES = word2vec/lib/huffmanTree.cpp \ word2vec/lib/nsDistribution.cpp \ word2vec/lib/trainer.cpp \ word2vec/lib/trainThread.cpp \ - word2vec/lib/vocabulary.cpp \ word2vec/lib/win/mman.cpp \ word2vec/lib/word2vec.cpp \ rcpp_word2vec.cpp \ diff --git a/src/word2vec/lib/vocabulary.cpp b/src/word2vec/lib/vocabulary.cpp deleted file mode 100644 index a7d61b4..0000000 --- a/src/word2vec/lib/vocabulary.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ - -#include "vocabulary.hpp" -//#include "wordReader.hpp" - -namespace w2v { - - vocabulary_t::vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept: m_words() { - - // load words and calculate their frequencies - struct tmpWordData_t { - std::size_t frequency = 0; - std::string word; - }; - std::unordered_map tmpWords; - std::string word; - //off_t progressOffset = 0; - - for (auto &text:_corpus->texts) { - for (auto &word:text) { - // padding - if (word.empty()) { - continue; - } - auto &tmpWordData = tmpWords[word]; - if (tmpWordData.frequency == 0) { - tmpWordData.word = word; - } - tmpWordData.frequency++; - m_totalWords++; - - // if (_progressCallback != nullptr) { - // if (wordReader.offset() - progressOffset >= _trainWordsMapper->size() / 10000 - 1) { - // _progressCallback(static_cast(wordReader.offset()) - // / _trainWordsMapper->size() * 100.0f); - // progressOffset = wordReader.offset(); - // } - // } - } - } - - // remove stop words from the words set - for (auto &i:_corpus->stopWords) { - tmpWords.erase(i); - } - - // remove sentence delimiter from the words set - // { - // std::string word = ""; - // auto i = tmpWords.find(word); - // if (i != tmpWords.end()) { - // m_totalWords -= i->second.frequency; - // tmpWords.erase(i); - // } - // } - - // prepare vector sorted by word frequencies - std::vector> wordsFreq; - // delimiter is the first word - wordsFreq.emplace_back(std::pair("", 0LU)); - for (auto const &i:tmpWords) { - if (i.second.frequency >= _minFreq) { - wordsFreq.emplace_back(std::pair(i.first, i.second.frequency)); - m_trainWords += i.second.frequency; - } - } - - // sorting, from more frequent to less frequent, skip delimiter (first word) - if (wordsFreq.size() > 1) { - std::sort(wordsFreq.begin() + 1, wordsFreq.end(), [](const std::pair &_what, - const std::pair &_with) { - if(_what.second == _with.second){ - return _what.first > _with.first; - }else{ - return _what.second > _with.second; - } - - }); - // NOTE: should the index 0 be non word? - // make delimiter frequency more then the most frequent word - wordsFreq[0].second = wordsFreq[1].second + 1; - // restore sentence delimiter - auto &i = tmpWords[""]; - i.word = ""; - i.frequency = wordsFreq[0].second; - } - // fill index values - //wordsFreq.emplace(wordsFreq.begin(), 0, std::pair("", 0U)); // NOTE: insert dummy - for (std::size_t i = 0; i < wordsFreq.size(); ++i) { - auto &w = tmpWords[wordsFreq[i].first]; - m_words[wordsFreq[i].first] = wordData_t(i, w.frequency); - //Rcpp::Rcout << i << " " << wordsFreq[i].first << ": " << wordsFreq[i].second << "\n"; - } - - if (_statsCallback != nullptr) { - _statsCallback(m_words.size(), m_trainWords, m_totalWords); - } - } -} diff --git a/src/word2vec/lib/vocabulary.hpp b/src/word2vec/lib/vocabulary.hpp deleted file mode 100644 index 28b0a99..0000000 --- a/src/word2vec/lib/vocabulary.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/** - * @file - * @brief vocabulary class containing word map, words frequencies and word indexes - * @author Max Fomichev - * @date 16.12.2016 - * @copyright Apache License v.2 (http://www.apache.org/licenses/LICENSE-2.0) -*/ -#ifndef WORD2VEC_VOCABULARY_H -#define WORD2VEC_VOCABULARY_H - -#include -#include -#include -#include -#include -#include - -#include "word2vec.hpp" -//#include "mapper.hpp" - -namespace w2v { - /** - * @brief vocabulary class - implements fast access to a words storage with their data - index and frequency. - * - * Vocabulary contains parsed words with minimum defined frequency, excluding stop words defined in a text file. - * Base word storage is the std::unordered_map object. - * - */ - class vocabulary_t final { - public: - /** - * @brief wordData structure is a stored word parameters - index and frequency - */ - struct wordData_t final { - std::size_t index; ///< word index (more frequent words have lower index value) - std::size_t frequency; ///< word frequency in a train data set - - /// Constructs an empty wordData object - wordData_t() noexcept: index(0), frequency(0) {} - /// Constructs a wordObject with the specified parameters - wordData_t(std::size_t _index, std::size_t _frequency) noexcept: - index(_index), frequency(_frequency) {} - }; - - private: - // word (key) with its index and frequency - using wordMap_t = std::unordered_map; - - std::size_t m_trainWords = 0; - std::size_t m_totalWords = 0; - - wordMap_t m_words; - - public: - /** - * Constructs a vocabulary object from the specified files and parameters - * @param _trainWordsMapper smart pointer to fileMapper object related to a train data set file - * @param _stopWordsMapper smart pointer to fileMapper object related to a file with stop-words. - * In case of unititialized pointer, _stopWordsMapper will be ignored. - * @param _minFreq minimum word frequency to include into vocabulary - * @param _progressCallback callback function to be called on each new 0.01% processed train data - * @param _statsCallback callback function to be called on train data loaded event to pass vocabulary size, - * train words and total words amounts. - */ - // vocabulary_t(std::shared_ptr &_trainWordsMapper, - // std::shared_ptr &_stopWordsMapper, - // const std::string &_wordDelimiterChars, - // const std::string &_endOfSentenceChars, - // uint16_t _minFreq, - // w2vModel_t::vocabularyProgressCallback_t _progressCallback, - // w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - vocabulary_t(std::shared_ptr &_corpus, - uint16_t _minFreq, - w2vModel_t::vocabularyProgressCallback_t _progressCallback, - w2vModel_t::vocabularyStatsCallback_t _statsCallback) noexcept; - - /** - * Requests a data (index, frequency, word) associated with the _word - * @param[in] _word key value - * @return pointer to a wordData object or nullptr if the word is not a member of vocabulary - */ - inline const wordData_t *data(const std::string &_word) const noexcept { - auto i = m_words.find(_word); - if (i != m_words.end()) { - return &(i->second); - } else { - return nullptr; - } - } - - /// @retrns vocabulary size - inline std::size_t size() const noexcept { - return m_words.size(); - } - - /// @returns total words amount parsed from a train data set - inline std::size_t totalWords() const noexcept { - return m_totalWords; - } - - /// @returns train words amount (totalWords - amount(stop words) - amount(words with low frequency)) - inline std::size_t trainWords() const noexcept { - return m_trainWords; - } - - /** - * Requests word frequencies - * @param[out] _output - vector of word frequencies where vector indexes are word indexes and vector values - * are word frequencies - */ - inline void frequencies(std::vector &_output) const noexcept { - _output.resize(m_words.size()); - for (auto const &i:m_words) { - _output[i.second.index] = i.second.frequency; - } - } - - /** - * Requests words descending sorted by their frequencies - * @param[out] _words vector of word descending sorted by their frequencies - */ - inline void words(std::vector &_words) const noexcept { - _words.clear(); - std::vector> indexedWords; - for (auto const &i:m_words) { - indexedWords.emplace_back(std::pair(i.second.index, i.first)); - } - std::sort(indexedWords.begin(), indexedWords.end(), [](const std::pair &_what, - const std::pair &_with) { - return _what.first < _with.first; - }); - for (auto const &i:indexedWords) { - _words.push_back(i.second); - } - } - }; -} - -#endif // WORD2VEC_VOCABULARY_H From e54321c7db9b41d2a75dca142de621cf1aa78920 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 07:20:14 +0900 Subject: [PATCH 09/26] Set frequency --- src/word2vec/lib/word2vec.hpp | 36 +++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index aa47de1..dea4df2 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -22,9 +22,10 @@ typedef std::vector types_t; // typedef std::vector words_t; // typedef std::vector text_t; +typedef std::vector words_t; typedef std::vector text_t; typedef std::vector texts_t; -typedef std::vector frequency_t; +typedef std::vector frequency_t; namespace w2v { @@ -35,25 +36,52 @@ namespace w2v { public: texts_t texts; types_t types; - types_t stopWords; + words_t stopWords; frequency_t frequency; + size_t totalWords; + size_t trainWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, types_t _types, types_t _stopWords): + corpus_t(texts_t _texts, types_t _types, words_t _stopWords): texts(_texts), types(_types), stopWords(_stopWords) {} //corpus_t(texts_t _texts, words_t _stopWords): // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { + Rcpp::Rcout << "here1\n"; + + std::unordered_set setStopWords; + for (size_t g = 0; g < stopWords.size(); g++) { + setStopWords.insert(stopWords[g]); + } + //Rcpp::Rcout << "here2\n"; + //return; + frequency = frequency_t(types.size(), 0); + totalWords = 0; + trainWords = 0; for (size_t h = 0; h < texts.size(); h++) { text_t text = texts[h]; for (size_t i = 0; i < text.size(); i++) { - unsigned int word = text[i]; + int word = text[i]; + //Rcpp::Rcout << i << ": " << word << "\n"; + if (word == 0) // padding + continue; + if (word < 0 || frequency.size() < word - 1) + throw std::range_error("setWordFreq: invalid types"); frequency[word - 1]++; + totalWords++; + auto it = setStopWords.find(word); + if (it != setStopWords.end()) { + trainWords++; + } else { + texts[h][i] = 0; + } } } + Rcpp::Rcout << "trainWords: " << trainWords << "\n"; + Rcpp::Rcout << "totalWords: " << totalWords << "\n"; } }; From 6d75e56d9c9fad851de06c2bfc053c6770b5fd14 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 09:02:32 +0900 Subject: [PATCH 10/26] Remove stopwords where types == "" --- src/rcpp_word2vec.cpp | 10 +++++++--- src/word2vec/lib/word2vec.hpp | 29 ++++++++++++----------------- tests/{train.R => test.R} | 11 +++++++++-- 3 files changed, 28 insertions(+), 22 deletions(-) rename tests/{train.R => test.R} (78%) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index c89179b..61c9a27 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -12,7 +12,6 @@ // [[Rcpp::export]] Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector types_, - Rcpp::CharacterVector stopWords_, std::string modelFile = "", uint16_t minWordFreq = 5, uint16_t size = 100, @@ -49,11 +48,16 @@ Rcpp::List w2v_train(Rcpp::List texts_, texts_t texts = Rcpp::as(texts_); types_t types = Rcpp::as(types_); - types_t stopWords = Rcpp::as(stopWords_); - w2v::corpus_t corpus(texts, types, stopWords); + w2v::corpus_t corpus(texts, types); corpus.setWordFreq(); + Rcpp::List out2 = Rcpp::List::create( + Rcpp::Named("frequency") = corpus.frequency + ); + + //return out2; + w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; trainSettings.size = size; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index dea4df2..aec14cc 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -36,47 +36,42 @@ namespace w2v { public: texts_t texts; types_t types; - words_t stopWords; frequency_t frequency; size_t totalWords; size_t trainWords; // Constructors corpus_t(): texts() {} - corpus_t(texts_t _texts, types_t _types, words_t _stopWords): - texts(_texts), types(_types), stopWords(_stopWords) {} + corpus_t(texts_t _texts, types_t _types): + texts(_texts), types(_types) {} //corpus_t(texts_t _texts, words_t _stopWords): // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { Rcpp::Rcout << "here1\n"; - std::unordered_set setStopWords; - for (size_t g = 0; g < stopWords.size(); g++) { - setStopWords.insert(stopWords[g]); - } - //Rcpp::Rcout << "here2\n"; - //return; - + std::unordered_set setStopWords; + frequency = frequency_t(types.size(), 0); totalWords = 0; trainWords = 0; for (size_t h = 0; h < texts.size(); h++) { text_t text = texts[h]; for (size_t i = 0; i < text.size(); i++) { - int word = text[i]; + totalWords++; + int &word = text[i]; //Rcpp::Rcout << i << ": " << word << "\n"; if (word == 0) // padding continue; - if (word < 0 || frequency.size() < word - 1) + if (word < 0 || types.size() < word) throw std::range_error("setWordFreq: invalid types"); frequency[word - 1]++; - totalWords++; - auto it = setStopWords.find(word); - if (it != setStopWords.end()) { - trainWords++; + if (types[word - 1].empty()) { + //Rcpp::Rcout << h << " " << i << " remove : " << word << "\n"; + word = 0; // remove and pad } else { - texts[h][i] = 0; + //Rcpp::Rcout << h << " " << i << " count : " << word << "\n"; + trainWords++; } } } diff --git a/tests/train.R b/tests/test.R similarity index 78% rename from tests/train.R rename to tests/test.R index 030fc2b..e232226 100644 --- a/tests/train.R +++ b/tests/test.R @@ -4,8 +4,15 @@ library(word2vec) corp <- data_corpus_inaugural %>% corpus_reshape() toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) -lis <- as.list(toks) -txt <- stringi::stri_c_list(lis, " ") +lis <- unclass(toks) + +type <- types(toks) +type[type %in% stopwords()] <- "" +mod <- word2vec:::w2v_train(toks, type, verbose = TRUE) +dim(as.matrix(mod)) + +mod2 <- word2vec:::w2v_train(unclass(toks)[1:10], types(toks), verbose = TRUE) +dim(as.matrix(mod2)) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4) From 837e7c8fb412b96d1fe20645f88dcbc77e5f96f0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 10:18:36 +0900 Subject: [PATCH 11/26] Don't remove any words --- src/rcpp_word2vec.cpp | 10 +++++----- src/word2vec/lib/word2vec.hpp | 21 ++++++++------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 61c9a27..5982359 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -52,11 +52,11 @@ Rcpp::List w2v_train(Rcpp::List texts_, w2v::corpus_t corpus(texts, types); corpus.setWordFreq(); - Rcpp::List out2 = Rcpp::List::create( - Rcpp::Named("frequency") = corpus.frequency - ); - - //return out2; + // Rcpp::List out2 = Rcpp::List::create( + // Rcpp::Named("frequency") = corpus.frequency + // ); + // + // return out2; w2v::trainSettings_t trainSettings; trainSettings.minWordFreq = minWordFreq; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index aec14cc..9407bb9 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -48,10 +48,7 @@ namespace w2v { // texts(_texts), stopWords(_stopWords) {} void setWordFreq() { - Rcpp::Rcout << "here1\n"; - std::unordered_set setStopWords; - frequency = frequency_t(types.size(), 0); totalWords = 0; trainWords = 0; @@ -59,20 +56,18 @@ namespace w2v { text_t text = texts[h]; for (size_t i = 0; i < text.size(); i++) { totalWords++; - int &word = text[i]; + auto &word = text[i]; //Rcpp::Rcout << i << ": " << word << "\n"; - if (word == 0) // padding - continue; if (word < 0 || types.size() < word) throw std::range_error("setWordFreq: invalid types"); + if (word == 0) // padding + continue; + // if (types[word - 1].empty()) { + // word = 0; // remove and pad + // continue; + // } frequency[word - 1]++; - if (types[word - 1].empty()) { - //Rcpp::Rcout << h << " " << i << " remove : " << word << "\n"; - word = 0; // remove and pad - } else { - //Rcpp::Rcout << h << " " << i << " count : " << word << "\n"; - trainWords++; - } + trainWords++; } } Rcpp::Rcout << "trainWords: " << trainWords << "\n"; From ed28dcb038d7917a30ae9c0dffa821daa2c0dea5 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 10 Feb 2024 10:32:08 +0900 Subject: [PATCH 12/26] Disable save() and load() --- src/rcpp_word2vec.cpp | 4 +++- src/word2vec/lib/word2vec.cpp | 25 +++++++++++++------------ src/word2vec/lib/word2vec.hpp | 12 ++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index 5982359..a5714ae 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -140,6 +140,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, }, nullptr); } + //return Rcpp::List::create(); bool success = true; if (!trained) { Rcpp::Rcout << "Training failed: " << model->errMsg() << std::endl; @@ -185,7 +186,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, return out; } - +/* // [[Rcpp::export]] Rcpp::List w2v_load_model(std::string file, bool normalize = true) { bool normalise = normalize; @@ -209,6 +210,7 @@ bool w2v_save_model(SEXP ptr, std::string file) { bool success = model->save(file); return success; } +*/ // [[Rcpp::export]] std::vector w2v_dictionary(SEXP ptr) { diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index dbd8caa..51d6b21 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -8,7 +8,7 @@ #include #include "word2vec.hpp" #include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "vocabulary.hpp" #include "trainer.hpp" namespace w2v { @@ -34,7 +34,7 @@ namespace w2v { // } // build vocabulary, skip stop-words and words with frequency < minWordFreq - std::shared_ptr vocabulary; + //std::shared_ptr vocabulary; // if (!_trainFile.empty()) { // vocabulary.reset(new vocabulary_t(trainWordsMapper, // stopWordsMapper, @@ -44,29 +44,29 @@ namespace w2v { // _vocabularyProgressCallback, // _vocabularyStatsCallback)); // } else { - vocabulary.reset(new vocabulary_t(corpus, - _trainSettings.minWordFreq, - _vocabularyProgressCallback, - _vocabularyStatsCallback)); + // vocabulary.reset(new vocabulary_t(corpus, + // _trainSettings.minWordFreq, + // _vocabularyProgressCallback, + // _vocabularyStatsCallback)); //} // key words descending ordered by their indexes - std::vector words; - vocabulary->words(words); + //std::vector words; + //vocabulary->words(words); m_vectorSize = _trainSettings.size; - m_mapSize = vocabulary->size(); + m_mapSize = corpus->types.size(); // train model std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), - vocabulary, + //vocabulary, corpus, //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; std::size_t wordIndex = 0; - for (auto const &i:words) { + for (auto const &i : corpus->types) { auto &v = m_map[i]; v.resize(m_vectorSize); std::copy(&_trainMatrix[wordIndex * m_vectorSize], @@ -84,7 +84,7 @@ namespace w2v { return false; } - + /* bool w2vModel_t::save(const std::string &_modelFile) const noexcept { try { // save trained data in original word2vec format @@ -334,4 +334,5 @@ namespace w2v { i /= med; } } + */ } diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index 9407bb9..ae34896 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -206,9 +206,9 @@ namespace w2v { const map_t &map() {return m_map;} /// pure virtual method to save model of a derived class - virtual bool save(const std::string &_modelFile) const noexcept = 0; + //virtual bool save(const std::string &_modelFile) const noexcept = 0; /// pure virtual method to load model of a derived class - virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; + //virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; /** * Vector access by key value @@ -334,9 +334,9 @@ namespace w2v { trainProgressCallback_t _trainProgressCallback) noexcept; /// saves word vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; + //bool save(const std::string &_modelFile) const noexcept override; /// loads word vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; + //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; /** * Normalise vectors */ @@ -390,9 +390,9 @@ namespace w2v { m_mapSize = m_map.size(); } /// saves document vectors to file with _modelFile name - bool save(const std::string &_modelFile) const noexcept override; + //bool save(const std::string &_modelFile) const noexcept override; /// loads document vectors from file with _modelFile name - bool load(const std::string &_modelFile, bool normalize = true) noexcept override; + //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; }; /** From 193c3510aa3a56508801486b6aa6ed3ecbae3638 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 09:34:28 +0900 Subject: [PATCH 13/26] Remove progress bar and callback for vocaburary --- src/rcpp_word2vec.cpp | 79 ++++++++++++++++++----------------- src/word2vec/lib/word2vec.cpp | 7 ++-- src/word2vec/lib/word2vec.hpp | 4 +- 3 files changed, 46 insertions(+), 44 deletions(-) diff --git a/src/rcpp_word2vec.cpp b/src/rcpp_word2vec.cpp index a5714ae..dc22480 100644 --- a/src/rcpp_word2vec.cpp +++ b/src/rcpp_word2vec.cpp @@ -83,28 +83,28 @@ Rcpp::List w2v_train(Rcpp::List texts_, Progress p(100, true); trained = model->train(trainSettings, corpus, //trainFile, stopWordsFile, // NOTE: remove - [&p] (float _percent) { - p.update(_percent / 2); - /* - std::cout << "\rParsing train data... " - << std::fixed << std::setprecision(2) - << _percent << "%" << std::flush; - */ - }, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: " << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << "Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + // [&p] (float _percent) { + // p.update(_percent / 2); + // /* + // std::cout << "\rParsing train data... " + // << std::fixed << std::setprecision(2) + // << _percent << "%" << std::flush; + // */ + // }, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, [&p] (float _alpha, float _percent) { /* std::cout << '\r' @@ -116,30 +116,31 @@ Rcpp::List w2v_train(Rcpp::List texts_, << _percent << "%" << std::flush; */ - p.update(50 + (_percent / 2)); + p.update(_percent); } ); //std::cout << std::endl; } else { trained = model->train(trainSettings, corpus, //trainFile, stopWordsFile, // NOTE: remove - nullptr, - [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { - /* - Rcpp::Rcerr << std::endl - << "Finished reading data: " << std::endl - << "Vocabulary size: " << _vocWords << std::endl - << "Train words: " << _trainWords << std::endl - << "Total words: " << _totalWords << std::endl - << "Start training" << std::endl - << std::endl; - */ - vocWords = _vocWords; - trainWords = _trainWords; - totalWords = _totalWords; - }, + // nullptr, + // [&vocWords, &trainWords, &totalWords] (std::size_t _vocWords, std::size_t _trainWords, std::size_t _totalWords) { + // /* + // Rcpp::Rcerr << std::endl + // << "Finished reading data: " << std::endl + // << "Vocabulary size: " << _vocWords << std::endl + // << "Train words: " << _trainWords << std::endl + // << "Total words: " << _totalWords << std::endl + // << "Start training" << std::endl + // << std::endl; + // */ + // vocWords = _vocWords; + // trainWords = _trainWords; + // totalWords = _totalWords; + // }, nullptr); } + Rcpp::Rcout << "Training done\n"; //return Rcpp::List::create(); bool success = true; if (!trained) { @@ -149,7 +150,7 @@ Rcpp::List w2v_train(Rcpp::List texts_, // NORMALISE UPFRONT - DIFFERENT THAN ORIGINAL CODE // - original code dumps data to disk, next imports it and during import normalisation happens after which we can do nearest calculations // - the R wrapper only writes to disk at request so we need to normalise upfront in order to do directly nearest calculations - if(normalize){ + if (normalize) { //Rcpp::Rcout << "Finished training: finalising with embedding normalisation" << std::endl; model->normalize(); } diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 51d6b21..41a1234 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -16,8 +16,8 @@ namespace w2v { const corpus_t &_corpus, //const std::string &_trainFile, // NOTE: remove //const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, + //vocabularyProgressCallback_t _vocabularyProgressCallback, + //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { try { // store tokens @@ -57,7 +57,7 @@ namespace w2v { m_mapSize = corpus->types.size(); // train model - std::vector _trainMatrix; + std::vector _trainMatrix; // NOTE: consider directly making m_map trainer_t(std::make_shared(_trainSettings), //vocabulary, corpus, @@ -67,6 +67,7 @@ namespace w2v { std::size_t wordIndex = 0; for (auto const &i : corpus->types) { + //Rcpp::Rcout << i << "\n"; auto &v = m_map[i]; v.resize(m_vectorSize); std::copy(&_trainMatrix[wordIndex * m_vectorSize], diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index ae34896..e2f2dfe 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -329,8 +329,8 @@ namespace w2v { const corpus_t &_corpus, //const std::string &_trainFile, // NOTE: remove //const std::string &_stopWordsFile, // NOTE: remove - vocabularyProgressCallback_t _vocabularyProgressCallback, - vocabularyStatsCallback_t _vocabularyStatsCallback, + //vocabularyProgressCallback_t _vocabularyProgressCallback, + //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept; /// saves word vectors to file with _modelFile name From 1d0d355bed1ed9a9872e4d0cb8f0664de4bcf023 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 10:31:05 +0900 Subject: [PATCH 14/26] Improve handling of sentence lenghts --- src/word2vec/lib/trainThread.cpp | 41 ++++++++++++++++---------------- src/word2vec/lib/trainThread.hpp | 4 ++-- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 1844ad4..6e15457 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -18,19 +18,19 @@ namespace w2v { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); } - if (!m_sharedData.vocabulary) { - throw std::runtime_error("vocabulary object is not initialized"); - } + // if (!m_sharedData.vocabulary) { + // throw std::runtime_error("vocabulary object is not initialized"); + // } if (m_sharedData.trainSettings->sample > 0.0f) { m_downSampling.reset(new downSampling_t(m_sharedData.trainSettings->sample, - m_sharedData.vocabulary->trainWords())); + m_sharedData.corpus->trainWords)); } if (m_sharedData.trainSettings->negative > 0) { - std::vector frequencies; - m_sharedData.vocabulary->frequencies(frequencies); - m_nsDistribution.reset(new nsDistribution_t(frequencies)); + //std::vector frequencies; + //m_sharedData.vocabulary->frequencies(frequencies); + m_nsDistribution.reset(new nsDistribution_t(m_sharedData.corpus->frequency)); } if (m_sharedData.trainSettings->withHS && !m_sharedData.huffmanTree) { @@ -64,7 +64,7 @@ namespace w2v { std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations - * m_sharedData.vocabulary->trainWords(); + * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; while (!exitFlag) { // calc alpha @@ -86,32 +86,26 @@ namespace w2v { } } - // read sentence - std::vector sentence; - - // Rcpp::Rcout << "h: " << h << "\n"; if (h > range.second) { exitFlag = true; // EOF or end of requested region break; } text_t text = m_sharedData.corpus->texts[h]; + // read sentence + std::vector sentence; + sentence.reserve(text.size()); for (size_t i = 0; i < text.size(); i++) { - unsigned int word = text[i]; + auto &word = text[i]; if (word == 0) { continue; // padding } - // auto wordData = m_sharedData.vocabulary->data(word); - // if (wordData == nullptr) { - // continue; // no such word - // } - + threadProcessedWords++; - if (m_sharedData.trainSettings->sample > 0.0f) { // down-sampling... - //if ((*m_downSampling)(wordData->frequency, m_randomGenerator)) { - if ((*m_downSampling)(m_sharedData.corpus->frequency[word], m_randomGenerator)) { + if (m_sharedData.trainSettings->sample > 0.0f) { + if ((*m_downSampling)(m_sharedData.corpus->frequency[word - 1], m_randomGenerator)) { continue; // skip this word } } @@ -132,6 +126,9 @@ namespace w2v { inline void trainThread_t::cbow(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { // hidden layers initialized with 0 values std::memset(m_hiddenLayerVals->data(), 0, m_hiddenLayerVals->size() * sizeof(float)); @@ -187,6 +184,8 @@ namespace w2v { inline void trainThread_t::skipGram(const std::vector &_sentence, std::vector &_trainMatrix) noexcept { + if (_sentence.size() == 0) + return; for (std::size_t i = 0; i < _sentence.size(); ++i) { auto rndShift = m_rndWindowShift(m_randomGenerator); for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { diff --git a/src/word2vec/lib/trainThread.hpp b/src/word2vec/lib/trainThread.hpp index 63f5f17..f7af72a 100644 --- a/src/word2vec/lib/trainThread.hpp +++ b/src/word2vec/lib/trainThread.hpp @@ -19,7 +19,7 @@ #include "word2vec.hpp" //#include "wordReader.hpp" -#include "vocabulary.hpp" +//#include "vocabulary.hpp" #include "huffmanTree.hpp" #include "nsDistribution.hpp" #include "downSampling.hpp" @@ -41,7 +41,7 @@ namespace w2v { */ struct sharedData_t final { std::shared_ptr trainSettings; ///< trainSettings structure - std::shared_ptr vocabulary; ///< words data + //std::shared_ptr vocabulary; ///< words data std::shared_ptr corpus; ///< train data //std::shared_ptr fileMapper; /// NOTE: remove std::shared_ptr> bpWeights; ///< back propagation weights From 59ea5247f484d5b98b76955cf2263dfeb290eda1 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 11:17:25 +0900 Subject: [PATCH 15/26] Use for loop --- src/word2vec/lib/trainThread.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 6e15457..389a618 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -56,17 +56,21 @@ namespace w2v { void trainThread_t::worker(std::vector &_trainMatrix) noexcept { + Rcpp::Rcout << "Texts: " << range.first << " to " << range.second << "\n"; for (auto g = m_sharedData.trainSettings->iterations; g > 0; --g) { //Rcpp::Rcout << "g: " << (int)g << "\n"; - bool exitFlag = false; + //bool exitFlag = false; std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - std::size_t h = range.first; // NOTE: only used for corpus + //std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; - while (!exitFlag) { + //while (!exitFlag) { + //while (h <= range.second) { + for (std::size_t h = range.first; h <= range.second; h++) { + // calc alpha if (threadProcessedWords - prvThreadProcessedWords > wordsPerAlpha) { // next 0.01% processed *m_sharedData.processedWords += threadProcessedWords - prvThreadProcessedWords; @@ -86,10 +90,10 @@ namespace w2v { } } - if (h > range.second) { - exitFlag = true; // EOF or end of requested region - break; - } + // if (h > range.second) { + // exitFlag = true; // EOF or end of requested region + // break; + // } text_t text = m_sharedData.corpus->texts[h]; // read sentence @@ -119,7 +123,7 @@ namespace w2v { } else { cbow(sentence, _trainMatrix); } - h++; // move to next text + //h++; // move to next text } } } From 6bf7ee5e579d53a307e92fd58df735bb6c7f9d73 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 12:27:04 +0900 Subject: [PATCH 16/26] Clean up the code --- src/word2vec/lib/trainer.cpp | 44 ++++++++++++------------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 5836c55..8cecbaa 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -11,7 +11,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, + //const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { @@ -22,23 +22,17 @@ namespace w2v { } sharedData.trainSettings = _trainSettings; - if (!_vocabulary) { - throw std::runtime_error("vocabulary object is not initialized"); - } - sharedData.vocabulary = _vocabulary; + // if (!_vocabulary) { + // throw std::runtime_error("vocabulary object is not initialized"); + // } + // sharedData.vocabulary = _vocabulary; if (!_corpus) { throw std::runtime_error("corpus is object is not initialized"); } sharedData.corpus = _corpus; - // if (!_corpus && !_fileMapper) { - // throw std::runtime_error("corpus and file mapper objects are not initialized"); - // } - // sharedData.corpus = _corpus; - // sharedData.fileMapper = _fileMapper; - - sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _vocabulary->size(), 0.0f)); + sharedData.bpWeights.reset(new std::vector(_trainSettings->size * _corpus->types.size(), 0.0f)); sharedData.expTable.reset(new std::vector(_trainSettings->expTableSize)); for (uint16_t i = 0; i < _trainSettings->expTableSize; ++i) { // Precompute the exp() table @@ -50,9 +44,7 @@ namespace w2v { } if (_trainSettings->withHS) { - std::vector frequencies; - _vocabulary->frequencies(frequencies); - sharedData.huffmanTree.reset(new huffmanTree_t(frequencies));; + sharedData.huffmanTree.reset(new huffmanTree_t(_corpus->frequency));; } if (_progressCallback != nullptr) { @@ -62,15 +54,9 @@ namespace w2v { sharedData.processedWords.reset(new std::atomic(0)); sharedData.alpha.reset(new std::atomic(_trainSettings->alpha)); - // if (_corpus) { - // // NOTE : corpus has no sentence delimiter - // m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size() + 100; - // } else { - m_matrixSize = sharedData.trainSettings->size * sharedData.vocabulary->size(); - //} - //Rcpp::Rcout << "corpus->texts.size(): " << sharedData.corpus->texts.size() << "\n"; - //Rcpp::Rcout << "vocabulary->size(): " << sharedData.vocabulary->size() << "\n"; - //Rcpp::Rcout << "_trainSettings->threads: " << (int)_trainSettings->threads << "\n"; + // NOTE: consider setting size elsewhere + m_matrixSize = sharedData.trainSettings->size * sharedData.corpus->types.size(); + for (uint8_t i = 0; i < _trainSettings->threads; ++i) { // trainThread_t t(i, sharedData); // Rcpp::Rcout << "thread: " << (int)i << " from " << t.range.first << " to " << t.range.second << "\n"; @@ -86,15 +72,15 @@ namespace w2v { std::uniform_real_distribution rndMatrixInitializer(-0.005f, 0.005f); _trainMatrix.resize(m_matrixSize); std::generate(_trainMatrix.begin(), _trainMatrix.end(), [&]() { - float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); - return v; - //return rndMatrixInitializer(randomGenerator); + //float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); + //return v; + return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? }); - + for (auto &i:m_threads) { i->launch(_trainMatrix); } - + for (auto &i:m_threads) { i->join(); } From 76c5abcf8142b91be770620c276319ce6397eeeb Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 11 Feb 2024 12:27:50 +0900 Subject: [PATCH 17/26] Fix word and document index --- src/word2vec/lib/trainThread.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 389a618..76f6b16 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -18,9 +18,6 @@ namespace w2v { if (!m_sharedData.trainSettings) { throw std::runtime_error("train settings are not initialized"); } - // if (!m_sharedData.vocabulary) { - // throw std::runtime_error("vocabulary object is not initialized"); - // } if (m_sharedData.trainSettings->sample > 0.0f) { m_downSampling.reset(new downSampling_t(m_sharedData.trainSettings->sample, @@ -28,8 +25,6 @@ namespace w2v { } if (m_sharedData.trainSettings->negative > 0) { - //std::vector frequencies; - //m_sharedData.vocabulary->frequencies(frequencies); m_nsDistribution.reset(new nsDistribution_t(m_sharedData.corpus->frequency)); } @@ -69,7 +64,7 @@ namespace w2v { auto wordsPerAlpha = wordsPerAllThreads / 10000; //while (!exitFlag) { //while (h <= range.second) { - for (std::size_t h = range.first; h <= range.second; h++) { + for (std::size_t h = range.first; h <= range.second; ++h) { // calc alpha if (threadProcessedWords - prvThreadProcessedWords > wordsPerAlpha) { // next 0.01% processed @@ -99,11 +94,11 @@ namespace w2v { // read sentence std::vector sentence; sentence.reserve(text.size()); - for (size_t i = 0; i < text.size(); i++) { + for (size_t i = 0; i < text.size(); ++i) { auto &word = text[i]; - if (word == 0) { - continue; // padding + if (word == 0) { // padding + continue; } threadProcessedWords++; @@ -115,7 +110,7 @@ namespace w2v { } //if (h == 1) // Rcpp::Rcout << word << ": " << wordData->index << "\n"; - sentence.push_back(word); + sentence.push_back(word - 1); // zero-based index of words } if (m_sharedData.trainSettings->withSG) { @@ -161,13 +156,13 @@ namespace w2v { for (std::size_t j = 0; j < m_sharedData.trainSettings->size; j++) { (*m_hiddenLayerVals)[j] /= cw; } - + if (m_sharedData.trainSettings->withHS) { hierarchicalSoftmax(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } else { negativeSampling(_sentence[i], *m_hiddenLayerErrors, *m_hiddenLayerVals, 0); } - + // hidden -> in for (auto j = rndShift; j < m_sharedData.trainSettings->window * 2 + 1 - rndShift; ++j) { if (j == m_sharedData.trainSettings->window) { From 73c2fc17040c842b670216a47d4b7d412abe1f8a Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:22:21 +0900 Subject: [PATCH 18/26] Tidy up --- src/word2vec/lib/trainThread.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index 76f6b16..e7d85b4 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -108,8 +108,6 @@ namespace w2v { continue; // skip this word } } - //if (h == 1) - // Rcpp::Rcout << word << ": " << wordData->index << "\n"; sentence.push_back(word - 1); // zero-based index of words } @@ -118,7 +116,6 @@ namespace w2v { } else { cbow(sentence, _trainMatrix); } - //h++; // move to next text } } } From 6613fb4128254f2c9528fcbb2aeeaea09583f0c4 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:27:57 +0900 Subject: [PATCH 19/26] Tidy up --- src/word2vec/lib/word2vec.cpp | 46 +++++------------------------------ src/word2vec/lib/word2vec.hpp | 20 +++------------ 2 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 41a1234..7cbed6b 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -14,57 +14,23 @@ namespace w2v { bool w2vModel_t::train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - //const std::string &_trainFile, // NOTE: remove - //const std::string &_stopWordsFile, // NOTE: remove - //vocabularyProgressCallback_t _vocabularyProgressCallback, - //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept { try { // store tokens std::shared_ptr corpus(new corpus_t(_corpus)); - // map train data set file to memory - // std::shared_ptr trainWordsMapper; - // if (!_trainFile.empty()) { - // trainWordsMapper.reset(new fileMapper_t(_trainFile)); - // } - // // map stop-words file to memory - // std::shared_ptr stopWordsMapper; - // if (!_stopWordsFile.empty()) { - // stopWordsMapper.reset(new fileMapper_t(_stopWordsFile)); - // } - - // build vocabulary, skip stop-words and words with frequency < minWordFreq - //std::shared_ptr vocabulary; - // if (!_trainFile.empty()) { - // vocabulary.reset(new vocabulary_t(trainWordsMapper, - // stopWordsMapper, - // _trainSettings.wordDelimiterChars, - // _trainSettings.endOfSentenceChars, - // _trainSettings.minWordFreq, - // _vocabularyProgressCallback, - // _vocabularyStatsCallback)); - // } else { - // vocabulary.reset(new vocabulary_t(corpus, - // _trainSettings.minWordFreq, - // _vocabularyProgressCallback, - // _vocabularyStatsCallback)); - //} - // key words descending ordered by their indexes - //std::vector words; - //vocabulary->words(words); m_vectorSize = _trainSettings.size; m_mapSize = corpus->types.size(); - + + Rcpp::Rcout << "_trainSettings.size: " << _trainSettings.size << "\n"; + // train model - std::vector _trainMatrix; // NOTE: consider directly making m_map + std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), - //vocabulary, corpus, - //trainWordsMapper, // NOTE: remove _trainProgressCallback)(_trainMatrix); - //Rcpp::Rcout << "_trainMatrix: " << _trainMatrix.size() << "\n"; - + + // NOTE: directly make matrix from _trainMatrix std::size_t wordIndex = 0; for (auto const &i : corpus->types) { //Rcpp::Rcout << i << "\n"; diff --git a/src/word2vec/lib/word2vec.hpp b/src/word2vec/lib/word2vec.hpp index e2f2dfe..3485e3c 100644 --- a/src/word2vec/lib/word2vec.hpp +++ b/src/word2vec/lib/word2vec.hpp @@ -44,9 +44,7 @@ namespace w2v { corpus_t(): texts() {} corpus_t(texts_t _texts, types_t _types): texts(_texts), types(_types) {} - //corpus_t(texts_t _texts, words_t _stopWords): - // texts(_texts), stopWords(_stopWords) {} - + void setWordFreq() { frequency = frequency_t(types.size(), 0); @@ -72,6 +70,8 @@ namespace w2v { } Rcpp::Rcout << "trainWords: " << trainWords << "\n"; Rcpp::Rcout << "totalWords: " << totalWords << "\n"; + Rcpp::Rcout << "frequency.size(): " << frequency.size() << "\n"; + Rcpp::Rcout << "types.size(): " << types.size() << "\n"; } }; @@ -91,6 +91,7 @@ namespace w2v { uint8_t iterations = 5; ///< train iterations float alpha = 0.05f; ///< starting learn rate bool withSG = false; ///< use Skip-Gram instead of CBOW + // TODO: remove std::string wordDelimiterChars = " \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r"; std::string endOfSentenceChars = ".\n?!"; trainSettings_t() = default; @@ -205,11 +206,6 @@ namespace w2v { /// Direct access to the word-vector map const map_t &map() {return m_map;} - /// pure virtual method to save model of a derived class - //virtual bool save(const std::string &_modelFile) const noexcept = 0; - /// pure virtual method to load model of a derived class - //virtual bool load(const std::string &_modelFile, bool normalize = true) noexcept = 0; - /** * Vector access by key value * @param _key key value uniquely identifying vector in model @@ -327,16 +323,8 @@ namespace w2v { */ bool train(const trainSettings_t &_trainSettings, const corpus_t &_corpus, - //const std::string &_trainFile, // NOTE: remove - //const std::string &_stopWordsFile, // NOTE: remove - //vocabularyProgressCallback_t _vocabularyProgressCallback, - //vocabularyStatsCallback_t _vocabularyStatsCallback, trainProgressCallback_t _trainProgressCallback) noexcept; - /// saves word vectors to file with _modelFile name - //bool save(const std::string &_modelFile) const noexcept override; - /// loads word vectors from file with _modelFile name - //bool load(const std::string &_modelFile, bool normalize = true) noexcept override; /** * Normalise vectors */ From 76860c8e21eb236189ae2a556d3a9e3ef31b8318 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:47:51 +0900 Subject: [PATCH 20/26] Tidy up --- src/word2vec/lib/trainer.cpp | 16 +++------------- src/word2vec/lib/trainer.hpp | 4 ---- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/src/word2vec/lib/trainer.cpp b/src/word2vec/lib/trainer.cpp index 8cecbaa..550e729 100644 --- a/src/word2vec/lib/trainer.cpp +++ b/src/word2vec/lib/trainer.cpp @@ -11,9 +11,7 @@ namespace w2v { trainer_t::trainer_t(const std::shared_ptr &_trainSettings, - //const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback): m_threads() { trainThread_t::sharedData_t sharedData; @@ -22,11 +20,6 @@ namespace w2v { } sharedData.trainSettings = _trainSettings; - // if (!_vocabulary) { - // throw std::runtime_error("vocabulary object is not initialized"); - // } - // sharedData.vocabulary = _vocabulary; - if (!_corpus) { throw std::runtime_error("corpus is object is not initialized"); } @@ -58,11 +51,8 @@ namespace w2v { m_matrixSize = sharedData.trainSettings->size * sharedData.corpus->types.size(); for (uint8_t i = 0; i < _trainSettings->threads; ++i) { - // trainThread_t t(i, sharedData); - // Rcpp::Rcout << "thread: " << (int)i << " from " << t.range.first << " to " << t.range.second << "\n"; m_threads.emplace_back(new trainThread_t(i, sharedData)); } - //throw std::runtime_error("m_threads.emplace_back()"); } void trainer_t::operator()(std::vector &_trainMatrix) noexcept { @@ -72,9 +62,9 @@ namespace w2v { std::uniform_real_distribution rndMatrixInitializer(-0.005f, 0.005f); _trainMatrix.resize(m_matrixSize); std::generate(_trainMatrix.begin(), _trainMatrix.end(), [&]() { - //float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); - //return v; - return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? + float v = (float)(Rcpp::runif(1, -0.005f, 0.005f)[0]); + return v; + //return rndMatrixInitializer(randomGenerator); // NOTE:: pass random number seed? }); for (auto &i:m_threads) { diff --git a/src/word2vec/lib/trainer.hpp b/src/word2vec/lib/trainer.hpp index fcfba3b..30bc26f 100644 --- a/src/word2vec/lib/trainer.hpp +++ b/src/word2vec/lib/trainer.hpp @@ -15,8 +15,6 @@ #include #include "word2vec.hpp" -//#include "wordReader.hpp" -#include "vocabulary.hpp" #include "trainThread.hpp" namespace w2v { @@ -40,9 +38,7 @@ namespace w2v { * @param _progressCallback callback function to be called on each new 0.01% processed train data */ trainer_t(const std::shared_ptr &_trainSettings, - const std::shared_ptr &_vocabulary, const std::shared_ptr &_corpus, - //const std::shared_ptr &_fileMapper, // NOTE: remove std::function _progressCallback); /** From 160f923580da93f8734b11fe1b4f2a1db337ef76 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:51:31 +0900 Subject: [PATCH 21/26] Tidy up --- src/word2vec/lib/word2vec.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/word2vec/lib/word2vec.cpp b/src/word2vec/lib/word2vec.cpp index 7cbed6b..0699812 100644 --- a/src/word2vec/lib/word2vec.cpp +++ b/src/word2vec/lib/word2vec.cpp @@ -22,8 +22,6 @@ namespace w2v { m_vectorSize = _trainSettings.size; m_mapSize = corpus->types.size(); - Rcpp::Rcout << "_trainSettings.size: " << _trainSettings.size << "\n"; - // train model std::vector _trainMatrix; trainer_t(std::make_shared(_trainSettings), From f057c1a0fdb4d9c20735859a0442a430fcdda16a Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 08:55:28 +0900 Subject: [PATCH 22/26] Tidy up --- src/word2vec/lib/trainThread.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/word2vec/lib/trainThread.cpp b/src/word2vec/lib/trainThread.cpp index e7d85b4..ad52cc5 100644 --- a/src/word2vec/lib/trainThread.cpp +++ b/src/word2vec/lib/trainThread.cpp @@ -51,19 +51,15 @@ namespace w2v { void trainThread_t::worker(std::vector &_trainMatrix) noexcept { - Rcpp::Rcout << "Texts: " << range.first << " to " << range.second << "\n"; for (auto g = m_sharedData.trainSettings->iterations; g > 0; --g) { - //Rcpp::Rcout << "g: " << (int)g << "\n"; - //bool exitFlag = false; + std::size_t threadProcessedWords = 0; std::size_t prvThreadProcessedWords = 0; - //std::size_t h = range.first; // NOTE: only used for corpus auto wordsPerAllThreads = m_sharedData.trainSettings->iterations * m_sharedData.corpus->trainWords; auto wordsPerAlpha = wordsPerAllThreads / 10000; - //while (!exitFlag) { - //while (h <= range.second) { + for (std::size_t h = range.first; h <= range.second; ++h) { // calc alpha @@ -85,10 +81,6 @@ namespace w2v { } } - // if (h > range.second) { - // exitFlag = true; // EOF or end of requested region - // break; - // } text_t text = m_sharedData.corpus->texts[h]; // read sentence From c6cb07a4d1084df685b4d1ec5d0a69f53558184f Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:00:41 +0900 Subject: [PATCH 23/26] Build --- R/RcppExports.R | 12 ++---------- src/RcppExports.cpp | 34 ++++----------------------------- src/word2vec/lib/CMakeLists.txt | 4 ++-- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index d1533a5..d6c90b8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,16 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -w2v_train <- function(texts_, stopWords_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { - .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) -} - -w2v_load_model <- function(file, normalize = TRUE) { - .Call('_word2vec_w2v_load_model', PACKAGE = 'word2vec', file, normalize) -} - -w2v_save_model <- function(ptr, file) { - .Call('_word2vec_w2v_save_model', PACKAGE = 'word2vec', ptr, file) +w2v_train <- function(texts_, types_, modelFile = "", minWordFreq = 5L, size = 100L, window = 5L, expTableSize = 1000L, expValueMax = 6L, sample = 0.001, withHS = FALSE, negative = 5L, threads = 1L, iterations = 5L, alpha = 0.05, withSG = FALSE, verbose = FALSE, normalize = TRUE) { + .Call('_word2vec_w2v_train', PACKAGE = 'word2vec', texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize) } w2v_dictionary <- function(ptr) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c8bfcda..ebd9209 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,13 +11,13 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // w2v_train -Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector stopWords_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); -RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP stopWords_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { +Rcpp::List w2v_train(Rcpp::List texts_, Rcpp::CharacterVector types_, std::string modelFile, uint16_t minWordFreq, uint16_t size, uint8_t window, uint16_t expTableSize, uint8_t expValueMax, float sample, bool withHS, uint8_t negative, uint8_t threads, uint8_t iterations, float alpha, bool withSG, bool verbose, bool normalize); +RcppExport SEXP _word2vec_w2v_train(SEXP texts_SEXP, SEXP types_SEXP, SEXP modelFileSEXP, SEXP minWordFreqSEXP, SEXP sizeSEXP, SEXP windowSEXP, SEXP expTableSizeSEXP, SEXP expValueMaxSEXP, SEXP sampleSEXP, SEXP withHSSEXP, SEXP negativeSEXP, SEXP threadsSEXP, SEXP iterationsSEXP, SEXP alphaSEXP, SEXP withSGSEXP, SEXP verboseSEXP, SEXP normalizeSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< Rcpp::List >::type texts_(texts_SEXP); - Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type stopWords_(stopWords_SEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type types_(types_SEXP); Rcpp::traits::input_parameter< std::string >::type modelFile(modelFileSEXP); Rcpp::traits::input_parameter< uint16_t >::type minWordFreq(minWordFreqSEXP); Rcpp::traits::input_parameter< uint16_t >::type size(sizeSEXP); @@ -33,31 +33,7 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type withSG(withSGSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, stopWords_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_load_model -Rcpp::List w2v_load_model(std::string file, bool normalize); -RcppExport SEXP _word2vec_w2v_load_model(SEXP fileSEXP, SEXP normalizeSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - Rcpp::traits::input_parameter< bool >::type normalize(normalizeSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_load_model(file, normalize)); - return rcpp_result_gen; -END_RCPP -} -// w2v_save_model -bool w2v_save_model(SEXP ptr, std::string file); -RcppExport SEXP _word2vec_w2v_save_model(SEXP ptrSEXP, SEXP fileSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< SEXP >::type ptr(ptrSEXP); - Rcpp::traits::input_parameter< std::string >::type file(fileSEXP); - rcpp_result_gen = Rcpp::wrap(w2v_save_model(ptr, file)); + rcpp_result_gen = Rcpp::wrap(w2v_train(texts_, types_, modelFile, minWordFreq, size, window, expTableSize, expValueMax, sample, withHS, negative, threads, iterations, alpha, withSG, verbose, normalize)); return rcpp_result_gen; END_RCPP } @@ -128,8 +104,6 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_word2vec_w2v_train", (DL_FUNC) &_word2vec_w2v_train, 17}, - {"_word2vec_w2v_load_model", (DL_FUNC) &_word2vec_w2v_load_model, 2}, - {"_word2vec_w2v_save_model", (DL_FUNC) &_word2vec_w2v_save_model, 2}, {"_word2vec_w2v_dictionary", (DL_FUNC) &_word2vec_w2v_dictionary, 1}, {"_word2vec_w2v_embedding", (DL_FUNC) &_word2vec_w2v_embedding, 2}, {"_word2vec_w2v_nearest", (DL_FUNC) &_word2vec_w2v_nearest, 4}, diff --git a/src/word2vec/lib/CMakeLists.txt b/src/word2vec/lib/CMakeLists.txt index c15279d..ad6c414 100644 --- a/src/word2vec/lib/CMakeLists.txt +++ b/src/word2vec/lib/CMakeLists.txt @@ -18,8 +18,8 @@ set(PRJ_SRCS # ${PROJECT_INCLUDE_DIR}/word2vec.h # ${PROJECT_SOURCE_DIR}/c_binding.cpp ${PROJECT_SOURCE_DIR}/mapper.cpp - ${PROJECT_SOURCE_DIR}/vocabulary.hpp - ${PROJECT_SOURCE_DIR}/vocabulary.cpp +# ${PROJECT_SOURCE_DIR}/vocabulary.hpp +# ${PROJECT_SOURCE_DIR}/vocabulary.cpp ${PROJECT_SOURCE_DIR}/huffmanTree.hpp ${PROJECT_SOURCE_DIR}/huffmanTree.cpp ${PROJECT_SOURCE_DIR}/nsDistribution.hpp From 6f41e4dcaba9ac49a5402cd6ab1ffba828a25a4a Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:01:06 +0900 Subject: [PATCH 24/26] Update --- tests/test.R | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test.R b/tests/test.R index e232226..8f524ca 100644 --- a/tests/test.R +++ b/tests/test.R @@ -3,29 +3,29 @@ library(word2vec) corp <- data_corpus_inaugural %>% corpus_reshape() -toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) -lis <- unclass(toks) +toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% + tokens_remove(stopwords(), padding = TRUE) %>% + tokens_tolower() +ndoc(toks) -type <- types(toks) -type[type %in% stopwords()] <- "" -mod <- word2vec:::w2v_train(toks, type, verbose = TRUE) +mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, + iterations = 5, minWordFreq = 5) dim(as.matrix(mod)) +predict(mod, c("people", "american"), type = "nearest") -mod2 <- word2vec:::w2v_train(unclass(toks)[1:10], types(toks), verbose = TRUE) -dim(as.matrix(mod2)) +require(LSX) +lss <- as.textmodel_lss(t(as.matrix(mod)), "good") +head(coef(lss)) +tail(coef(lss)) +lis <- as.list(toks) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4) emb_lis <- as.matrix(mod_lis) dim(emb_lis) -predict(mod_lis, c("people", "American"), type = "nearest") - -mod_txt <- word2vec(txt, dim = 50, iter = 5, split = c("[ \n]", "\n"), min_count = 5, - verbose = TRUE, threads = 4) -emb_txt <- as.matrix(mod_txt) -dim(emb_txt) -predict(mod_txt, c("people", "American"), type = "nearest") +pred_lis <- predict(mod_lis, c("people", "American"), type = "nearest") +#saveRDS(mod_lis, "tests/word2vec_v04.RDS") microbenchmark::microbenchmark( "lis" = word2vec(lis, dim = 50, iter = 5, min_count = 5, From a0ed2ef66c15fba83248269a9537eb28ccaa819c Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:16:00 +0900 Subject: [PATCH 25/26] Update --- tests/test.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test.R b/tests/test.R index 8f524ca..181791c 100644 --- a/tests/test.R +++ b/tests/test.R @@ -1,7 +1,9 @@ library(quanteda) library(word2vec) -corp <- data_corpus_inaugural %>% +data_corpus_guardian <- readRDS("/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds") +corp <- data_corpus_guardian %>% +#corp <- data_corpus_inaugural %>% corpus_reshape() toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% tokens_remove(stopwords(), padding = TRUE) %>% @@ -9,7 +11,7 @@ toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% ndoc(toks) mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, - iterations = 5, minWordFreq = 5) + iterations = 5, minWordFreq = 5, threads = 6) dim(as.matrix(mod)) predict(mod, c("people", "american"), type = "nearest") From 6e08bc5c5c06081698c8b3234d3ac3f6b77262ec Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 12 Feb 2024 09:28:02 +0900 Subject: [PATCH 26/26] Update --- tests/test.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test.R b/tests/test.R index 181791c..e081c84 100644 --- a/tests/test.R +++ b/tests/test.R @@ -1,5 +1,6 @@ library(quanteda) library(word2vec) +library(LSX) data_corpus_guardian <- readRDS("/home/kohei/Dropbox/Public/data_corpus_guardian2016-10k.rds") corp <- data_corpus_guardian %>% @@ -15,11 +16,16 @@ mod <- word2vec:::w2v_train(toks, types(toks), verbose = TRUE, size = 300, dim(as.matrix(mod)) predict(mod, c("people", "american"), type = "nearest") -require(LSX) -lss <- as.textmodel_lss(t(as.matrix(mod)), "good") +dfmt <- dfm(toks, remove_padding = TRUE) %>% + dfm_trim(min_termfreq = 5) +lss <- textmodel_lss(dfmt, c("good" = 1, "bad" = -1), cache = TRUE) head(coef(lss)) tail(coef(lss)) +lss2 <- as.textmodel_lss(t(as.matrix(mod)), c("good" = 1, "bad" = -1)) +head(coef(lss2)) +tail(coef(lss2)) + lis <- as.list(toks) mod_lis <- word2vec(lis, dim = 50, iter = 5, min_count = 5, verbose = TRUE, threads = 4)