diff --git a/R/Cluster.R b/R/Cluster.R index 1b11203..1e528e2 100644 --- a/R/Cluster.R +++ b/R/Cluster.R @@ -176,6 +176,7 @@ example_path <- function(file = NULL) { return(path) } + #' Read Count #' #' @export diff --git a/R/RcppExports.R b/R/RcppExports.R index 2ed5e48..48433bc 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,6 +9,10 @@ WriteColumnFile <- function(xPosition, yPosition, data, cutoff, countTable, save invisible(.Call('_clustur_WriteColumnFile', PACKAGE = 'clustur', xPosition, yPosition, data, cutoff, countTable, saveLocation)) } +DetermineIfPhylipOrColumnFile <- function(filePath) { + .Call('_clustur_DetermineIfPhylipOrColumnFile', PACKAGE = 'clustur', filePath) +} + ProcessDistanceFiles <- function(filePath, countTable, cutoff, isSim) { .Call('_clustur_ProcessDistanceFiles', PACKAGE = 'clustur', filePath, countTable, cutoff, isSim) } diff --git a/src/Adapters/CountTableAdapter.h b/src/Adapters/CountTableAdapter.h index f8bcfdc..58ae96a 100644 --- a/src/Adapters/CountTableAdapter.h +++ b/src/Adapters/CountTableAdapter.h @@ -27,11 +27,13 @@ class CountTableAdapter { Rcpp::DataFrame GetCountTable() const {return countTable;} Rcpp::DataFrame ReCreateDataFrame() const; private: + void CreateNameToIndex(); struct IndexAbundancePair { int groupIndex; int sequenceIndex; double abundance; }; + std::unordered_map nameToRowIndex; std::vector sampleNames; std::unordered_map> dataFrameMap; std::vector groups; diff --git a/src/CountTableAdapter.cpp b/src/CountTableAdapter.cpp index e48ee2c..9a2968e 100644 --- a/src/CountTableAdapter.cpp +++ b/src/CountTableAdapter.cpp @@ -29,6 +29,7 @@ bool CountTableAdapter::CreateDataFrameMap(const Rcpp::DataFrame &countTable) { // We only want the actual group names. so everything after groups.insert(groups.end(), columnNames.begin() + 2, columnNames.end()); this->countTable = countTable; + CreateNameToIndex(); return true; } @@ -86,29 +87,27 @@ bool CountTableAdapter::CreateDataFrameMapFromSparseCountTable(const Rcpp::DataF dataFrameMap = data; // In a count table, the first to columns are the sequence and the total abundance. // We only want the actual group names. so everything after - this->countTable = countTable; + CreateNameToIndex(); return true; } double CountTableAdapter::FindAbundanceBasedOnGroup(const std::string &group, const std::string &sampleName) const { - if (std::find(groups.begin(), groups.end(), group) == groups.end()) - return -1; //Not Found, may need to throw and execption... - if (std::find(sampleNames.begin(), sampleNames.end(), sampleName) == sampleNames.end()) - return -1; //Not Found, may need to throw and execption... + // We will preprocess the find during hte read dist process. So remove special checks + // - Protip hashmap find is faster than vector + if(nameToRowIndex.find(sampleName) == nameToRowIndex.end()) + return -1; const std::vector groupCol = GetColumnByName(group); - const long index = std::distance(sampleNames.begin(), std::find(sampleNames.begin(), - sampleNames.end(), sampleName)); - return dataFrameMap.at(group)[index]; + return dataFrameMap.at(group)[nameToRowIndex.at(sampleName)]; } double CountTableAdapter::FindTotalAbundance(const std::string &sampleName) const { - if(std::find(sampleNames.begin(), sampleNames.end(), sampleName) == sampleNames.end()) - return -1; // Not found - const long index = std::distance(sampleNames.begin(), std::find(sampleNames.begin(), - sampleNames.end(), sampleName)); - return dataFrameMap.at("total")[index]; + // We will preprocess the find during hte read dist process. So remove special checks + // - Protip hashmap find is faster than vector + if(nameToRowIndex.find(sampleName) == nameToRowIndex.end()) + return -1; + return dataFrameMap.at("total")[nameToRowIndex.at(sampleName)]; } std::string CountTableAdapter::GetNameByIndex(const int index) const { @@ -148,6 +147,13 @@ Rcpp::DataFrame CountTableAdapter::ReCreateDataFrame() const { return countTable; } + +void CountTableAdapter::CreateNameToIndex() { + for(size_t i = 0; i < sampleNames.size(); i++) { + nameToRowIndex[sampleNames[i]] = i; + } +} + // Gets every column but the first column (the sequence names) std::vector CountTableAdapter::GetColumnByName(const std::string &name) const { if (dataFrameMap.find(name) != dataFrameMap.end()) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2115726..c7a340b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -40,16 +40,27 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// DetermineIfPhylipOrColumnFile +bool DetermineIfPhylipOrColumnFile(const std::string& filePath); +RcppExport SEXP _clustur_DetermineIfPhylipOrColumnFile(SEXP filePathSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type filePath(filePathSEXP); + rcpp_result_gen = Rcpp::wrap(DetermineIfPhylipOrColumnFile(filePath)); + return rcpp_result_gen; +END_RCPP +} // ProcessDistanceFiles -SEXP ProcessDistanceFiles(const std::string& filePath, const Rcpp::DataFrame& countTable, double cutoff, bool isSim); +SEXP ProcessDistanceFiles(const std::string& filePath, const Rcpp::DataFrame& countTable, const double cutoff, const bool isSim); RcppExport SEXP _clustur_ProcessDistanceFiles(SEXP filePathSEXP, SEXP countTableSEXP, SEXP cutoffSEXP, SEXP isSimSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::string& >::type filePath(filePathSEXP); Rcpp::traits::input_parameter< const Rcpp::DataFrame& >::type countTable(countTableSEXP); - Rcpp::traits::input_parameter< double >::type cutoff(cutoffSEXP); - Rcpp::traits::input_parameter< bool >::type isSim(isSimSEXP); + Rcpp::traits::input_parameter< const double >::type cutoff(cutoffSEXP); + Rcpp::traits::input_parameter< const bool >::type isSim(isSimSEXP); rcpp_result_gen = Rcpp::wrap(ProcessDistanceFiles(filePath, countTable, cutoff, isSim)); return rcpp_result_gen; END_RCPP @@ -132,6 +143,7 @@ RcppExport SEXP run_testthat_tests(SEXP); static const R_CallMethodDef CallEntries[] = { {"_clustur_WritePhylipFile", (DL_FUNC) &_clustur_WritePhylipFile, 6}, {"_clustur_WriteColumnFile", (DL_FUNC) &_clustur_WriteColumnFile, 6}, + {"_clustur_DetermineIfPhylipOrColumnFile", (DL_FUNC) &_clustur_DetermineIfPhylipOrColumnFile, 1}, {"_clustur_ProcessDistanceFiles", (DL_FUNC) &_clustur_ProcessDistanceFiles, 4}, {"_clustur_ProcessSparseMatrix", (DL_FUNC) &_clustur_ProcessSparseMatrix, 6}, {"_clustur_GetDistanceDataFrame", (DL_FUNC) &_clustur_GetDistanceDataFrame, 1}, diff --git a/src/main.cpp b/src/main.cpp index d940c91..9fe75c1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,6 @@ #include "MothurDependencies/ColumnDistanceMatrixReader.h" #include "MothurDependencies/SharedFileBuilder.h" #include "Adapters/DistanceFileReader.h" -#include "Tests/OptimatrixAdapterTestFixture.h" #if DEBUG_RCPP #include #include @@ -53,9 +52,8 @@ Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const } - //[[Rcpp::export]] -SEXP ProcessDistanceFiles(const std::string& filePath, const Rcpp::DataFrame& countTable, double cutoff, bool isSim) { +bool DetermineIfPhylipOrColumnFile(const std::string& filePath) { std::fstream data(filePath); std::unordered_map map; map[true] = "This is a phylip file. Processing now..."; @@ -77,19 +75,26 @@ SEXP ProcessDistanceFiles(const std::string& filePath, const Rcpp::DataFrame& co isPhylip = false; Rcpp::Rcout << map[isPhylip] << "\n"; data.close(); + return isPhylip; +} + +//[[Rcpp::export]] +SEXP ProcessDistanceFiles(const std::string& filePath, const Rcpp::DataFrame& countTable, const double cutoff, + const bool isSim) { + const bool isPhylip = DetermineIfPhylipOrColumnFile(filePath); CountTableAdapter adapter; adapter.CreateDataFrameMap(countTable); if(isPhylip) { DistanceFileReader* read = new ReadPhylipMatrix(cutoff, isSim); - std::vector rowDataMatrix = read->ReadToRowData(filePath); + const std::vector rowDataMatrix = read->ReadToRowData(filePath); read->SetCountTable(adapter); read->SetRowDataMatrix(rowDataMatrix); read->ReadRowDataMatrix(rowDataMatrix); return Rcpp::XPtr(read); } DistanceFileReader* read = new ColumnDistanceMatrixReader(cutoff, isSim); - std::vector rowDataMatrix = read->ReadToRowData(adapter, filePath); + const std::vector rowDataMatrix = read->ReadToRowData(adapter, filePath); read->SetCountTable(adapter); read->SetRowDataMatrix(rowDataMatrix); read->ReadRowDataMatrix(rowDataMatrix); diff --git a/tests/testthat/extdata/sparse_matrix_data.RDS b/tests/testthat/extdata/sparse_matrix_data.RDS deleted file mode 100644 index 572b262..0000000 Binary files a/tests/testthat/extdata/sparse_matrix_data.RDS and /dev/null differ diff --git a/tests/testthat/test-test-opticluster.R b/tests/testthat/test-test-opticluster.R index c89b93a..dfc9582 100644 --- a/tests/testthat/test-test-opticluster.R +++ b/tests/testthat/test-test-opticluster.R @@ -140,6 +140,16 @@ test_that("Read dist can read column and phylip files", { expect_true(nrow(get_distance_data_frame(distance_data_phylip)) == 9604) }) + +test_that("We can determine if a file is phylip or not", { + is_not_phylip <- + DetermineIfPhylipOrColumnFile(test_path("extdata", "amazon_column.dist")) + is_phylip <- + DetermineIfPhylipOrColumnFile(test_path("extdata", "amazon_phylip.dist")) + expect_true(is_phylip) + expect_false(is_not_phylip) +}) + test_that("Validate Count Table returns a valid count table", { count_table <- read.delim(test_path("extdata", "amazon.count_table")) validated_count_table <- validate_count_table(count_table)