diff --git a/CMash/GroundTruth.py b/CMash/GroundTruth.py index 8493361..8365245 100644 --- a/CMash/GroundTruth.py +++ b/CMash/GroundTruth.py @@ -10,6 +10,7 @@ import screed from argparse import ArgumentTypeError import multiprocessing + # The following is for ease of development (so I don't need to keep re-installing the tool) try: from CMash import MinHash as MH @@ -23,12 +24,12 @@ from CMash import MinHash as MH from CMash import Query - # FIXME: Make sure I am *identifying* the rc k-mers with the k-mers and not counting them as distinct entities notACTG = re.compile('[^ACTG]') # look for any not ACTG + class TrueContainment: """ This class has functionality to compute the ground truth containment indicies and return them in the same format @@ -36,6 +37,7 @@ class TrueContainment: 1. Small training databases 2. Databases that were formed using genomes that you have direct access to (i.e. live on your file system) """ + def __init__(self, training_database_file: str, k_sizes: str): self.training_database_file = training_database_file self.k_sizes = self.__parseNumList(k_sizes) @@ -80,7 +82,7 @@ def __kmers(seq, ksize): for i in range(len(seq) - ksize + 1): yield seq[i:i + ksize] - def __return_ksize_to_kmers(self, input_file): + def __return_ksize_to_kmers(self, input_file: str) -> dict: k_sizes = self.k_sizes k_size_to_kmers = dict() # initialize all the k-mer sizes for the query file @@ -107,7 +109,8 @@ def __return_ksize_to_kmers(self, input_file): for kmer in self.__kmers(seq, k_size): if kmer: k_size_to_kmers[k_size].add(kmer) # add the kmer - k_size_to_kmers[k_size].add(khmer.reverse_complement(kmer)) # add the reverse complement + k_size_to_kmers[k_size].add( + khmer.reverse_complement(kmer)) # add the reverse complement return k_size_to_kmers @staticmethod @@ -119,24 +122,55 @@ def __compute_all_training_kmers(self): num_threads = multiprocessing.cpu_count() pool = multiprocessing.Pool(processes=num_threads) # res is returned in the same order as self.training_file_names according to the docs - res = pool.map(self.__return_ksize_to_kmers, self.training_file_names) + #res = pool.map(self.__return_ksize_to_kmers, self.training_file_names) + res = map(self.__return_ksize_to_kmers, self.training_file_names) for (item, file_name) in zip(res, self.training_file_names): training_file_to_ksize_to_kmers[file_name] = item pool.close() return training_file_to_ksize_to_kmers - def return_data_frame(self, query_file: str) -> pd.DataFrame: - pass - - - - - + def __return_containment_indicies(self, query_file: str) -> np.ndarray: + training_file_names = self.training_file_names + k_sizes = self.k_sizes + training_file_to_ksize_to_kmers = self.training_file_to_ksize_to_kmers + num_files = len(training_file_names) + # rows are the files, columns are the k-mer sizes + containment_indicies = np.zeros((num_files, len(k_sizes))) + # if the query file is part of the training files, then nothing extra to do + if query_file in training_file_names: + for (j, k_size) in enumerate(k_sizes): + query_kmers = training_file_to_ksize_to_kmers[query_file][k_size] + for (i, file_name) in enumerate(training_file_names): + training_kmers = training_file_to_ksize_to_kmers[file_name][k_size] + # | train \cap query| / | train | + containment_indicies[i, j] = self.__return_containment_index(training_kmers, query_kmers) + else: + # need to compute the k-mers in the query file + query_file_to_ksize_to_kmers = self.__return_ksize_to_kmers(query_file) + for (j, k_size) in enumerate(k_sizes): + query_kmers = query_file_to_ksize_to_kmers[query_file][k_size] + for (i, file_name) in enumerate(training_file_names): + training_kmers = training_file_to_ksize_to_kmers[file_name][k_size] + # | train \cap query| / | train | + containment_indicies[i, j] = self.__return_containment_index(training_kmers, query_kmers) + return containment_indicies + + def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame: + k_range = self.k_sizes + training_file_names = self.training_file_names + containment_indices = self.__return_containment_indicies(query_file) + df = Query.return_data_frame(training_file_names=training_file_names, + k_range=k_range, + location_of_thresh=location_of_thresh, + containment_indices=containment_indices, + coverage_threshold=coverage_threshold) + return df def main(): training_database_file = "/home/dkoslicki/Desktop/CMash/tests/script_tests/TrainingDatabase.h5" query_file = "/home/dkoslicki/Desktop/CMash/tests/Organisms/taxid_1192839_4_genomic.fna.gz" + if __name__ == "__main__": - main() \ No newline at end of file + main()