Skip to content

Commit

Permalink
GroundTruth: everything works in serial, now to debug parallel. #14
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Mar 27, 2020
1 parent 7555e49 commit 1bf1b13
Showing 1 changed file with 46 additions and 12 deletions.
58 changes: 46 additions & 12 deletions CMash/GroundTruth.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import screed
from argparse import ArgumentTypeError
import multiprocessing

# The following is for ease of development (so I don't need to keep re-installing the tool)
try:
from CMash import MinHash as MH
Expand All @@ -23,19 +24,20 @@
from CMash import MinHash as MH
from CMash import Query


# FIXME: Make sure I am *identifying* the rc k-mers with the k-mers and not counting them as distinct entities


notACTG = re.compile('[^ACTG]') # look for any not ACTG


class TrueContainment:
"""
This class has functionality to compute the ground truth containment indices and return them in the same format
as the scripts (to ease future testing). It is **only** intended for:
1. Small training databases
2. Databases that were formed using genomes that you have direct access to (i.e. live on your file system)
"""

def __init__(self, training_database_file: str, k_sizes: str):
self.training_database_file = training_database_file
self.k_sizes = self.__parseNumList(k_sizes)
Expand Down Expand Up @@ -80,7 +82,7 @@ def __kmers(seq, ksize):
for i in range(len(seq) - ksize + 1):
yield seq[i:i + ksize]

def __return_ksize_to_kmers(self, input_file):
def __return_ksize_to_kmers(self, input_file: str) -> dict:
k_sizes = self.k_sizes
k_size_to_kmers = dict()
# initialize all the k-mer sizes for the query file
Expand All @@ -107,7 +109,8 @@ def __return_ksize_to_kmers(self, input_file):
for kmer in self.__kmers(seq, k_size):
if kmer:
k_size_to_kmers[k_size].add(kmer) # add the kmer
k_size_to_kmers[k_size].add(khmer.reverse_complement(kmer)) # add the reverse complement
k_size_to_kmers[k_size].add(
khmer.reverse_complement(kmer)) # add the reverse complement
return k_size_to_kmers

@staticmethod
Expand All @@ -119,24 +122,55 @@ def __compute_all_training_kmers(self):
num_threads = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=num_threads)
# res is returned in the same order as self.training_file_names according to the docs
res = pool.map(self.__return_ksize_to_kmers, self.training_file_names)
#res = pool.map(self.__return_ksize_to_kmers, self.training_file_names)
res = map(self.__return_ksize_to_kmers, self.training_file_names)
for (item, file_name) in zip(res, self.training_file_names):
training_file_to_ksize_to_kmers[file_name] = item
pool.close()
return training_file_to_ksize_to_kmers

def return_data_frame(self, query_file: str) -> pd.DataFrame:
    """
    Placeholder with no implementation yet: performs no work and returns None.

    :param query_file: currently unused
    :return: None (the DataFrame annotation describes the intended, not the actual, result)
    """
    return None





def __return_containment_indicies(self, query_file: str) -> np.ndarray:
    """
    Compute the containment index of the query against every training file, for every k-mer size.

    :param query_file: name of the query file. If it is one of ``self.training_file_names`` its already
        computed k-mer sets are reused; otherwise they are computed with ``__return_ksize_to_kmers``.
    :return: array with one row per training file and one column per k-mer size; entry ``[i, j]`` is the
        value ``__return_containment_index`` gives for training file ``i`` and ``self.k_sizes[j]``.
    """
    training_file_names = self.training_file_names
    k_sizes = self.k_sizes
    training_file_to_ksize_to_kmers = self.training_file_to_ksize_to_kmers

    if query_file in training_file_names:
        # the query is a training file, so its k-mers were already computed
        query_ksize_to_kmers = training_file_to_ksize_to_kmers[query_file]
    else:
        # __return_ksize_to_kmers returns a dict keyed directly by k-mer size (not by file name),
        # so it must NOT be indexed with query_file first
        query_ksize_to_kmers = self.__return_ksize_to_kmers(query_file)

    # rows are the files, columns are the k-mer sizes
    containment_indicies = np.zeros((len(training_file_names), len(k_sizes)))
    for j, k_size in enumerate(k_sizes):
        query_kmers = query_ksize_to_kmers[k_size]
        for i, file_name in enumerate(training_file_names):
            training_kmers = training_file_to_ksize_to_kmers[file_name][k_size]
            # | train \cap query| / | train |  (per the original author's note)
            containment_indicies[i, j] = self.__return_containment_index(training_kmers, query_kmers)
    return containment_indicies

def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame:
    """
    Compute the containment indices for ``query_file`` and hand them to ``Query.return_data_frame``.

    :param query_file: file whose containment indices against the training files are computed
    :param location_of_thresh: forwarded unchanged to ``Query.return_data_frame``
    :param coverage_threshold: forwarded unchanged to ``Query.return_data_frame``
    :return: the object produced by ``Query.return_data_frame``
    """
    # gather the instance-derived arguments first, in the same order the values are needed
    frame_args = {
        "k_range": self.k_sizes,
        "training_file_names": self.training_file_names,
    }
    frame_args["containment_indices"] = self.__return_containment_indicies(query_file)
    return Query.return_data_frame(location_of_thresh=location_of_thresh,
                                   coverage_threshold=coverage_threshold,
                                   **frame_args)


def main():
    """
    Development entry point; at present it only sets two hard-coded paths and returns None.
    """
    # NOTE(review): both paths are machine-specific absolute paths and neither value is used yet --
    # presumably scaffolding for manual testing; confirm before relying on this entry point.
    training_database_file = "/home/dkoslicki/Desktop/CMash/tests/script_tests/TrainingDatabase.h5"
    query_file = "/home/dkoslicki/Desktop/CMash/tests/Organisms/taxid_1192839_4_genomic.fna.gz"


if __name__ == "__main__":
    # run the entry point exactly once (the duplicated call was removed)
    main()

0 comments on commit 1bf1b13

Please sign in to comment.