Commit

add doc strings, bug fixes, and turn into a function too so you can call it just like StreamingQueryDNADatabase.py #14
dkoslicki committed Mar 27, 2020
1 parent ae995c2 commit 0a557cc
Showing 2 changed files with 99 additions and 10 deletions.
107 changes: 98 additions & 9 deletions CMash/GroundTruth.py
@@ -10,6 +10,7 @@
import screed
from argparse import ArgumentTypeError
import multiprocessing
import argparse

# The following is for ease of development (so I don't need to keep re-installing the tool)
try:
@@ -47,14 +48,25 @@ def __init__(self, training_database_file: str, k_sizes: str):
self.training_file_to_ksize_to_kmers = self.__compute_all_training_kmers()

def __import_database(self) -> list:
"""
Private function that imports the HDF5 training file.
:return: a list of CountEstimators
:rtype: list
"""
CEs = MH.import_multiple_from_single_hdf5(self.training_database_file)
return CEs

def __return_file_names(self):
"""
Private function that gets all the file names contained in the training data.
:return: a list of file names
:rtype: list
"""
training_file_names = list(map(lambda x: x.input_file_name.decode('utf-8'), self.CEs))
return training_file_names

def __parseNumList(self, k_sizes_str: str) -> list:
@staticmethod
def __parseNumList(k_sizes_str: str) -> list:
"""
Parses a string like 10-21-1 and turns it into a list like [10, 11, 12,...,21]
:param k_sizes_str: the <start>-<end>-<increment> string
@@ -77,13 +89,25 @@ def __parseNumList(self, k_sizes_str: str) -> list:

@staticmethod
def __kmers(seq, ksize):
"""yield all k-mers of len ksize from seq.
"""
Yield all k-mers of length ksize from seq.
Returns an iterable object
:param seq: a DNA sequence
:type seq: str
:param ksize: a k-mer size
:type ksize: int
"""
for i in range(len(seq) - ksize + 1):
yield seq[i:i + ksize]
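As an aside, a minimal sketch (illustrative only, not code from this commit) of what __kmers yields and one common way to canonicalize k-mers via the reverse complement; the commit's own canonicalization lives in the elided body of _return_ksize_to_kmers and may differ:

def revcomp(seq: str) -> str:
    # reverse complement; any non-ACGT base maps to 'N'
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(complement.get(base, "N") for base in reversed(seq))

seq = "ATGCA"
kmers = ["ATG", "TGC", "GCA"]                             # exactly what __kmers(seq, 3) yields, in order
canonical = {min(kmer, revcomp(kmer)) for kmer in kmers}  # {'ATG', 'GCA'}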

def _return_ksize_to_kmers(self, input_file: str) -> dict:
"""
Enumerates all the k-mers specified by self.k_sizes in the genome/metagenome specified by input_file.
:param input_file: a file path to a fna/fq/gzipped file containing DNA sequences
:type input_file: str
:return: a dictionary with keys corresponding to k-mer sizes in self.k_sizes, and values the sets of canonical k-mers found in input_file
:rtype: dict
"""
k_sizes = self.k_sizes
k_size_to_kmers = dict()
# initialize all the k-mer sizes for the query file
@@ -125,9 +149,23 @@ def _return_ksize_to_kmers(self, input_file: str) -> dict:

@staticmethod
def __return_containment_index(set1: set, set2: set):
"""
Computes the containment index
:param set1: a set of k-mers
:type set1: set
:param set2: another set of k-mers
:type set2: set
:return: containment index |set1 \cap set2| / |set1|
:rtype: float
"""
return len(set1.intersection(set2)) / float(len(set1))
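A quick worked example of this containment index (the sets below are made up for illustration):

training_kmers = {"ATG", "TGC", "GCA"}          # set1: k-mers of one training genome
query_kmers = {"ATG", "GCA", "CCC", "TTT"}      # set2: k-mers of a query
# |set1 ∩ set2| / |set1| = 2 / 3 ≈ 0.667
containment = len(training_kmers.intersection(query_kmers)) / float(len(training_kmers))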

def __compute_all_training_kmers(self):
"""
In a parallelized fashion, enumerate all the k-mers for the given self.k_sizes in the input training genomes.
:return: a dictionary with keys given by self.training_file_names, values are dictionaries: keys are k_sizes, values are sets of canonical k-mers
:rtype: dict
"""
training_file_to_ksize_to_kmers = dict()
num_threads = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=int(min(num_threads, len(self.training_file_names))))
@@ -139,6 +177,16 @@ def __compute_all_training_kmers(self):
return training_file_to_ksize_to_kmers
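The pool created above presumably maps _return_ksize_to_kmers over the training file names in the elided lines; a self-contained sketch of that map-then-collect pattern (the worker and file names below are toy stand-ins, not the commit's code):

import multiprocessing

def enumerate_kmers(file_name):
    # toy stand-in for TrueContainment._return_ksize_to_kmers: k-mer size -> set of k-mers
    return {7: {file_name[:7]}}

if __name__ == "__main__":
    file_names = ["genomeA.fna", "genomeB.fna"]
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), len(file_names)))
    per_file = pool.map(enumerate_kmers, file_names)
    pool.close()
    pool.join()
    file_to_ksize_to_kmers = dict(zip(file_names, per_file))
    # {'genomeA.fna': {7: {'genomeA'}}, 'genomeB.fna': {7: {'genomeB'}}}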

def __return_containment_indicies(self, query_file: str) -> np.ndarray:
"""
Creates a matrix of containment indices:
for each i in self.training_file_names:
for each k in k_sizes:
containment_indicies[i, k] = |query_file_k-mers \cap training_file_i_k-mers| / |training_file_i_k-mers|
:param query_file: a file pointing to a fasta/q (maybe compressed) file
:type query_file: str
:return: a numpy matrix of containment indices: containment_indicies[i, k] = |query_file_k-mers \cap training_file_i_k-mers| / |training_file_i_k-mers|
:rtype: np.ndarray
"""
training_file_names = self.training_file_names
k_sizes = self.k_sizes
training_file_to_ksize_to_kmers = self.training_file_to_ksize_to_kmers
@@ -157,14 +205,27 @@ def __return_containment_indicies(self, query_file: str) -> np.ndarray:
# need to compute the k-mers in the query file
query_file_to_ksize_to_kmers = self._return_ksize_to_kmers(query_file)
for (j, k_size) in enumerate(k_sizes):
query_kmers = query_file_to_ksize_to_kmers[query_file][k_size]
query_kmers = query_file_to_ksize_to_kmers[k_size]
for (i, file_name) in enumerate(training_file_names):
training_kmers = training_file_to_ksize_to_kmers[file_name][k_size]
# | train \cap query| / | train |
containment_indicies[i, j] = self.__return_containment_index(training_kmers, query_kmers)
return containment_indicies

def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame:
"""
Returns a Pandas Data frame with rows indexed by training file names, columns indexed by k-mer sizes, and entries the
containment indices for the given query_file. Exactly the same format as CMash/Query.py and scripts/StreamingQueryDNADatabase.py
:param query_file: a file pointing to a fasta/q (maybe compressed) file. Need not be in the training data
:type query_file: str
:param location_of_thresh: where in self.k_sizes the thresholding should take place (-1 means the last one)
:type location_of_thresh: int
:param coverage_threshold: filter out those results that have containment indices strictly below this threshold
:type coverage_threshold: float
:return: Returns a Pandas Data frame with rows indexed by training file names, columns indexed by k-mer sizes, and entries the
containment indices for the given query_file.
:rtype: pandas.DataFrame
"""
k_range = self.k_sizes
training_file_names = self.training_file_names
containment_indices = self.__return_containment_indicies(query_file)
@@ -177,13 +238,41 @@ def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame:


def main():
training_database_file = "/home/dkoslicki/Desktop/CMash/tests/script_tests/TrainingDatabase.h5"
query_file = "/home/dkoslicki/Desktop/CMash/tests/Organisms/taxid_1192839_4_genomic.fna.gz"
k_range = "4-5-1" # small test
#k_range = "10-21-2" # same as in tests/script_tests/run_small_tests.sh
g = TrueContainment(training_database_file, k_range)
print(g.return_containment_data_frame(query_file, -1, .1)) # defaults
parser = argparse.ArgumentParser(
description="This script calculates the *ground truth* containment indices for each of the training/reference sketches"
" via brute force enumeration of all the (canonical) k-mers", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--containment_threshold', type=float,
help="Only return results with containment index at or above this "
"threshold at the maximum k-mer size.", default=0.1)
parser.add_argument('-l', '--location_of_thresh', type=int,
help="Location in the k-mer range at which to apply the threshold passed by the -c flag. For example, with"
" k_range 5-50-10, -l 2 means the threshold will be applied at k-mer size 25. Default is the largest size.", default=-1)
parser.add_argument('in_file', help="Input file: FASTA/Q file to be processed")
parser.add_argument('reference_file',
help='Training database/reference file (in HDF5 format). Created with MakeStreamingDNADatabase.py')
parser.add_argument('out_file', help='Output csv file with the containment indices.')
parser.add_argument('k_range', type=str,
help="Range of k-mer sizes in the format <start>-<end>-<increment>."
" So 5-10-2 means [5, 7, 9]. If <end> is larger than the k-mer size"
" of the training data, this will automatically be reduced.")

# get the args
args = parser.parse_args()
k_sizes = args.k_range
training_database_file = args.reference_file
query_file = args.in_file
out_file = args.out_file
location_of_thresh = args.location_of_thresh
coverage_threshold = args.containment_threshold

# pre-compute the kmers in the training database
g = TrueContainment(training_database_file=training_database_file, k_sizes=k_sizes)

# compute the containment indices
df = g.return_containment_data_frame(query_file=query_file, location_of_thresh=location_of_thresh, coverage_threshold=coverage_threshold)

# save them
df.to_csv(out_file, index=True, encoding='utf-8')


if __name__ == "__main__":
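With the refactor above, the module can be driven either programmatically or from the command line, analogously to StreamingQueryDNADatabase.py. A hedged usage sketch (it assumes CMash is importable as a package; the file paths below are placeholders, not files from the repository):

from CMash.GroundTruth import TrueContainment

# programmatic use, mirroring what main() does (paths are hypothetical)
g = TrueContainment(training_database_file="TrainingDatabase.h5", k_sizes="10-21-2")
df = g.return_containment_data_frame(query_file="query.fna.gz",
                                     location_of_thresh=-1,
                                     coverage_threshold=0.1)
df.to_csv("ground_truth.csv", index=True, encoding='utf-8')

# command-line use with the new argparse interface:
#   python CMash/GroundTruth.py query.fna.gz TrainingDatabase.h5 ground_truth.csv 10-21-2 -c 0.1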
2 changes: 1 addition & 1 deletion CMash/Query.py
@@ -28,7 +28,7 @@ def return_data_frame(training_file_names: list, k_range: list, location_of_thresh
:type location_of_thresh: int
:param containment_indices: the containment indicies matrix you wish to convert to a pandas data frame
:type containment_indices: numpy.ndarray
:param coverage_threshold: filter out those results that have containment indicies below this threshold
:param coverage_threshold: filter out those results that have containment indices strictly below this threshold
:type coverage_threshold: float
"""
results = dict()
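For reference, a small pandas sketch of the "strictly below" thresholding described in this docstring (labels and values are made up; this is not the implementation inside return_data_frame):

import pandas as pd

df = pd.DataFrame([[0.30, 0.25], [0.20, 0.05]],
                  index=["genomeA.fna", "genomeB.fna"], columns=[15, 21])
coverage_threshold, location_of_thresh = 0.1, -1   # threshold applied at the last k-mer size (k=21)
filtered = df[df.iloc[:, location_of_thresh] >= coverage_threshold]
# genomeB.fna is dropped: its containment at k=21 (0.05) is strictly below 0.1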
