Commit

add doc strings, bug fixes, and turn into a function too so you can call it just like StreamingQueryDNADatabase.py #14
dkoslicki committed Mar 27, 2020
1 parent ae995c2 commit 0a557cc
Showing 2 changed files with 99 additions and 10 deletions.
107 changes: 98 additions & 9 deletions CMash/GroundTruth.py
@@ -10,6 +10,7 @@
import screed
from argparse import ArgumentTypeError
import multiprocessing
import argparse

# The following is for ease of development (so I don't need to keep re-installing the tool)
try:
@@ -47,14 +48,25 @@ def __init__(self, training_database_file: str, k_sizes: str):
self.training_file_to_ksize_to_kmers = self.__compute_all_training_kmers()

def __import_database(self) -> list:
"""
Private function that imports the HDF5 training file.
:return: a list of CountEstimators
:rtype: list
"""
CEs = MH.import_multiple_from_single_hdf5(self.training_database_file)
return CEs

def __return_file_names(self):
"""
Private function that gets all the file names contained in the training data.
:return: a list of file names
:rtype: list
"""
training_file_names = list(map(lambda x: x.input_file_name.decode('utf-8'), self.CEs))
return training_file_names

def __parseNumList(self, k_sizes_str: str) -> list:
@staticmethod
def __parseNumList(k_sizes_str: str) -> list:
"""
Parses a string like 10-21-1 and turns it into a list like [10, 11, 12,...,21]
:param k_sizes_str: the <start>-<end>-<increment> string
@@ -77,13 +89,25 @@ def __parseNumList(self, k_sizes_str: str) -> list:

@staticmethod
def __kmers(seq, ksize):
"""yield all k-mers of len ksize from seq.
"""
Yield all k-mers of length ksize from seq.
Returns an iterable object
:param seq: a DNA sequence
:type seq: str
:param ksize: a k-mer size
:type ksize: int
"""
for i in range(len(seq) - ksize + 1):
yield seq[i:i + ksize]
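As an aside, a minimal sketch (illustrative only, not code from this commit) of what __kmers yields and one common way to canonicalize k-mers via the reverse complement; the commit's own canonicalization lives in the elided body of _return_ksize_to_kmers and may differ:

def revcomp(seq: str) -> str:
    # reverse complement; any non-ACGT base maps to 'N'
    complement = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(complement.get(base, "N") for base in reversed(seq))

seq = "ATGCA"
kmers = ["ATG", "TGC", "GCA"]                             # exactly what __kmers(seq, 3) yields, in order
canonical = {min(kmer, revcomp(kmer)) for kmer in kmers}  # {'ATG', 'GCA'}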

def _return_ksize_to_kmers(self, input_file: str) -> dict:
"""
Enumerates all the k-mers specified by self.k_sizes in the genome/metagenome specified by input_file.
:param input_file: a file path to a fna/fq/gzipped file containing DNA sequences
:type input_file: str
:return: a dictionary with keys corresponding to k-mer sizes in self.k_sizes, and values the sets of canonical k-mers found in input_file
:rtype: dict
"""
k_sizes = self.k_sizes
k_size_to_kmers = dict()
# initialize all the k-mer sizes for the query file
@@ -125,9 +149,23 @@ def _return_ksize_to_kmers(self, input_file: str) -> dict:

@staticmethod
def __return_containment_index(set1: set, set2: set):
"""
Computes the containment index
:param set1: a set of k-mers
:type set1: set
:param set2: another set of k-mers
:type set2: set
:return: containment index |set1 \cap set2| / |set1|
:rtype: float
"""
return len(set1.intersection(set2)) / float(len(set1))
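A quick worked example of this containment index (the sets below are made up for illustration):

training_kmers = {"ATG", "TGC", "GCA"}          # set1: k-mers of one training genome
query_kmers = {"ATG", "GCA", "CCC", "TTT"}      # set2: k-mers of a query
# |set1 ∩ set2| / |set1| = 2 / 3 ≈ 0.667
containment = len(training_kmers.intersection(query_kmers)) / float(len(training_kmers))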

def __compute_all_training_kmers(self):
"""
In a parallelized fashion, enumerate all the k-mers for the given self.k_sizes in the input training genomes.
:return: a dictionary with keys given by self.training_file_names, values are dictionaries: keys are k_sizes, values are sets of canonical k-mers
:rtype: dict
"""
training_file_to_ksize_to_kmers = dict()
num_threads = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=int(min(num_threads, len(self.training_file_names))))
@@ -139,6 +177,16 @@ def __compute_all_training_kmers(self):
return training_file_to_ksize_to_kmers
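The pool created above presumably maps _return_ksize_to_kmers over the training file names in the elided lines; a self-contained sketch of that map-then-collect pattern (the worker and file names below are toy stand-ins, not the commit's code):

import multiprocessing

def enumerate_kmers(file_name):
    # toy stand-in for TrueContainment._return_ksize_to_kmers: k-mer size -> set of k-mers
    return {7: {file_name[:7]}}

if __name__ == "__main__":
    file_names = ["genomeA.fna", "genomeB.fna"]
    pool = multiprocessing.Pool(processes=min(multiprocessing.cpu_count(), len(file_names)))
    per_file = pool.map(enumerate_kmers, file_names)
    pool.close()
    pool.join()
    file_to_ksize_to_kmers = dict(zip(file_names, per_file))
    # {'genomeA.fna': {7: {'genomeA'}}, 'genomeB.fna': {7: {'genomeB'}}}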

def __return_containment_indicies(self, query_file: str) -> np.ndarray:
"""
Creates a matrix of containment indices:
for each i in self.training_file_names:
for each k in k_sizes:
containment_indicies[i, k] = |query_file_k-mers \cap training_file_i_k-mers| / |training_file_i_k-mers|
:param query_file: a file pointing to a fasta/q (maybe compressed) file
:type query_file: str
:return: a numpy matrix of containment indices: containment_indicies[i, k] = |query_file_k-mers \cap training_file_i_k-mers| / |training_file_i_k-mers|
:rtype: np.ndarray
"""
training_file_names = self.training_file_names
k_sizes = self.k_sizes
training_file_to_ksize_to_kmers = self.training_file_to_ksize_to_kmers
@@ -157,14 +205,27 @@ def __return_containment_indicies(self, query_file: str) -> np.ndarray:
# need to compute the k-mers in the query file
query_file_to_ksize_to_kmers = self._return_ksize_to_kmers(query_file)
for (j, k_size) in enumerate(k_sizes):
query_kmers = query_file_to_ksize_to_kmers[query_file][k_size]
query_kmers = query_file_to_ksize_to_kmers[k_size]
for (i, file_name) in enumerate(training_file_names):
training_kmers = training_file_to_ksize_to_kmers[file_name][k_size]
# | train \cap query| / | train |
containment_indicies[i, j] = self.__return_containment_index(training_kmers, query_kmers)
return containment_indicies

def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame:
"""
Returns a Pandas Data frame with rows indexed by training file names, columns indexed by k-mer sizes, and entries the
containment indices for the given query_file. Exactly the same format as CMash/Query.py and scripts/StreamingQueryDNADatabase.py
:param query_file: a file pointing to a fasta/q (maybe compressed) file. Need not be in the training data
:type query_file: str
:param location_of_thresh: where in self.k_sizes the thresholding should take place (-1 means the last one)
:type location_of_thresh: int
:param coverage_threshold: filter out those results that have containment indices strictly below this threshold
:type coverage_threshold: float
:return: Returns a Pandas Data frame with rows indexed by training file names, columns indexed by k-mer sizes, and entries the
containment indices for the given query_file.
:rtype: pandas.DataFrame
"""
k_range = self.k_sizes
training_file_names = self.training_file_names
containment_indices = self.__return_containment_indicies(query_file)
@@ -177,13 +238,41 @@ def return_containment_data_frame(self, query_file: str, location_of_thresh: int, coverage_threshold: float) -> pd.DataFrame:


def main():
training_database_file = "/home/dkoslicki/Desktop/CMash/tests/script_tests/TrainingDatabase.h5"
query_file = "/home/dkoslicki/Desktop/CMash/tests/Organisms/taxid_1192839_4_genomic.fna.gz"
k_range = "4-5-1" # small test
#k_range = "10-21-2" # same as in tests/script_tests/run_small_tests.sh
g = TrueContainment(training_database_file, k_range)
print(g.return_containment_data_frame(query_file, -1, .1)) # defaults
parser = argparse.ArgumentParser(
description="This script calculates the *ground truth* containment indices for each of the training/reference sketches"
" via brute force enumeration of all the (canonical) k-mers", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--containment_threshold', type=float,
help="Only return results with containment index at or above this "
"threshold at the maximum k-mer size.", default=0.1)
parser.add_argument('-l', '--location_of_thresh', type=int,
help="Location in the k-mer range at which to apply the threshold passed by the -c flag. For example, with"
" k_range 5-50-10, -l 2 means the threshold will be applied at k-mer size 25. Default is the largest size.", default=-1)
parser.add_argument('in_file', help="Input file: FASTA/Q file to be processed")
parser.add_argument('reference_file',
help='Training database/reference file (in HDF5 format). Created with MakeStreamingDNADatabase.py')
parser.add_argument('out_file', help='Output csv file with the containment indices.')
parser.add_argument('k_range', type=str,
help="Range of k-mer sizes in the format <start>-<end>-<increment>."
" So 5-10-2 means [5, 7, 9]. If <end> is larger than the k-mer size"
" of the training data, this will automatically be reduced.")

# get the args
args = parser.parse_args()
k_sizes = args.k_range
training_database_file = args.reference_file
query_file = args.in_file
out_file = args.out_file
location_of_thresh = args.location_of_thresh
coverage_threshold = args.containment_threshold

# pre-compute the kmers in the training database
g = TrueContainment(training_database_file=training_database_file, k_sizes=k_sizes)

# compute the containment indices
df = g.return_containment_data_frame(query_file=query_file, location_of_thresh=location_of_thresh, coverage_threshold=coverage_threshold)

# save them
df.to_csv(out_file, index=True, encoding='utf-8')


if __name__ == "__main__":
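With the refactor above, the module can be driven either programmatically or from the command line, analogously to StreamingQueryDNADatabase.py. A hedged usage sketch (it assumes CMash is importable as a package; the file paths below are placeholders, not files from the repository):

from CMash.GroundTruth import TrueContainment

# programmatic use, mirroring what main() does (paths are hypothetical)
g = TrueContainment(training_database_file="TrainingDatabase.h5", k_sizes="10-21-2")
df = g.return_containment_data_frame(query_file="query.fna.gz",
                                     location_of_thresh=-1,
                                     coverage_threshold=0.1)
df.to_csv("ground_truth.csv", index=True, encoding='utf-8')

# command-line use with the new argparse interface:
#   python CMash/GroundTruth.py query.fna.gz TrainingDatabase.h5 ground_truth.csv 10-21-2 -c 0.1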
2 changes: 1 addition & 1 deletion CMash/Query.py
@@ -28,7 +28,7 @@ def return_data_frame(training_file_names: list, k_range: list, location_of_thresh
:type location_of_thresh: int
:param containment_indices: the containment indicies matrix you wish to convert to a pandas data frame
:type containment_indices: numpy.ndarray
:param coverage_threshold: filter out those results that have containment indicies below this threshold
:param coverage_threshold: filter out those results that have containment indices strictly below this threshold
:type coverage_threshold: float
"""
results = dict()
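For reference, a small pandas sketch of the "strictly below" thresholding described in this docstring (labels and values are made up; this is not the implementation inside return_data_frame):

import pandas as pd

df = pd.DataFrame([[0.30, 0.25], [0.20, 0.05]],
                  index=["genomeA.fna", "genomeB.fna"], columns=[15, 21])
coverage_threshold, location_of_thresh = 0.1, -1   # threshold applied at the last k-mer size (k=21)
filtered = df[df.iloc[:, location_of_thresh] >= coverage_threshold]
# genomeB.fna is dropped: its containment at k=21 (0.05) is strictly below 0.1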
