Skip to content

Commit

Permalink
ok, well, at least it works, but only if you rip out the bloom filter…
Browse files Browse the repository at this point in the history
… altogether, and don't break if you don't see a TST match for a shorter kmer size. #2 and #20. Calling it a night
  • Loading branch information
dkoslicki committed Mar 18, 2020
1 parent 1ccf64f commit 4f6378f
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 6 deletions.
34 changes: 33 additions & 1 deletion dataForShaopeng/test_issue/StreamingQueryScript.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import timeit
from itertools import islice

t00 = timeit.default_timer()

# TODO: export hit matrices

Expand Down Expand Up @@ -54,7 +55,7 @@ def parseNumList(input):
raise Exception("The --range argument is required, no matter what the help menu says.")
training_data = "/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/TrainingDatabase_k_61.h5"
query_file = "/home/dkoslicki/Desktop/CMash/dataForShaopeng/small_data/taxid_1909294_104_genomic.fna.gz"
results_file = f"/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/out_{my_range.replace('-','_')})"
results_file = f"/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/out_{my_range.replace('-','_')}.csv"
npz_file = os.path.splitext(results_file)[0] + "_hit_matrix.npz"
num_threads = 12
location_of_thresh = -1
Expand Down Expand Up @@ -179,7 +180,13 @@ def return_matches(self, input_kmer, k_size_loc):
to_return = []
for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]: # FIXME: might need to break if one of them matches
# for kmer in [input_kmer]:
#if kmer not in all_kmers_bf: # TODO: but for some reason it works here
# return [], False
prefix_matches = tree.keys(kmer) # get all the k-mers whose prefix matches
#if not kmer in all_kmers_bf:
# return [], False
#if prefix_matches and not kmer in all_kmers_bf:
# print(f"prefix: {prefix_matches}, is in BF: {kmer in all_kmers_bf}")
# match_info = set()
# get the location of the found kmers in the counters
for item in prefix_matches:
Expand Down Expand Up @@ -219,16 +226,37 @@ def process_seq(self, seq):
possible_match = True
# start looking at the other k_sizes, don't overhang len(seq)
if possible_match:
#prev_match = True
for other_k_size in [x for x in k_range[1:] if i + x <= len(seq)]:
kmer = seq[i:i + other_k_size]
#if kmer in all_kmers_bf:
#print("True")
#if kmer not in all_kmers_bf and tree.keys(kmer):
# print(f"{kmer}\n")
if True:
k_size_loc = k_range.index(other_k_size)
match_list, saw_match = self.return_matches(kmer, k_size_loc)
if saw_match:
to_return.extend(match_list)
#if saw_match and kmer not in all_kmers_bf: # TODO: this doesn't return a single "bad thing happened", yet I can't use `if kmer in all_kmers_bf` at the top
# print("bad things happened")
#else:
# break # TODO: Interesting! If you break after not seeing a match, it goes back to the bad results. So something is wrong with the logic of the for loop
# TODO: What I would like to do is see what the kmers look like when you don't see a match for a smaller k-mer size, but somehow do see a match for the larger kmer size
#if saw_match and not prev_match:
#print(f"{kmer}\n")
#else:
# prev_match = saw_match
# FIXME: ok, so here's the problem: longer kmer matches while shorter does not: for kmer='CATGTCTTTCAGGCTGGAACCGGAGGCGACCAATGCCGACCGGCTGCTTTAT', tree.keys(kmer)==[] and tree.keys(khmer.reverse_complement(kmer))==match, yet tree.keys(khmer.reverse_complement(kmer[0:-1]))==[] and tree.keys(kmer[0:-1])==[]
# FIXME: but curiously, tree.keys(khmer.reverse_complement(kmer)[0:-1])==match, so it depends on when the reverse complement is being applied!!!! <<<<<-------------!!!!!!!!!!!!!
# FIXME: Additionally, kmer in all_kmers_bf == TRUE yet, kmer[0:-1] in all_kmers_bf == FALSE
# FIXME: so the problem is, that reverse complement is REVERSE complement, you need to FLIP THE KMER AROUND if you are doing prefix lookups in the tree. MAYBE?!
# MAYBE: instead of just passing the kmer, and then returning_matches using the kmer and it's reverse complement, I should pass the kmer,

# FIXME: I *might* be able to circumvent all of this just by using canonical k-mers....
else:
break

return to_return


Expand Down Expand Up @@ -258,6 +286,7 @@ def map_func(sequence):
if verbose:
print("Read in %d sequences" % i)
res = pool.map(map_func, to_proc, chunksize=int(max(1, min(num_reads_per_core, len(to_proc) / num_threads))))
#res = map(map_func, to_proc)
flattened_res = [item for sublist in res if sublist for item in sublist]
flattened_res = list(set(flattened_res)) # dedup it
match_tuples.extend(flattened_res)
Expand Down Expand Up @@ -345,3 +374,6 @@ def map_func(sequence):
print("Exporting results")
t0 = timeit.default_timer()
filtered_results.to_csv(results_file, index=True, encoding='utf-8')

t11 = timeit.default_timer()
print(f"Total time: {t11 - t00}")
Binary file modified dataForShaopeng/test_issue/TrainingDatabase_k_61.h5
Binary file not shown.
13 changes: 8 additions & 5 deletions scripts/StreamingQueryDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,11 @@ def return_matches(self, input_kmer, k_size_loc):
""" Get all the matches in the trie with the kmer prefix"""
match_info = set()
to_return = []
for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]: # FIXME: might need to break if one of them matches
saw_match = False
for kmer in [input_kmer, khmer.reverse_complement(input_kmer)]:
#for kmer in [input_kmer]:
if kmer not in all_kmers_bf: # TODO: this is fine
continue
prefix_matches = tree.keys(kmer) # get all the k-mers whose prefix matches
#match_info = set()
# get the location of the found kmers in the counters
Expand All @@ -222,12 +225,12 @@ def process_seq(self, seq):
for i in range(len(seq) - small_k_size + 1): # look at all k-mers
kmer = seq[i:i + small_k_size]
possible_match = False
#if kmer not in seen_kmers: # if we should process it
if True:
#if kmer in all_kmers_bf: # if we should process it
if kmer not in seen_kmers: # if we should process it
#if True:
#if kmer in all_kmers_bf or kmer[::-1] in all_kmers_bf: # if we should process it
if True:
match_list, saw_match = self.return_matches(kmer, 0)
if saw_match: # TODO: note, I *could* add all the trie matches and their sub-kmers to the seen_kmers
if saw_match:
seen_kmers.add(kmer) # FIXME: might also be able to add the reverse complements in here, instead of adjusting the division down near line 332
seen_kmers.add(khmer.reverse_complement(kmer))
to_return.extend(match_list)
Expand Down

0 comments on commit 4f6378f

Please sign in to comment.