Skip to content

Commit

Permalink
problem really is with the bloom filter. Remove it everywhere, and problems in #2 and #20 go away
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Mar 18, 2020
1 parent cccd5bb commit bf01bf3
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 6 deletions.
11 changes: 7 additions & 4 deletions dataForShaopeng/test_issue/StreamingQueryScript.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,13 @@ def parseNumList(input):


# read in the arguments
k_range = parseNumList("60-61-1")
my_range = "50-61-1"
k_range = parseNumList(my_range)
if k_range is None:
raise Exception("The --range argument is required, no matter what the help menu says.")
training_data = "/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/TrainingDatabase_k_61.h5"
query_file = "/home/dkoslicki/Desktop/CMash/dataForShaopeng/small_data/taxid_1909294_104_genomic.fna.gz"
results_file = "/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/out_61_61_1.csv"
results_file = f"/home/dkoslicki/Desktop/CMash/dataForShaopeng/test_issue/out_{my_range.replace('-','_')})"
npz_file = os.path.splitext(results_file)[0] + "_hit_matrix.npz"
num_threads = 12
location_of_thresh = -1
Expand Down Expand Up @@ -203,7 +204,8 @@ def process_seq(self, seq):
for i in range(len(seq) - small_k_size + 1): # look at all k-mers
kmer = seq[i:i + small_k_size]
possible_match = False
if kmer not in seen_kmers: # if we should process it
#if kmer not in seen_kmers: # if we should process it
if True:
#if kmer in all_kmers_bf: # if we should process it # FIXME: problem might be here since if I remove the bloom filter, everything appears to work just fine....
if True:
match_list, saw_match = self.return_matches(kmer, 0)
Expand All @@ -219,7 +221,8 @@ def process_seq(self, seq):
if possible_match:
for other_k_size in [x for x in k_range[1:] if i + x <= len(seq)]:
kmer = seq[i:i + other_k_size]
if kmer in all_kmers_bf:
#if kmer in all_kmers_bf:
if True:
k_size_loc = k_range.index(other_k_size)
match_list, saw_match = self.return_matches(kmer, k_size_loc)
if saw_match:
Expand Down
Binary file modified dataForShaopeng/test_issue/TrainingDatabase_k_61.h5
Binary file not shown.
2 changes: 1 addition & 1 deletion dataForShaopeng/test_issue/check_real.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def reduce_to_with_revcomp(kmers, k_size):
print(f"True containment 109 containment at {max_k}: {len(kmer_sets[0].intersection(kmer_sets[1])) / float(len(kmer_sets[1]))}")

# and check the containments with rev-comps
max_k = 60
max_k = 51
kmer_sets_rev = []
for i, input_file in enumerate(import_list):
kmer_sets_rev.append(set())
Expand Down
3 changes: 2 additions & 1 deletion scripts/StreamingQueryDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,8 @@ def process_seq(self, seq):
for i in range(len(seq) - small_k_size + 1): # look at all k-mers
kmer = seq[i:i + small_k_size]
possible_match = False
if kmer not in seen_kmers: # if we should process it
#if kmer not in seen_kmers: # if we should process it
if True:
#if kmer in all_kmers_bf: # if we should process it
if True:
match_list, saw_match = self.return_matches(kmer, 0)
Expand Down

0 comments on commit bf01bf3

Please sign in to comment.