Skip to content

Commit

Permalink
commit output with no BF for checking. #2 #20 #19
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Mar 18, 2020
1 parent f7bee92 commit 0161b06
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 12 deletions.
1 change: 1 addition & 0 deletions dataForShaopeng/test_issue/StreamingQueryScript.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def process_seq(self, seq):
# MAYBE: instead of just passing the kmer, and then returning_matches using the kmer and it's reverse complement, I should pass the kmer,

# FIXME: I *might* be able to circumvent all of this just by using canonical k-mers....
# TODO: may think the right idea is: rev-comp, then shorten
else:
break

Expand Down
Binary file modified dataForShaopeng/test_issue/TrainingDatabase_k_61.h5
Binary file not shown.
Binary file modified dataForShaopeng/test_issue/TrainingDatabase_k_61.tst
Binary file not shown.
60 changes: 58 additions & 2 deletions dataForShaopeng/test_issue/check_real.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def reduce_to_with_revcomp(kmers, k_size):
for j in range(len(seq) - max_k + 1):
kmer_sets[i].add(seq[j:j + max_k])

print(f"True containment 104 containment at {max_k}: {len(kmer_sets[0].intersection(kmer_sets[1])) / float(len(kmer_sets[0]))}")
print(f"True containment 109 containment at {max_k}: {len(kmer_sets[0].intersection(kmer_sets[1])) / float(len(kmer_sets[1]))}")
print(f"True containment 104 containment at, plain kmers {max_k}: {len(kmer_sets[0].intersection(kmer_sets[1])) / float(len(kmer_sets[0]))}")
print(f"True containment 109 containment at, plain kmers {max_k}: {len(kmer_sets[0].intersection(kmer_sets[1])) / float(len(kmer_sets[1]))}")

# and check the containments with rev-comps
max_k = 40
Expand All @@ -82,5 +82,61 @@ def reduce_to_with_revcomp(kmers, k_size):
print(f"True containment 109 at {max_k} with rev-comp: {len(kmer_sets_rev[0].intersection(kmer_sets_rev[1])) / float(len(kmer_sets_rev[1]))}")



# check containment with canonical k-mers
max_k = 40
kmer_sets_can = []
for i, input_file in enumerate(import_list):
kmer_sets_can.append(set())
for record in screed.open(input_file):
seq = record.sequence
for j in range(len(seq) - max_k + 1):
kmer = seq[j:j + max_k]
kmer_rc = khmer.reverse_complement(kmer)
if kmer_rc < kmer:
kmer = kmer_rc
kmer_sets_can[i].add(kmer)


print(f"True containment 104 at {max_k}, canonical: {len(kmer_sets_can[0].intersection(kmer_sets_can[1])) / float(len(kmer_sets_can[0]))}")
print(f"True containment 109 at {max_k}, canonical: {len(kmer_sets_can[0].intersection(kmer_sets_can[1])) / float(len(kmer_sets_can[1]))}")


######
# also check the largest size
# and check the containments with rev-comps
max_k = 61
kmer_sets_rev = []
for i, input_file in enumerate(import_list):
kmer_sets_rev.append(set())
for record in screed.open(input_file):
seq = record.sequence
for j in range(len(seq) - max_k + 1):
kmer_sets_rev[i].add(seq[j:j + max_k])
kmer_sets_rev[i].add(khmer.reverse_complement(seq[j:j + max_k]))


print(f"True containment 104 at {max_k} with rev-comp: {len(kmer_sets_rev[0].intersection(kmer_sets_rev[1])) / float(len(kmer_sets_rev[0]))}")
print(f"True containment 109 at {max_k} with rev-comp: {len(kmer_sets_rev[0].intersection(kmer_sets_rev[1])) / float(len(kmer_sets_rev[1]))}")

# check containment with canonical k-mers
max_k = 61
kmer_sets_can = []
for i, input_file in enumerate(import_list):
kmer_sets_can.append(set())
for record in screed.open(input_file):
seq = record.sequence
for j in range(len(seq) - max_k + 1):
kmer = seq[j:j + max_k]
kmer_rc = khmer.reverse_complement(kmer)
if kmer_rc < kmer:
kmer = kmer_rc
kmer_sets_can[i].add(kmer)


print(f"True containment 104 at {max_k}, canonical: {len(kmer_sets_can[0].intersection(kmer_sets_can[1])) / float(len(kmer_sets_can[0]))}")
print(f"True containment 109 at {max_k}, canonical: {len(kmer_sets_can[0].intersection(kmer_sets_can[1])) / float(len(kmer_sets_can[1]))}")


# FIXME: problem is with the bloom filter. See StreamingQueryScript.py lines 207 ish, don't use the bloom filter and things go fine

2 changes: 1 addition & 1 deletion dataForShaopeng/test_issue/out_40_61_1.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
,k=40,k=41,k=42,k=43,k=44,k=45,k=46,k=47,k=48,k=49,k=50,k=51,k=52,k=53,k=54,k=55,k=56,k=57,k=58,k=59,k=60,k=61
taxid_1909294_104_genomic.fna.gz,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
taxid_1909294_109_genomic.fna.gz,0.677,0.582,0.575,0.57,0.564,0.559,0.555,0.555,0.551,0.547,0.542,0.535,0.532,0.527,0.523,0.52,0.518,0.513,0.504,0.5,0.497,0.493
taxid_1909294_109_genomic.fna.gz,0.677,0.67,0.662,0.653,0.644,0.634,0.624,0.617,0.608,0.601,0.589,0.577,0.57,0.562,0.554,0.543,0.538,0.529,0.514,0.508,0.501,0.493
2 changes: 1 addition & 1 deletion dataForShaopeng/test_issue/out_50_61_1.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
,k=50,k=51,k=52,k=53,k=54,k=55,k=56,k=57,k=58,k=59,k=60,k=61
taxid_1909294_104_genomic.fna.gz,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
taxid_1909294_109_genomic.fna.gz,0.589,0.535,0.532,0.527,0.523,0.519,0.518,0.513,0.504,0.5,0.497,0.493
taxid_1909294_109_genomic.fna.gz,0.589,0.577,0.57,0.562,0.554,0.543,0.538,0.529,0.514,0.508,0.501,0.493
12 changes: 8 additions & 4 deletions scripts/MakeStreamingDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,18 @@ def main():
to_insert = set()
for i in range(len(genome_sketches)):
for kmer_index in range(len(genome_sketches[i]._kmers)): # FIXME: think about which suffix/prefixes to add here
# normal kmer
kmer = genome_sketches[i]._kmers[kmer_index]
to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
# rev-comp kmer
kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index])
to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index])[::-1]
to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index][::-1])
to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
# rev-comp then reverse
#kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index])[::-1]
#to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
# reverse then rev-comp
#kmer = khmer.reverse_complement(genome_sketches[i]._kmers[kmer_index][::-1])
#to_insert.add(kmer + 'x' + str(i) + 'x' + str(kmer_index)) # format here is kmer+x+hash_index+kmer_index
tree = mt.Trie(to_insert)
if verbose:
print("Saving Ternary search tree")
Expand Down
21 changes: 17 additions & 4 deletions scripts/StreamingQueryDNADatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,30 @@ def keyfunction(item):
for sketch in sketches:
for kmer in sketch._kmers: # FIXME: think about what all actually needs to be added
for ksize in k_range:
# truncated kmer
all_kmers_bf.add(kmer[0:ksize]) # put all the k-mers and the appropriate suffixes in
# truncate, reverse
all_kmers_bf.add(kmer[0:ksize][::-1])

# truncate, rev-comp
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize])) # also add the reverse complement
# truncate, reverse, rev-comp
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize][::-1]))
# truncate, rev-comp, reverse
all_kmers_bf.add(khmer.reverse_complement(kmer[0:ksize])[::-1])

# tail
all_kmers_bf.add(kmer[-ksize:]) # put all the k-mers and the appropriate suffixes in
# reversed tail
all_kmers_bf.add(kmer[-ksize:][::-1])
# tail, rev-comp
all_kmers_bf.add(khmer.reverse_complement(kmer[-ksize:])) # also add the reverse complement
# tail, reverse, rev-comp
all_kmers_bf.add(khmer.reverse_complement(kmer[-ksize:][::-1]))
# tail, revcomp, reverse
all_kmers_bf.add(khmer.reverse_complement(kmer[-ksize:])[::-1])
# rev-comp, tail
all_kmers_bf.add(khmer.reverse_complement(kmer)[-ksize:])

except IOError:
print("No such file or directory/error opening file: %s" % hydra_file)
Expand Down Expand Up @@ -236,8 +249,8 @@ def process_seq(self, seq):
possible_match = False
if kmer not in seen_kmers: # if we should process it
#if True:
if kmer in all_kmers_bf: # if we should process it
#if True:
#if kmer in all_kmers_bf: # if we should process it
if True:
match_list, saw_match = self.return_matches(kmer, 0)
if saw_match:
seen_kmers.add(kmer) # FIXME: might also be able to add the reverse complements in here, instead of adjusting the division down near line 332
Expand All @@ -251,8 +264,8 @@ def process_seq(self, seq):
if possible_match:
for other_k_size in [x for x in k_range[1:] if i+x <= len(seq)]:
kmer = seq[i:i + other_k_size]
if kmer in all_kmers_bf:
#if True:
#if kmer in all_kmers_bf:
if True:
k_size_loc = k_range.index(other_k_size)
match_list, saw_match = self.return_matches(kmer, k_size_loc)
if saw_match:
Expand Down

0 comments on commit 0161b06

Please sign in to comment.