Skip to content

Commit

Permalink
added a bunch of unit tests for #14. Still some concerns about order …
Browse files Browse the repository at this point in the history
…of truncating and taking rev-comps
  • Loading branch information
dkoslicki committed Mar 20, 2020
1 parent 57f346d commit 4915e29
Showing 1 changed file with 63 additions and 17 deletions.
80 changes: 63 additions & 17 deletions tests/unit_tests/test_Query.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
# create some test data
# First, the TST
# seq1 is a mixed-base sequence; seq2 (poly-T) and seq4 (poly-A) are reverse
# complements of each other, and seq3 is an alternating AT repeat, so the
# k-mers (and reverse-complement k-mers) these sketches contain are easy to
# predict in the assertions below.
seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG"
seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT"
seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
# One sketch per sequence. Each estimator is built exactly once (ksize=5);
# the earlier ksize=3 constructions were immediately overwritten and are gone.
CE1 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y')
CE2 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y')
CE3 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y')
CE4 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y')
CE1.add_sequence(seq1)
CE2.add_sequence(seq2)
CE3.add_sequence(seq3)
Expand Down Expand Up @@ -50,28 +50,35 @@
# Reserve a real temporary file for the serialized TST. tempfile.mktemp() only
# returns a *name* (deprecated, race-prone); NamedTemporaryFile(delete=False)
# atomically creates the file, and we close it so tree.save() can overwrite it.
with tempfile.NamedTemporaryFile(delete=False) as _tst_handle:
    temp_TST_file = _tst_handle.name
tree.save(temp_TST_file)

# TODO: marisa_trie has an issue with single character prefix lookups
# TODO: see https://github.com/pytries/marisa-trie/issues/55
# TODO: so set k-range above that
# Shared k-mer sizes for every test below; deliberately excludes k=1.
k_range = [2, 3, 5]

# Create module tests
def test_initialize_Create():
    """Check that ``Create`` stores the ``k_range`` and ``TST_file`` it was given.

    Uses the module-level ``k_range`` rather than a hard-coded ``[1, 3, 5]``:
    k=1 is excluded on purpose because of the marisa_trie single-character
    prefix lookup issue noted where ``k_range`` is defined.
    """
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    assert C.k_range == k_range
    assert C.TST_file == temp_TST_file

def test_Create_import_TST():
    """Check that ``import_TST`` loads the saved trie with the expected keys.

    Verifies that prefixes of the poly-A / poly-T / alternating-AT test
    sequences (and hence their reverse complements) are present, and that the
    keys found for the poly-A and poly-T prefixes carry the expected location
    suffix.
    """
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    C.import_TST()
    # make sure the k-mers and their reverse complements have been added
    assert C.tree.keys("AAA")
    assert C.tree.keys("TTT")
    assert C.tree.keys("ATA")
    assert C.tree.keys("TAT")
    # Make sure the k-mers are associated with the correct location info.
    # Keys have the form "<kmer>x<i>x<j>"; presumably <i> is the index of the
    # training sketch and <j> the position within it -- TODO confirm.
    for kmer in ["AAA", "TTT"]:
        # Look up the loop variable (previously "AAA" was queried on every
        # iteration, so the "TTT" keys were never actually checked).
        matches = C.tree.keys(kmer)
        for match in matches:
            location_info = "x".join(match.split('x')[1:])
            # poly-A and poly-T are reverse complements of each other, so
            # either of the two corresponding entries is acceptable here.
            assert (location_info == "3x0") or (location_info == "1x0")


def test_Create_BF_prefilter():
k_range = [1, 3, 5]
C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
C.import_TST()
C.create_BF_prefilter()
Expand All @@ -95,22 +102,61 @@ def test_Create_BF_prefilter():
assert kmer[0:k_size] in C.all_kmers_bf
assert khmer.reverse_complement(kmer[0:k_size]) in C.all_kmers_bf

# check if the BF is case insensitive
for CE in CEs:
for kmer in CE._kmers:
if kmer:
for k_size in k_range:
trunc_kmer = kmer[0:k_size]
trunc_kmer = trunc_kmer.lower()
assert trunc_kmer in C.all_kmers_bf
# khmer doesn't properly handle rev-comps of lower-case characters
# see https://github.com/dib-lab/khmer/issues/1904
assert khmer.reverse_complement(trunc_kmer.upper()).lower() in C.all_kmers_bf


# Counters module tests
def test_initialize_Counters():
    """Smoke test: ``Counters`` can be constructed from an imported TST and its bloom filter.

    Relies on the module-level ``k_range`` (which excludes k=1) instead of
    shadowing it with a local ``[1, 3, 5]``. The test passes if construction
    does not raise.
    """
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()
    Counters(tree=C.tree, k_range=k_range, all_kmers_bf=C.all_kmers_bf)

def test_Counters_return_matches():
    """Check ``Counters.return_matches`` against every truncated sketch k-mer.

    For each sketch in ``CEs``, each size in the module-level ``k_range`` and
    each saved k-mer, the truncated k-mer must be reported as a match, and
    every returned element must point at a sketch, k-size index and sketch
    position that really contain that k-mer (directly or via reverse
    complement).
    """
    C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range)
    C.import_TST()
    C.create_BF_prefilter()
    counters = Counters(tree=C.tree, k_range=k_range, all_kmers_bf=C.all_kmers_bf)

    # test the return matches on known k-mers
    # each sketch kmer (or its reverse complement) should match to the TST
    # TODO: big note here: proper way to check this: take the reverse complement, THEN truncate
    # (which effectively takes the suffix, as the suffix of a rev-comp is the prefix of the original)
    # but this calls into question how create_BF_prefilter is working since it truncates, THEN takes the revcomp
    # but this is the only way I could get all these tests to pass successfully
    for CE in CEs:
        for k_size in k_range:
            for kmer in CE._kmers:
                kmer = kmer[0:k_size]
                if not kmer:
                    # nothing to look up for an empty sketch slot
                    continue
                k_size_loc = k_range.index(len(kmer))
                to_return, saw_match = counters.return_matches(input_kmer=kmer, k_size_loc=k_size_loc)
                assert saw_match
                for to_return_elem in to_return:
                    # to_return_elem[0] is used as an index into CEs, i.e. the matched sketch
                    matched_kmers = CEs[to_return_elem[0]]._kmers
                    truncated_sketches = [x[0:k_size] for x in matched_kmers]
                    # add the reverse complements as well, since the TST return_matches matches to rev-comps as well
                    truncated_sketches_revcomp = [khmer.reverse_complement(x)[0:k_size] for x in matched_kmers]
                    # make sure the kmer really is in the sketch indicated by to_return, could be in the truncated or the rev-comp one
                    assert (kmer in truncated_sketches) or (kmer in truncated_sketches_revcomp)
                    # make sure the k_size_loc is correct
                    assert to_return_elem[1] == k_size_loc
                    # make sure it returned the correct location in the sketch
                    # note that at some smaller kmer values, it may appear in multiple locations, so just make sure
                    # that it appears somewhere in the list
                    indices = [i for i, x in enumerate(truncated_sketches) if x == kmer]
                    indices_revcomp = [i for i, x in enumerate(truncated_sketches_revcomp) if x == kmer]
                    assert (to_return_elem[2] in indices) or (to_return_elem[2] in indices_revcomp)


def test_Counters_process_seq():
pass
Expand Down

0 comments on commit 4915e29

Please sign in to comment.