From 4915e29c5f0c3f4c0e31a918ebf2204be049666d Mon Sep 17 00:00:00 2001 From: dkoslicki Date: Thu, 19 Mar 2020 21:18:17 -0700 Subject: [PATCH] added a bunch of unit tests for #14. Still some concerns about order of truncating and taking rev-comps --- tests/unit_tests/test_Query.py | 80 ++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 17 deletions(-) diff --git a/tests/unit_tests/test_Query.py b/tests/unit_tests/test_Query.py index 5d93cfb..7024da9 100644 --- a/tests/unit_tests/test_Query.py +++ b/tests/unit_tests/test_Query.py @@ -12,13 +12,13 @@ # create some test data # First, the TST seq1 = "ATCGTATGAGTATCGTCGATGCATGCATCGATGCATGCTACGTATCGCATGCATG" -seq2 = "ATCTACTCAACATTAACTACTCATATTAACTCACATTCATATCCATACTACTCGT" -seq3 = "ACTCATGTTAGATCGATATTGACTGATGACTCGTTGCACTGCATGCTGCATGATGC" +seq2 = "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT" +seq3 = "ATATATATATATATATATATATATATATATATATATATATATATATATATATATAT" seq4 = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" -CE1 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=3, save_kmers='y') -CE2 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=3, save_kmers='y') -CE3 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=3, save_kmers='y') -CE4 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=3, save_kmers='y') +CE1 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y') +CE2 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y') +CE3 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y') +CE4 = MH.CountEstimator(n=5, max_prime=9999999999971, ksize=5, save_kmers='y') CE1.add_sequence(seq1) CE2.add_sequence(seq2) CE3.add_sequence(seq3) @@ -50,28 +50,35 @@ temp_TST_file = tempfile.mktemp() tree.save(temp_TST_file) +# TODO: marisa_trie has an issue with single character prefix lookups +# TODO: see https://github.com/pytries/marisa-trie/issues/55 +# TODO: so set k-range above that +k_range = [2, 3, 5] + # Create 
module tests def test_initialize_Create(): - C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=[1, 3, 5]) - assert C.k_range == [1, 3, 5] + C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range) + assert C.k_range == k_range assert C.TST_file == temp_TST_file pass def test_Create_import_TST(): - C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=[1, 3, 5]) + C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range) C.import_TST() # make sure the k-mers and their reverse complements have been added assert C.tree.keys("AAA") assert C.tree.keys("TTT") - assert C.tree.keys("ACT") - assert C.tree.keys("AGT") + assert C.tree.keys("ATA") + assert C.tree.keys("TAT") # Make sure the correct kmers are being identified with the correct k-mers - assert C.tree.keys("AAA")[0] == "AAAx3x0" - assert C.tree.keys("TTT")[0] == "TTTx3x0" - pass + for kmer in ["AAA", "TTT"]: + matches = C.tree.keys(kmer) + for match in matches: + location_info = "x".join(match.split('x')[1:]) + assert (location_info == "3x0") or (location_info == "1x0") + def test_Create_BF_prefilter(): - k_range = [1, 3, 5] C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range) C.import_TST() C.create_BF_prefilter() @@ -95,22 +102,61 @@ def test_Create_BF_prefilter(): assert kmer[0:k_size] in C.all_kmers_bf assert khmer.reverse_complement(kmer[0:k_size]) in C.all_kmers_bf + # check if the BF is case insensitive + for CE in CEs: + for kmer in CE._kmers: + if kmer: + for k_size in k_range: + trunc_kmer = kmer[0:k_size] + trunc_kmer = trunc_kmer.lower() + assert trunc_kmer in C.all_kmers_bf + # khmer doesn't properly handle rev-comps of lower-case characters + # see https://github.com/dib-lab/khmer/issues/1904 + assert 
khmer.reverse_complement(trunc_kmer.upper()).lower() in C.all_kmers_bf + # Counters module tests def test_initialize_Counters(): - k_range = [1, 3, 5] C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range) C.import_TST() C.create_BF_prefilter() counters = Counters(tree=C.tree, k_range=k_range, all_kmers_bf=C.all_kmers_bf) def test_Counters_return_matches(): - k_range = [1, 3, 5] C = Create(training_database_file=temp_database_file, bloom_filter_file="", TST_file=temp_TST_file, k_range=k_range) C.import_TST() C.create_BF_prefilter() counters = Counters(tree=C.tree, k_range=k_range, all_kmers_bf=C.all_kmers_bf) + # test the return matches on known k-mers + # each sketch kmer (or its reverse complement) should match to the TST + # TODO: big note here: proper way to check this: take the reverse complement, THEN truncate + # (which effectively takes the suffix, as the suffix of a rev-comp is the prefix of the original) + # but this calls into question how create_BF_prefilter is working since it truncates, THEN takes the revcomp + # but this is the only way I could get all these tests to pass successfully + for CE in CEs: + for k_size in k_range: + for kmer in CE._kmers: + kmer = kmer[0:k_size] + if kmer: + k_size_loc = k_range.index(len(kmer)) + to_return, saw_match = counters.return_matches(input_kmer=kmer, k_size_loc=k_size_loc) + assert saw_match + for to_return_elem in to_return: + truncated_sketches = list(map(lambda x: x[0:k_size], CEs[to_return_elem[0]]._kmers)) + # add the reverse complements as well, since the TST return_matches matches to rev-comps as well + truncated_sketches_revcomp = list(map(lambda x: khmer.reverse_complement(x)[0:k_size], CEs[to_return_elem[0]]._kmers)) + # make sure the kmer really is in the sketch indicated by to_return, could be in the truncated or the rev-comp one + assert (kmer in truncated_sketches) or (kmer in truncated_sketches_revcomp) + # make sure the k_size_loc is 
correct + assert to_return_elem[1] == k_size_loc + # make sure it returned the correct location in the sketch + # note that at some smaller kmer values, it may appear in multiple locations, so just make sure + # that it appears somewhere in the list + indices = [i for i, x in enumerate(truncated_sketches) if x == kmer] + indices_revcomp = [i for i, x in enumerate(truncated_sketches_revcomp) if x == kmer] + assert (to_return_elem[2] in indices) or (to_return_elem[2] in indices_revcomp) + def test_Counters_process_seq(): pass