Skip to content

Commit

Permalink
add old method of making TST #29
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Mar 24, 2020
1 parent 27123d8 commit 5855656
Showing 1 changed file with 31 additions and 2 deletions.
33 changes: 31 additions & 2 deletions CMash/Make.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

class MakeTSTNew:
# Store the two file names this builder works with; nothing is opened or validated here.
def __init__(self, training_database_file_name: str, TST_export_file_name: str):
# NOTE(review): the next two lines differ only by the space after `self.`. In this diff
# view they look like the removed/added pair of a single whitespace fix rather than two
# separate statements -- confirm against the actual file.
self. training_database_file_name = training_database_file_name
self.training_database_file_name = training_database_file_name
# Export path later passed to `tree.save(...)` in make_TST.
self.TST_export_file_name = TST_export_file_name

@staticmethod
Expand Down Expand Up @@ -38,12 +38,41 @@ def yield_trie_items_to_insert_no_import(file_name):
temp_kmers = subgrp["kmers"][...]
kmers = [kmer.decode('utf-8') for kmer in temp_kmers]
for (kmer_index, kmer) in enumerate(kmers):
# add both the original k-mer and the reverse complement, as the MinHashes were created without reverse complement
if kmer:
yield kmer + 'x' + str(i) + 'x' + str(kmer_index) # format here is kmer+x+hash_index+kmer_index
# rev-comp kmer
kmer_rc = khmer.reverse_complement(kmer)
yield kmer_rc + 'x' + str(i) + 'x' + str(kmer_index) # format here is kmer+x+hash_index+kmer_index

# Build a trie from the items yielded for the training database file and save it to
# the export file name.
def make_TST(self):
# NOTE(review): the next two lines differ only by the space after `self.`. In this diff
# view they look like the removed/added pair of a single whitespace fix; if both were
# really executed the trie would simply be built twice -- confirm against the actual file.
tree = mt.Trie(self.yield_trie_items_to_insert_no_import(self. training_database_file_name))
tree = mt.Trie(self.yield_trie_items_to_insert_no_import(self.training_database_file_name))
# Persist the trie to the path given at construction time.
tree.save(self.TST_export_file_name)


class MakeTSTOld:
    """Build a trie from in-memory genome sketches and save it to disk.

    This is the older construction path: every key is collected into a set
    first and the trie is built from that set in one go.
    """

    def __init__(self, genome_sketches: list, TST_export_file_name: str):
        """Remember the sketches to index and where to write the trie.

        Args:
            genome_sketches: sequence of sketch objects; ``make_TST`` reads the
                ``_kmers`` attribute of each one.
            TST_export_file_name: file name passed to ``tree.save``.
        """
        self.genome_sketches = genome_sketches
        self.TST_export_file_name = TST_export_file_name

    # Must be a plain instance method: it reads ``self.genome_sketches`` and
    # ``self.TST_export_file_name``, which only exist on instances. Decorating
    # it with ``@classmethod`` binds ``self`` to the class and raises
    # AttributeError on the first attribute access.
    def make_TST(self):
        """Collect the trie keys for every sketch, build the trie and save it.

        For each non-empty k-mer two keys are inserted, one for the k-mer and
        one for its reverse complement, because the MinHash sketches were
        created without reverse complements. Empty k-mer slots are skipped
        entirely. Key format is ``kmer + 'x' + hash_index + 'x' + kmer_index``,
        where ``hash_index`` is the position of the sketch in
        ``genome_sketches`` and ``kmer_index`` the position of the k-mer
        inside that sketch.
        """
        to_insert = set()
        for i, sketch in enumerate(self.genome_sketches):
            for kmer_index, kmer in enumerate(sketch._kmers):
                # only insert the kmer if it's actually non-empty
                if not kmer:
                    continue
                # both orientations share the same location suffix
                suffix = 'x' + str(i) + 'x' + str(kmer_index)
                # normal kmer
                to_insert.add(kmer + suffix)
                # rev-comp kmer
                to_insert.add(khmer.reverse_complement(kmer) + suffix)

        # export the TST
        tree = mt.Trie(to_insert)
        tree.save(self.TST_export_file_name)

0 comments on commit 5855656

Please sign in to comment.