Merge remote-tracking branch 'origin/Retraining' into ModularizeMake for #29 to test in production environment (server)
dkoslicki committed Mar 24, 2020
2 parents 0f3da00 + d0f8bc0 commit d4ae9f3
Showing 2 changed files with 113 additions and 0 deletions.
28 changes: 28 additions & 0 deletions tutorials/retraining/retrain_CMash.sh
@@ -0,0 +1,28 @@
#!/bin/bash
#set -e
# this script will demonstrate how to re-train CMash on a custom database

trainingFiles="training_files.txt" # file listing the full paths to the training genomes (fasta/q, compressed or not)
numThreads=8 # number of threads to utilize for the re-training
cmashBaseName="cmash_db_n1000_k60" # base name you wish for the output training database
outputDir="test" # directory the results will be placed into. THIS DIRECTORY MUST ALREADY EXIST # FIXME
kmerRange="30-60-10" # range of k-mer sizes you want to be able to query with. This translates to [30, 40, 50, 60]


# don't change these unless you know what you're doing
cmashDatabase="${cmashBaseName}.h5"
cmashDump="${cmashBaseName}_dump.fa"
prefilterName="${cmashBaseName}_${kmerRange}.bf"
maxKsize=$(echo ${kmerRange} | cut -d'-' -f2) # largest k-mer size in the range, e.g. 60 from "30-60-10"
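# Illustration only (not used below): expand the "start-end-increment" range into the
# individual k-mer sizes it represents, e.g. 30-60-10 -> 30 40 50 60
# kmerSizes=$(seq $(echo ${kmerRange} | cut -d'-' -f1) $(echo ${kmerRange} | cut -d'-' -f3) $(echo ${kmerRange} | cut -d'-' -f2))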
numHashes=1000


# re-train CMash
echo "re-training CMash"
rm ${outputDir}/${cmashDatabase} 2> /dev/null # remove any previous database so retraining starts fresh
python ../../scripts/MakeStreamingDNADatabase.py ${trainingFiles} ${outputDir}/${cmashDatabase} -n ${numHashes} -k ${maxKsize} -v

# make streaming pre-filter
echo "making streaming pre-filter"
rm ${outputDir}/${prefilterName} 2> /dev/null
python ../../scripts/MakeStreamingPrefilter.py ${outputDir}/${cmashDatabase} ${outputDir}/${prefilterName} ${kmerRange}
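
# Example usage (a sketch; the query script name is an assumption, check your CMash checkout):
#   mkdir -p test          # the output directory must exist (see the FIXME above)
#   bash retrain_CMash.sh
# The retrained database and pre-filter can then be used for querying, e.g. with CMash's
# StreamingQueryDNADatabase.py (run it with -h for its exact CLI).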
85 changes: 85 additions & 0 deletions tutorials/retraining/retrain_and_test_metalign.sh
@@ -0,0 +1,85 @@
#!/bin/bash
#set -e
# A basic end-to-end workflow to make sure things are working correctly (especially with respect to integration with CMash)
# basic steps:
# make small training database
# re-train CMash
# make a mock community
# use metalign to profile the mock community (results should be two strains corresponding to the two selected organisms)

# requires bbmap: https://sourceforge.net/projects/bbmap/

bbmapRandomReads="~/Documents/bbmap/randomreads.sh" # the quoted tilde does not expand here; the eval below expands it
trainingFiles="training_files.txt"
numThreads=8
cmashBaseName="cmash_db_n1000_k60"
cmashDatabase="${cmashBaseName}.h5"
cmashDump="${cmashBaseName}_dump.fa"
prefilterName="cmash_filter_n1000_k60_30-60-10.bf"
bbmapRef='mock_reference.fa'
mockCommunity="mock_community.fa"



# copy some of the training data over
echo "getting organism files"
rm -rf organism_files 2> /dev/null
mkdir organism_files
find ../data/organism_files/ -type f -name "*.fna.gz" | head -n 25 | xargs -I{} cp {} organism_files/

# Pull off the taxonomy information
echo "getting taxonomy"
rm db_info.txt 2> /dev/null
rm headers.txt 2> /dev/null
for file in $(find organism_files/ -type f -name "*.fna.gz")
do
zcat ${file} | grep '^>' | cut -d' ' -f1 | sed 's/>//g' >> headers.txt # pull the accessions off the header lines
done
LC_ALL=C grep -F -w -f headers.txt ../data/db_info.txt > db_info.txt # look up the taxonomy info for those accessions (LC_ALL=C speeds up the fixed-string grep)
rm headers.txt

# store the names of the training files
rm $trainingFiles 2> /dev/null
fullPath="$(pwd)/organism_files"
find $fullPath -type f -name "*.fna.gz" > ${trainingFiles}
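# optional sanity check: the list should contain the 25 genomes copied above
# wc -l ${trainingFiles}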

# re-train CMash
echo "re-training CMash"
rm ${cmashDatabase} 2> /dev/null
python ../CMash/scripts/MakeStreamingDNADatabase.py ${trainingFiles} ${cmashDatabase} -n 1000 -k 60 #-v

# make streaming pre-filter
echo "making streaming pre-filter"
rm ${prefilterName} 2> /dev/null
python ../CMash/scripts/MakeStreamingPrefilter.py ${cmashDatabase} ${prefilterName} 30-60-10

# dump all the k-mers in the new training database
echo "dumping training k-mers"
rm ${cmashDump} 2> /dev/null
python dump_kmers.py ${cmashDatabase} ${cmashDump}

# do the kmc counting of the k-mers
echo "running kmc"
rm "${cmashBaseName}.kmc_pre" 2> /dev/null
rm "${cmashBaseName}.kmc_suf" 2> /dev/null
# count all the k-mers in the training database, -ci0 means include them all, -cs3 use small counters
../KMC/bin/kmc -v -k60 -fa -ci0 -cs3 -t"${numThreads}" -jlogsample ${cmashDump} "${cmashBaseName}_dump" .
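# optional sanity check (assumes KMC's kmc_dump utility is built in ../KMC/bin):
# ../KMC/bin/kmc_dump "${cmashBaseName}_dump" kmer_counts.txt && head kmer_counts.txt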


# Next, make a silly mock community
echo "mock community organisms:"
head -n 2 ${trainingFiles}
rm ${bbmapRef} "${bbmapRef}_temp" 2> /dev/null # remove leftovers from previous runs (the temp file is appended to below)
head -n 2 ${trainingFiles} | xargs -I{} zcat {} >> "${bbmapRef}_temp"
# linearize the FASTA: put each record's sequence on a single line (tail -n +2 drops the leading blank line)
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < "${bbmapRef}_temp" | tail -n +2 > "${bbmapRef}"
rm "${bbmapRef}_temp"
eval ${bbmapRandomReads} ref=${bbmapRef} out="${mockCommunity}_temp.fa" adderrors=false simplenames=t \
	minlength=250 maxlength=250 coverage=20 maxsnps=0 maxinss=0 maxdels=0 maxsubs=0 maxns=0 \
	maxinslen=0 maxdellen=0 maxsublen=0 maxnlen=0 mininslen=0 mindellen=0 minsublen=0 minnlen=0
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < "${mockCommunity}_temp.fa" | tail -n +2 > "${mockCommunity}"
rm "${mockCommunity}_temp.fa"

# Run metalign on the new data
#cd ..
#python metalign.py local_tests/mock_community.fa local_tests/ --output local_tests/test_out.csv --verbose --keep_temp_files --temp_dir local_tests/metalign_temp

# then take a look at local_tests/test_out.csv and make sure everything worked. It should show two strains corresponding to the ones used to form the mock community
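# a quick way to eyeball the resulting profile (column and less are standard Unix tools):
# column -t -s, local_tests/test_out.csv | less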
