Skip to content

Commit

Permalink
create script to generate estimate and ground truth outputs #14
Browse files Browse the repository at this point in the history
  • Loading branch information
dkoslicki committed Mar 27, 2020
1 parent fe0643a commit 9a94f77
Showing 1 changed file with 73 additions and 0 deletions.
73 changes: 73 additions & 0 deletions tests/script_tests/run_comparison_to_ground_truth.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash

# This script compares the results of CMash against brute-force-calculated containment indices.
# WARNING: this uses a LOT of memory and is CPU intensive as well, so use with caution (preferably on a server).

# Resolve the repository root: two directory levels above the current working
# directory. Everything is quoted so a checkout path containing spaces survives.
parentDir="$(dirname "$(dirname "$PWD")")"

# In case you have multiple versions installed (e.g. Metalign as well as CMash),
# make sure python looks in this checkout first. The ${PYTHONPATH:+...} form only
# adds the ':' separator when PYTHONPATH is already set and non-empty, so an
# unset variable no longer leaves a dangling ':' (an empty path entry).
export PYTHONPATH="${parentDir}${PYTHONPATH:+:${PYTHONPATH}}"

# Make sure the correct CMash is being pulled from: ask python which file it
# actually imports CMash.MinHash from and compare it with this checkout's copy.
# If the import fails, testFile is empty and the comparison below fails too.
testFile="$(python -c "from CMash import MinHash as MH; print(MH.__file__)")"
correctFile="${parentDir}/CMash/MinHash.py"
if [[ "$testFile" == "$correctFile" ]]; then
  echo "Files are correct"
else
  echo "Files are not correct"
  # Show both paths on stderr so the mismatch can be diagnosed.
  printf 'expected: %s\nfound:    %s\n' "$correctFile" "$testFile" >&2
  exit 1
fi

# ---- Parameters shared by the steps below ----

# Query sample handed to both the CMash query script and GroundTruth.py.
testOrganism='../Organisms/taxid_1192839_4_genomic.fna.gz'

# maxK is passed as -k when building the training database and also forms the
# middle field of kSizes (presumably a start-end-step range; confirm against
# the query script's argument parser).
maxK=5
kSizes="4-$maxK-1"

# Passed as -n when building the training database.
numHashes=10

# Passed as -c and -l respectively to the query and ground-truth commands.
containmentThresh='.01'
locationOfThresh='-1'

# Locations inside the repository, relative to parentDir (set earlier).
scriptsDir="$parentDir/scripts"
modulesDir="$parentDir/CMash"

# make the training database
echo "Training on data"
# Remove the outputs of any earlier run so the existence check below cannot be
# satisfied by stale files. -f stays quiet when a file is absent but, unlike
# redirecting stderr to /dev/null, still reports real failures.
rm -f -- TrainingDatabase.h5 TrainingDatabase.tst
# /usr/bin/time passes through the exit status of the command it runs, so a
# failing python run is treated as a failure even if it left partial output
# files behind. Both output files must also exist afterwards.
if /usr/bin/time python "${scriptsDir}/MakeStreamingDNADatabase.py" filenames.txt TrainingDatabase.h5 -n "${numHashes}" -k "${maxK}" \
  && [[ -f TrainingDatabase.h5 && -f TrainingDatabase.tst ]]; then
  echo "Training file successfully created"
else
  echo "SOMETHING WENT WRONG!!!!"
  exit 1
fi

echo "Classifying sample with CMash"
outName="est_results.csv"
# Drop any result from a previous run so the existence check below reflects
# this run only; -f keeps rm quiet if the file is absent.
rm -f -- "${outName}"
# Query the training database with the test organism in --sensitive mode and
# write the estimates to ${outName}. Expansions are quoted so paths containing
# spaces reach python as single arguments.
if /usr/bin/time python "${scriptsDir}/StreamingQueryDNADatabase.py" "${testOrganism}" TrainingDatabase.h5 "${outName}" "${kSizes}" --sensitive -l "${locationOfThresh}" -c "${containmentThresh}" \
  && [[ -f "${outName}" ]]; then
  echo "sensitive classify successful"
  cat -- "${outName}"
else
  # Reached when python exits non-zero or the output file was not produced.
  echo "SOMETHING WENT WRONG!!!!"
  exit 1
fi

echo "Computing ground truth containment indicies"
outName="true_results.csv"
# Drop any result from a previous run so the existence check below reflects
# this run only; -f keeps rm quiet if the file is absent.
rm -f -- "${outName}"
# Run GroundTruth.py on the same organism, database, k-sizes and thresholds as
# the CMash query so the two result files can be compared.
if /usr/bin/time python "${modulesDir}/GroundTruth.py" "${testOrganism}" TrainingDatabase.h5 "${outName}" "${kSizes}" -l "${locationOfThresh}" -c "${containmentThresh}" \
  && [[ -f "${outName}" ]]; then
  # The original message here was copy-pasted from the classify step and
  # mislabelled this step's output.
  echo "ground truth computation successful"
  cat -- "${outName}"
else
  # Reached when python exits non-zero or the output file was not produced.
  echo "SOMETHING WENT WRONG!!!!"
  exit 1
fi

0 comments on commit 9a94f77

Please sign in to comment.