Skip to content

Commit

Permalink
Update benchmark code, and transcripts to latest version
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Mar 21, 2023
1 parent cc35efb commit fbc8909
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ It works by:
* Converting RefSeq/Ensembl GTFs to JSON
* Providing loaders for the HGVS libraries, via JSON.gz files or a REST API (via [cdot_rest](https://github.com/SACGF/cdot_rest))

We currently support ~893k transcripts (vs ~141k in UTA v.20210129)
We currently support ~905k transcripts (vs ~141k in UTA v.20210129)

## New

Expand Down
2 changes: 1 addition & 1 deletion cdot/hgvs/dataproviders/fasta_seqfetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ChainedSeqFetcher:
This is useful if you want to use FastaSeqFetcher (below) as a fallback if SeqFetcher fails
seq_fetcher = ChainedSeqFetcher(SeqFetcher(), FastaSeqFetcher(fasta_filename))
seqfetcher = ChainedSeqFetcher(SeqFetcher(), FastaSeqFetcher(fasta_filename))
"""

def __init__(self, *args):
Expand Down
23 changes: 18 additions & 5 deletions tests/benchmark_hgvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from hgvs.assemblymapper import AssemblyMapper
from hgvs.exceptions import HGVSDataNotAvailableError, HGVSInvalidVariantError

from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider
from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider, FastaSeqFetcher


def handle_args():
Expand All @@ -24,6 +24,7 @@ def handle_args():
group.add_argument('--rest', action='store_true')
group.add_argument('--rest-insecure', action='store_true')
parser.add_argument('--json', help='JSON file')
parser.add_argument('--fasta', help='Fasta file for local sequences')
args = parser.parse_args()
if not any([args.uta, args.rest, args.rest_insecure, args.json]):
parser.error("You need to specify at least one of 'uta', 'rest', 'rest-insecure', 'json'")
Expand All @@ -41,17 +42,22 @@ def main():
total = len(hgvs_g_c_list)
print(f"Using {total} test records")

seqfetcher = None
if args.fasta:
seqfetcher = FastaSeqFetcher(args.fasta)

if args.uta:
hdp = hgvs.dataproviders.uta.connect()
elif args.rest:
hdp = RESTDataProvider() # Uses API server at cdot.cc
hdp = RESTDataProvider(seqfetcher=seqfetcher) # Uses API server at cdot.cc
elif args.rest_insecure:
hdp = RESTDataProvider(secure=False, seqfetcher=seqfetcher)
elif args.json:
hdp = JSONDataProvider([args.json])
elif args.json_insecure:
hdp = JSONDataProvider([args.json], secure=False)
hdp = JSONDataProvider([args.json], seqfetcher=seqfetcher)
else:
raise ValueError("Unknown data provider method!")

print("Starting benchmark...")
am = AssemblyMapper(hdp,
assembly_name='GRCh38',
alt_aln_method='splign', replace_reference=True)
Expand All @@ -62,6 +68,7 @@ def main():
correct = 0
incorrect = 0
no_data = 0
total_start = time.time()
for hgvs_g, hgvs_c in hgvs_g_c_list:
start = time.time()
try:
Expand Down Expand Up @@ -90,10 +97,16 @@ def main():
time_taken = end - start
run_times.append(time_taken)

total_end = time.time()

print(f"Total: {total}, correct: {correct}, incorrect: {incorrect}, no data: {no_data}")
df = pd.DataFrame(run_times)
print(df.describe())

total_time = total_end - total_start
num_per_second = 1/total_time * total
print(f"{total} in {total_time} = {num_per_second} per second")


if __name__ == '__main__':
main()
Expand Down

0 comments on commit fbc8909

Please sign in to comment.