From 0860b558524c79b930a27a3afc6dee81739fbacc Mon Sep 17 00:00:00 2001 From: Lucas van Dijk Date: Wed, 1 Jun 2022 17:30:58 -0400 Subject: [PATCH] Fix parse_straingst to support the new format The function now checks whether we're reading a new-style StrainGST file or not. Fixing this function also fixes `straingr prepare-ref`. Closes #11. --- docs/straingr.md | 5 +++++ src/strainge/io/utils.py | 26 ++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/docs/straingr.md b/docs/straingr.md index f0d70a9..fd6786d 100644 --- a/docs/straingr.md +++ b/docs/straingr.md @@ -74,6 +74,11 @@ We specify the similarities.tsv file created at the StrainGST database construct step, to reuse the calculated k-mer similarities again for clustering. The resulting concatenated reference will be written to `refs_concat.fasta`. +**New in version 1.3**: If you use the new split StrainGST output format introduced in +version 1.3, only specify the files listing the predicted strains. So, replace +`straingr prepare-ref -s path/to/straingst/*.tsv ...` with +`straingr prepare-ref -s path/to/straingst/*.strains.tsv ...`. + #### 2. 
Align reads to the reference StrainGR is built to be used with `bwa mem`, as it uses the supplied diff --git a/src/strainge/io/utils.py b/src/strainge/io/utils.py index 3453476..0b87999 100644 --- a/src/strainge/io/utils.py +++ b/src/strainge/io/utils.py @@ -33,6 +33,7 @@ from typing import List, Iterable # noqa from pathlib import Path from contextlib import contextmanager +from itertools import chain @contextmanager @@ -102,18 +103,31 @@ def parse_straingst(result_file, return_sample_stats=False): # Ignore comments result_file = (line for line in result_file if not line.startswith('#')) + first_line = next(result_file) + + old_style_straingst = False + if first_line.startswith("sample"): + old_style_straingst = True # Collect sample statistics (first two lines) - sample_stats = [ - next(result_file), - next(result_file) - ] + if old_style_straingst: + sample_stats = [ + first_line, + next(result_file) + ] + else: + sample_stats = [] - if return_sample_stats: + if sample_stats and return_sample_stats: sample_stats = next(csv.DictReader(sample_stats, delimiter='\t')) # Return sample statistics yield sample_stats + elif not sample_stats and return_sample_stats: + raise ValueError("Trying to read sample statistics from a new style (v1.3) StrainGST output!") # Return each strain found with its statistics - yield from csv.DictReader(result_file, delimiter='\t') + if old_style_straingst: + yield from csv.DictReader(result_file, delimiter='\t') + else: + yield from csv.DictReader(chain([first_line], result_file), delimiter='\t')