Fix parse_straingst to support the new format

The function now checks whether we're reading a new-style StrainGST file or not. Fixing this function also fixes `straingr prepare-ref`. Closes #11.
broadinstitute · Jun 1, 2022 · 0860b55 · 0860b55
1 parent d9c77b0
commit 0860b55
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 6 deletions.
diff --git a/docs/straingr.md b/docs/straingr.md
@@ -74,6 +74,11 @@ We specify the similarities.tsv file created at the StrainGST database construct
 step, to reuse the calculated k-mer similarities again for clustering. The resulting
 concatenated reference will be written to `refs_concat.fasta`.
 
+**New in version 1.3**: If you use the new split StrainGST output format introduced in 
+version 1.3, only specify the files listing the predicted strains. So, replace 
+`straingr prepare-ref -s path/to/straingst/*.tsv ...` with 
+`straingr prepare-ref -s path/to/straingst/*.strains.tsv ...`.
+
 #### 2. Align reads to the reference
 
 StrainGR is built to be used with `bwa mem`, as it uses the supplied 

diff --git a/src/strainge/io/utils.py b/src/strainge/io/utils.py
@@ -33,6 +33,7 @@
 from typing import List, Iterable  # noqa
 from pathlib import Path
 from contextlib import contextmanager
+from itertools import chain
 
 
 @contextmanager
@@ -102,18 +103,31 @@ def parse_straingst(result_file, return_sample_stats=False):
 
     # Ignore comments
     result_file = (line for line in result_file if not line.startswith('#'))
+    first_line = next(result_file)
+
+    old_style_straingst = False
+    if first_line.startswith("sample"):
+        old_style_straingst = True
 
     # Collect sample statistics (first two lines)
-    sample_stats = [
-        next(result_file),
-        next(result_file)
-    ]
+    if old_style_straingst:
+        sample_stats = [
+            first_line,
+            next(result_file)
+        ]
+    else:
+        sample_stats = []
 
-    if return_sample_stats:
+    if sample_stats and return_sample_stats:
         sample_stats = next(csv.DictReader(sample_stats, delimiter='\t'))
 
         # Return sample statistics
         yield sample_stats
+    elif not sample_stats and return_sample_stats:
+        raise ValueError("Trying to read sample statistics from a new style (v1.3) StrainGST output!")
 
     # Return each strain found with its statistics
-    yield from csv.DictReader(result_file, delimiter='\t')
+    if old_style_straingst:
+        yield from csv.DictReader(result_file, delimiter='\t')
+    else:
+        yield from csv.DictReader(chain([first_line], result_file), delimiter='\t')