From 0860b558524c79b930a27a3afc6dee81739fbacc Mon Sep 17 00:00:00 2001
From: Lucas van Dijk <info@lucasvandijk.nl>
Date: Wed, 1 Jun 2022 17:30:58 -0400
Subject: [PATCH] Fix parse_straingst to support the new format

The function now checks whether we're reading a new-style StrainGST
file or not. Fixing this function also fixes
`straingr prepare-ref`.

Closes #11.
---
 docs/straingr.md         |  5 +++++
 src/strainge/io/utils.py | 26 ++++++++++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/docs/straingr.md b/docs/straingr.md
index f0d70a9..fd6786d 100644
--- a/docs/straingr.md
+++ b/docs/straingr.md
@@ -74,6 +74,11 @@ We specify the similarities.tsv file created at the StrainGST database construct
 step, to reuse the calculated k-mer similarities again for clustering. The resulting
 concatenated reference will be written to `refs_concat.fasta`.
 
+**New in version 1.3**: If you use the new split StrainGST output format introduced in 
+version 1.3, only specify the files listing the predicted strains. So, replace 
+`straingr prepare-ref -s path/to/straingst/*.tsv ...` with 
+`straingr prepare-ref -s path/to/straingst/*.strains.tsv ...`.
+
 #### 2. Align reads to the reference
 
 StrainGR is built to be used with `bwa mem`, as it uses the supplied 
diff --git a/src/strainge/io/utils.py b/src/strainge/io/utils.py
index 3453476..0b87999 100644
--- a/src/strainge/io/utils.py
+++ b/src/strainge/io/utils.py
@@ -33,6 +33,7 @@
 from typing import List, Iterable  # noqa
 from pathlib import Path
 from contextlib import contextmanager
+from itertools import chain
 
 
 @contextmanager
@@ -102,18 +103,31 @@ def parse_straingst(result_file, return_sample_stats=False):
 
     # Ignore comments
     result_file = (line for line in result_file if not line.startswith('#'))
+    first_line = next(result_file)
+
+    old_style_straingst = False
+    if first_line.startswith("sample"):
+        old_style_straingst = True
 
     # Collect sample statistics (first two lines)
-    sample_stats = [
-        next(result_file),
-        next(result_file)
-    ]
+    if old_style_straingst:
+        sample_stats = [
+            first_line,
+            next(result_file)
+        ]
+    else:
+        sample_stats = []
 
-    if return_sample_stats:
+    if sample_stats and return_sample_stats:
         sample_stats = next(csv.DictReader(sample_stats, delimiter='\t'))
 
         # Return sample statistics
         yield sample_stats
+    elif not sample_stats and return_sample_stats:
+        raise ValueError("Trying to read sample statistics from a new style (v1.3) StrainGST output!")
 
     # Return each strain found with its statistics
-    yield from csv.DictReader(result_file, delimiter='\t')
+    if old_style_straingst:
+        yield from csv.DictReader(result_file, delimiter='\t')
+    else:
+        yield from csv.DictReader(chain([first_line], result_file), delimiter='\t')