From 91d01cde3fc5a52a952f3a1475941c3c40f795bd Mon Sep 17 00:00:00 2001 From: Kevin Wu Date: Mon, 17 Feb 2025 16:58:46 -0800 Subject: [PATCH] Cleanup (#311) --- chai_lab/data/parsing/fasta.py | 13 ++++++++----- chai_lab/data/parsing/msas/a3m.py | 8 +++++--- ..._chai.py => stage_colabfold_outputs_for_chai.py} | 3 ++- 3 files changed, 15 insertions(+), 9 deletions(-) rename scripts/{stage_for_chai.py => stage_colabfold_outputs_for_chai.py} (97%) diff --git a/chai_lab/data/parsing/fasta.py b/chai_lab/data/parsing/fasta.py index 8a26939..48b2bbd 100644 --- a/chai_lab/data/parsing/fasta.py +++ b/chai_lab/data/parsing/fasta.py @@ -5,7 +5,7 @@ import logging from io import StringIO from pathlib import Path -from typing import NamedTuple, Sequence +from typing import NamedTuple, Sequence, TextIO from chai_lab.data.parsing.structure.entity_type import EntityType from chai_lab.data.residue_constants import restype_1to3_with_x @@ -31,12 +31,15 @@ def fastas_to_str(fastas: Sequence[Fasta]) -> str: return "".join(f">{fasta.header}\n{fasta.sequence}\n" for fasta in fastas) -def read_fasta(file_path: str | Path | StringIO) -> list[Fasta]: +def read_fasta(file_path: str | Path) -> list[Fasta]: + with open(file_path) as source: + return read_fasta_content(source) + + +def read_fasta_content(content: StringIO | TextIO) -> list[Fasta]: from Bio import SeqIO - fasta_sequences = SeqIO.parse( - open(file_path) if isinstance(file_path, (str, Path)) else file_path, "fasta" - ) + fasta_sequences = SeqIO.parse(content, "fasta") return [Fasta(fasta.description, str(fasta.seq)) for fasta in fasta_sequences] diff --git a/chai_lab/data/parsing/msas/a3m.py b/chai_lab/data/parsing/msas/a3m.py index e6d90fc..d78c28a 100644 --- a/chai_lab/data/parsing/msas/a3m.py +++ b/chai_lab/data/parsing/msas/a3m.py @@ -19,7 +19,7 @@ import numba import numpy as np -from chai_lab.data.parsing.fasta import Fasta, read_fasta +from chai_lab.data.parsing.fasta import Fasta, read_fasta_content from chai_lab.data.residue_constants import residue_types_with_nucleotides_order MAPPED_TOKEN_SKIP: Final[int] = -1 @@ -124,8 +124,10 @@ def read_colabfold_a3m(fname: Path) -> dict[str, list[Fasta]]: if not block: continue strio = StringIO(block) - hits = read_fasta(strio) + hits = read_fasta_content(strio) assert len(hits) > 0 - assert re.match(r"^[0-9]{3}$", (query := hits[0].header)) + + query = hits[0].header + assert re.match(r"^[0-9]{3}$", query) retval[query] = hits return retval diff --git a/scripts/stage_for_chai.py b/scripts/stage_colabfold_outputs_for_chai.py similarity index 97% rename from scripts/stage_for_chai.py rename to scripts/stage_colabfold_outputs_for_chai.py index f4516b3..9899e98 100644 --- a/scripts/stage_for_chai.py +++ b/scripts/stage_colabfold_outputs_for_chai.py @@ -3,7 +3,8 @@ # See the LICENSE file for details. """ Given a output directory from a ColabFold run, traverses the directory structure and stage -the same MSA and templates to run through Chai1. +the same MSA and templates to run through Chai1. This is likely not applicable unless you are +incorporating Chai1 into adhoc analyses based on ColabFold outputs. Some minimal example: