Helper function to load colabfold a3m hit files

chaidiscovery · Feb 17, 2025 · a83b639 · a83b639
1 parent 172b666
commit a83b639
Showing 1 changed file with 21 additions and 0 deletions.
diff --git a/chai_lab/data/parsing/msas/a3m.py b/chai_lab/data/parsing/msas/a3m.py
@@ -9,13 +9,17 @@
 we do use expects a column of sequences in the same format that .a3m gives.
 """
 
+import re
 import string
 from functools import lru_cache
+from io import StringIO
+from pathlib import Path
 from typing import Final
 
 import numba
 import numpy as np
 
+from chai_lab.data.parsing.fasta import Fasta, read_fasta
 from chai_lab.data.residue_constants import residue_types_with_nucleotides_order
 
 MAPPED_TOKEN_SKIP: Final[int] = -1
@@ -107,3 +111,20 @@ def tokenize_sequences_to_arrays(
         out_deletions=out_deletions,
     )
     return out_sequences, out_deletions
+
+
+def read_colabfold_a3m(fname: Path) -> dict[str, list[Fasta]]:
+    """Returns mapping of MSA hits per identifier in the given a3m file.
+
+    The query line in each block of MSA hits is dropped.
+    """
+    text = fname.read_text()
+    retval: dict[str, list[Fasta]] = {}
+    for block in text.split("\x00"):  # Splits on null byte
+        if not block:
+            continue
+        strio = StringIO(block)
+        query, *hits = read_fasta(strio)
+        assert re.match(r"^[0-9]{3}$", query.header)
+        retval[query.header] = hits
+    return retval