Skip to content

Commit

Permalink
Helper function to load colabfold a3m hit files
Browse files Browse the repository at this point in the history
  • Loading branch information
wukevin committed Feb 17, 2025
1 parent 172b666 commit a83b639
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions chai_lab/data/parsing/msas/a3m.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@
we do use expects a column of sequences in the same format that .a3m gives.
"""

import re
import string
from functools import lru_cache
from io import StringIO
from pathlib import Path
from typing import Final

import numba
import numpy as np

from chai_lab.data.parsing.fasta import Fasta, read_fasta
from chai_lab.data.residue_constants import residue_types_with_nucleotides_order

MAPPED_TOKEN_SKIP: Final[int] = -1
Expand Down Expand Up @@ -107,3 +111,20 @@ def tokenize_sequences_to_arrays(
out_deletions=out_deletions,
)
return out_sequences, out_deletions


def read_colabfold_a3m(fname: Path) -> dict[str, list[Fasta]]:
"""Returns mapping of MSA hits per identifier in the given a3m file.
The query line in each block of MSA hits is dropped.
"""
text = fname.read_text()
retval: dict[str, list[Fasta]] = {}
for block in text.split("\x00"): # Splits on null byte
if not block:
continue
strio = StringIO(block)
query, *hits = read_fasta(strio)
assert re.match(r"^[0-9]{3}$", query.header)
retval[query.header] = hits
return retval

0 comments on commit a83b639

Please sign in to comment.