Skip to content

Commit

Permalink
Expose helper function to merge a3ms in a directory
Browse files Browse the repository at this point in the history
  • Loading branch information
wukevin committed Dec 9, 2024
1 parent f77f441 commit 15e2b38
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 6 deletions.
15 changes: 9 additions & 6 deletions chai_lab/data/parsing/msas/aligned_pqt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
from functools import lru_cache
from pathlib import Path
from typing import Literal, Mapping
from typing import Literal, Mapping, Optional

import pandas as pd
import pandera as pa
Expand Down Expand Up @@ -174,7 +174,7 @@ def merge_multi_a3m_to_aligned_dataframe(
msa_a3m_files: Mapping[Path, MSADataSource],
insert_keys_for_sources: Literal["all", "none", "uniprot"] = "uniprot",
) -> pd.DataFrame:
"""Merge multiple a3m files into a single aligned parquet file."""
"""Merge multiple a3ms from the same query sequence into a single aligned parquet."""
dfs = {
src: a3m_to_aligned_dataframe(
a3m_path,
Expand All @@ -198,10 +198,10 @@ def merge_multi_a3m_to_aligned_dataframe(
return pd.concat(chunks, ignore_index=True).reset_index(drop=True)


def _merge_files_in_directory(directory: str):
def merge_a3m_in_directory(directory: str, output_directory: Optional[str] = None):
"""Finds .a3m files in a directory and combine them into a single aligned.pqt file.
Files are expected to be named like hits_uniref90.a3m (uniref90 is the source database).
All files in the directoroy are assumed to be derived from the same query sequence.
All files in the directory are assumed to be derived from the same query sequence.
Provided as an example command-line interface to merge files.
"""
Expand All @@ -226,12 +226,15 @@ def _merge_files_in_directory(directory: str):
)
# Get the query sequence and use it to determine where we save the file.
query_seq: str = df.iloc[0]["sequence"]
df.to_parquet(dir_path / expected_basename(query_seq))
# Default to writing into the same directory if output directory isn't specified
outdir = Path(output_directory) if output_directory is not None else dir_path
outdir.mkdir(exist_ok=True, parents=True)
df.to_parquet(outdir / expected_basename(query_seq))


if __name__ == "__main__":
import typer

logging.basicConfig(level=logging.INFO)

typer.run(_merge_files_in_directory)
typer.run(merge_a3m_in_directory)
5 changes: 5 additions & 0 deletions chai_lab/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import typer

from chai_lab.chai1 import run_inference
from chai_lab.data.parsing.msas.aligned_pqt import merge_a3m_in_directory

logging.basicConfig(level=logging.INFO)

Expand All @@ -35,6 +36,10 @@ def citation():
def cli():
    """Build the Typer application, register its subcommands, and run it.

    Three subcommands are registered, in this order: ``fold``,
    ``a3m-to-pqt`` and ``citation``. The application is then invoked.
    """
    app = typer.Typer()

    # (command name, help text, handler) -- registration order is preserved.
    subcommands = (
        ("fold", "Run Chai-1 to fold a complex.", run_inference),
        (
            "a3m-to-pqt",
            "Convert a3m files for a *single sequence* into a aligned parquet file",
            merge_a3m_in_directory,
        ),
        ("citation", "Print citation information", citation),
    )
    for command_name, help_text, handler in subcommands:
        register = app.command(command_name, help=help_text)
        register(handler)

    app()

Expand Down

0 comments on commit 15e2b38

Please sign in to comment.