diff --git a/README.md b/README.md index ebc388e..df82631 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ If you are hosting your own ColabFold server, additionally pass the `--msa-serve chai fold --use-msa-server --msa-server-url "https://api.internalcolabserver.com" input.fasta output_folder ``` +We also provide additional utility functions for tasks such as MSA file format conversion; see `chai --help` for details. + ### Programmatic inference The main entrypoint into the Chai-1 folding code is through the `chai_lab.chai1.run_inference` function. The following script demonstrates how to programmatically provide inputs to the model, and obtain a list of PDB files for downstream analysis: @@ -51,7 +53,7 @@ The main entrypoint into the Chai-1 folding code is through the `chai_lab.chai1. python examples/predict_structure.py ``` -To get the best performance, we recommend running the model with MSAs. The following script demonstrates how to provide MSAs to the model by calling out to an MSA server: +To get the best performance, we recommend running the model with MSAs. The following script demonstrates how to provide MSAs to the model. 
```shell python examples/msas/predict_with_msas.py diff --git a/chai_lab/data/parsing/msas/aligned_pqt.py b/chai_lab/data/parsing/msas/aligned_pqt.py index a9a2cee..80a5679 100644 --- a/chai_lab/data/parsing/msas/aligned_pqt.py +++ b/chai_lab/data/parsing/msas/aligned_pqt.py @@ -9,7 +9,7 @@ import logging from functools import lru_cache from pathlib import Path -from typing import Literal, Mapping +from typing import Literal, Mapping, Optional import pandas as pd import pandera as pa @@ -174,7 +174,7 @@ def merge_multi_a3m_to_aligned_dataframe( msa_a3m_files: Mapping[Path, MSADataSource], insert_keys_for_sources: Literal["all", "none", "uniprot"] = "uniprot", ) -> pd.DataFrame: - """Merge multiple a3m files into a single aligned parquet file.""" + """Merge multiple a3ms from the same query sequence into a single aligned parquet.""" dfs = { src: a3m_to_aligned_dataframe( a3m_path, @@ -198,10 +198,10 @@ def merge_multi_a3m_to_aligned_dataframe( return pd.concat(chunks, ignore_index=True).reset_index(drop=True) -def _merge_files_in_directory(directory: str): +def merge_a3m_in_directory(directory: str, output_directory: Optional[str] = None): """Finds .a3m files in a directory and combine them into a single aligned.pqt file. Files are expected to be named like hits_uniref90.a3m (uniref90 is the source database). - All files in the directoroy are assumed to be derived from the same query sequence. + All files in the directory are assumed to be derived from the same query sequence. Provided as a example commandline interface to merge files. """ @@ -226,7 +226,10 @@ def _merge_files_in_directory(directory: str): ) # Get the query sequence and use it to determine where we save the file. 
query_seq: str = df.iloc[0]["sequence"] - df.to_parquet(dir_path / expected_basename(query_seq)) + # Default to writing into the same directory if output directory isn't specified + outdir = Path(output_directory) if output_directory is not None else dir_path + outdir.mkdir(exist_ok=True, parents=True) + df.to_parquet(outdir / expected_basename(query_seq)) if __name__ == "__main__": @@ -234,4 +237,4 @@ def _merge_files_in_directory(directory: str): logging.basicConfig(level=logging.INFO) - typer.run(_merge_files_in_directory) + typer.run(merge_a3m_in_directory) diff --git a/chai_lab/main.py b/chai_lab/main.py index 8132d65..9714492 100644 --- a/chai_lab/main.py +++ b/chai_lab/main.py @@ -9,6 +9,7 @@ import typer from chai_lab.chai1 import run_inference +from chai_lab.data.parsing.msas.aligned_pqt import merge_a3m_in_directory logging.basicConfig(level=logging.INFO) @@ -35,6 +36,10 @@ def citation(): def cli(): app = typer.Typer() app.command("fold", help="Run Chai-1 to fold a complex.")(run_inference) + app.command( + "a3m-to-pqt", + help="Convert all a3m files in a directory for a *single sequence* into an aligned parquet file", + )(merge_a3m_in_directory) app.command("citation", help="Print citation information")(citation) app() diff --git a/examples/msas/README.md b/examples/msas/README.md index dd47f4a..9fc6605 100644 --- a/examples/msas/README.md +++ b/examples/msas/README.md @@ -24,7 +24,7 @@ See the following for a toy example of what this table might look like: | RKSES... | uniprot | Mus musculus | A mouse sequence from uniprot | | ... | -We additionally provide example code to parse `a3m` files into this format; see `merge_multi_a3m_to_aligned_dataframe` in `chai_lab/data/parsing/msas/aligned_pqt.py`. This file can also be run as a commandline script; run `python chai_lab/data/parsing/msas/aligned_pqt.py --help` for details. 
Note, however, that this code defaults to only parsing pairing keys based on species annotation in UniProt files; this follows the logic described in both AlphaFold3 and AlphaFold2 multimer. To specify pairing keys for different data sources, or to use something other than species as the pairing key, we encourage users to build their own parsing logic to create `.aligned.pqt` files. +We additionally provide example code to parse `a3m` files into this format; see `merge_multi_a3m_to_aligned_dataframe` in `chai_lab/data/parsing/msas/aligned_pqt.py`. This can also be run through the command-line interface; run `chai a3m-to-pqt --help` for details. Note, however, that this code defaults to only parsing pairing keys based on species annotation in UniProt files; this follows the logic described in both AlphaFold3 and AlphaFold2 multimer. To specify pairing keys for different data sources, or to use something other than species as the pairing key, we encourage users to build their own parsing logic to create `.aligned.pqt` files. ### TLDR