diff --git a/README.md b/README.md index df82631..bc35766 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ We provide a [web server](https://lab.chaidiscovery.com) so you can test the Cha

## Using experimental restraints -Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) for details. +Chai-1 uniquely offers the ability to fold complexes with user-specified "restraints" as inputs. These restraints specify inter-chain contacts or covalent bonds at various resolutions that are used to guide Chai-1 in folding the complex. See [restraints documentation](examples/restraints/README.md) and [covalent bond documentation](examples/covalent_bonds/README.md) for details.

diff --git a/chai_lab/chai1.py b/chai_lab/chai1.py index 672462b..f25ac9b 100644 --- a/chai_lab/chai1.py +++ b/chai_lab/chai1.py @@ -400,12 +400,17 @@ def make_all_atom_feature_context( if cov_a.numel() > 0 and cov_b.numel() > 0: orig_a, orig_b = merged_context.atom_covalent_bond_indices if orig_a.numel() == orig_b.numel() == 0: - merged_context.atom_covalent_bond_indices = (orig_a, orig_b) + merged_context.atom_covalent_bond_indices = (cov_a, cov_b) else: merged_context.atom_covalent_bond_indices = ( torch.concatenate([orig_a, cov_a]), torch.concatenate([orig_b, cov_b]), ) + assert ( + merged_context.atom_covalent_bond_indices[0].numel() + == merged_context.atom_covalent_bond_indices[1].numel() + > 0 + ) else: restraint_context = RestraintContext.empty() diff --git a/chai_lab/data/dataset/structure/all_atom_structure_context.py b/chai_lab/data/dataset/structure/all_atom_structure_context.py index 1a2d432..0b88cf1 100644 --- a/chai_lab/data/dataset/structure/all_atom_structure_context.py +++ b/chai_lab/data/dataset/structure/all_atom_structure_context.py @@ -105,7 +105,7 @@ def report_bonds(self) -> None: res_idx_b = self.token_residue_index[tok_b] resname_a = tensorcode_to_string(self.token_residue_name[tok_a]) resname_b = tensorcode_to_string(self.token_residue_name[tok_b]) - logging.info( + logger.info( f"Bond {i} (asym res_idx resname): {asym_a} {res_idx_a} {resname_a} <> {asym_b} {res_idx_b} {resname_b}" ) diff --git a/chai_lab/data/dataset/structure/bond_utils.py b/chai_lab/data/dataset/structure/bond_utils.py index 3830643..e925f7f 100644 --- a/chai_lab/data/dataset/structure/bond_utils.py +++ b/chai_lab/data/dataset/structure/bond_utils.py @@ -2,6 +2,7 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for details. + import torch from einops import rearrange from torch import Tensor @@ -129,7 +130,7 @@ def get_atom_covalent_bond_pairs_from_constraints( pass case _: raise ValueError(f"Unrecognized pariwise interaction: {ctype}") - return torch.tensor(ret_a, dtype=torch.int), torch.tensor(ret_b, dtype=torch.int) + return torch.tensor(ret_a, dtype=torch.long), torch.tensor(ret_b, dtype=torch.long) @typecheck diff --git a/examples/glycosylation/1ac5.fasta b/examples/covalent_bonds/1ac5.fasta similarity index 100% rename from examples/glycosylation/1ac5.fasta rename to examples/covalent_bonds/1ac5.fasta diff --git a/examples/glycosylation/bonds.restraints b/examples/covalent_bonds/1ac5.restraints similarity index 100% rename from examples/glycosylation/bonds.restraints rename to examples/covalent_bonds/1ac5.restraints diff --git a/examples/covalent_bonds/8cyo.fasta b/examples/covalent_bonds/8cyo.fasta new file mode 100644 index 0000000..d4d95b6 --- /dev/null +++ b/examples/covalent_bonds/8cyo.fasta @@ -0,0 +1,4 @@ +>protein|8cyo-protein +MKKGHHHHHHGAISLISALVRAHVDSNPAMTSLDYSRFQANPDYQMSGDDTQHIQQFYDLLTGSMEIIRGWAEKIPGFADLPKADQDLLFESAFLELFVLRLAYRSNPVEGKLIFCNGVVLHRLQCVRGFGEWIDSIVEFSSNLQNMNIDISAFSCIAALAMVTERHGLKEPKRVEELQNKIVNTLKDHVTFNNGGLNRPNYLSKLLGKLPELRTLCTQGLQRIFYLKLEDLVPPPAIIDKLFLDTLPF +>ligand|8cyo-ligand +c1cc(c(cc1OCC(=O)NCCS)Cl)Cl \ No newline at end of file diff --git a/examples/covalent_bonds/8cyo.restraints b/examples/covalent_bonds/8cyo.restraints new file mode 100644 index 0000000..b05cfab --- /dev/null +++ b/examples/covalent_bonds/8cyo.restraints @@ -0,0 +1,2 @@ +chainA,res_idxA,chainB,res_idxB,connection_type,confidence,min_distance_angstrom,max_distance_angstrom,comment,restraint_id +A,C217@SG,B,@S1,covalent,1.0,0.0,0.0,protein-ligand,bond1 diff --git a/examples/glycosylation/README.md b/examples/covalent_bonds/README.md similarity index 81% rename from examples/glycosylation/README.md rename to examples/covalent_bonds/README.md index 3267c84..4ad9fa2 100644 --- a/examples/glycosylation/README.md +++ b/examples/covalent_bonds/README.md @@ -1,9 +1,9 @@ # Working with bond restraints -Chai-1 supports specifying covalent bonds as input, which specify covalent linkages between atoms in the folded complex. This is useful for specifying covalent modifications such as glycosylation events, which we demonstrate below, but can be generally used to specify arbitrary "non-canonical" bonds in a structure. +Chai-1 supports specifying covalent bonds as input, which specify covalent linkages between atoms in the folded complex. This is useful for specifying covalent modifications such as glycosylation events, but can be generally used to specify arbitrary "non-canonical" bonds in a structure; we demonstrate both cases below. A few notes: -- Chai-1 was not trained on disulfide bonds, and we have not evaluated whether specifying such bond information yields expected behaviors. +- Chai-1 was not trained on intra-chain bonds (e.g., disulfides), and we have not evaluated whether specifying such bond information yields expected behaviors. - These bond restraints should not be used to specify modified amino acids that already have an associated CCD code; for these examples, include the modified residue's CCD code in parentheses directly in the sequence in place of its canonical residue, e.g., `RKDES(MSE)EES` to specify a selenomethionine at the 6th position. ## Glycans @@ -61,12 +61,20 @@ NAG(4-1 NAG(4-1 BMA(3-1 MAN)(6-1 MAN))) ``` This branched example has a root `NAG` ring followed by a `NAG` and a `BMA`, which then branches to two `MAN` rings. For additional examples of this syntax, please refer to the examples in `tests/test_glycans.py`. -### Example +### Glycan example We have included an example of how glycans can be specified under `predict_glycosylated.py` in this directory, along with its corresponding `bonds.restraints` csv file. This example is based on the PDB structure [1AC5](https://www.rcsb.org/structure/1ac5). The predicted structrue (colored, glycans in purple and orange, protein in green) from this script should look like the following when aligned with the ground truth 1AC5 structure (gray): ![glycan example prediction](./output.png) -### A note on leaving atoms +## Non-glycan ligands + +You can also use covalent bonds to "connect" residues to non-glycan ligands. To demonstrate this, we use a single subunit from the homodimer [8CYO](https://www.rcsb.org/structure/8cyo). We specify a fasta file with a protein sequence and SMILES string in `8cyo.fasta` (we use SMILES for demonstration purposes even though this specific ligand has a CCD code) and the corresponding restraints in `8cyo.restraints`. Folding this example with `predict_covalent_ligand.py` yields the following structure (RCSB ground truth structure shown in gray). + +![non glycan example prediction](./non_glycan_output.png) + +## A note on leaving atoms + +One might notice that in the above example, we are specifying CCD codes for sugar rings and connecting them to each other and an amino acid residue via various bonds. A subtle point is that the reference conformer for these sugar rings include OH hydroxyl groups that leave when bonds are formed. Under the hood, Chai-1 tries to automatically find and remove these atoms (see `AllAtomStructureContext.drop_glycan_leaving_atoms_inplace` for implementation), but this logic only drops leaving hydroxyl groups within glycan sugar rings. For other, non-sugar covalently attached ligands, please specify a SMILES string *without* the leaving atoms. If this does not work for your use case, please open a GitHub issue. + -One might notice that in the above example, we are specifying CCD codes for sugar rings and connecting them to each other and an amino acid residue via various bonds. A subtle point is that the reference conformer for these sugar rings include OH hydroxyl groups that leave when bonds are formed. Under the hood, Chai-1 tries to automatically find and remove these atoms (see `AllAtomStructureContext.drop_glycan_leaving_atoms_inplace` for implementation), but this logic only drops leaving hydroxyl groups within glycan sugar rings. For other, non-sugar covalently attached ligands, please specify a SMILES string without the leaving atoms. If this does not work for your use case, please open a GitHub issue. diff --git a/examples/covalent_bonds/non_glycan_output.png b/examples/covalent_bonds/non_glycan_output.png new file mode 100644 index 0000000..438a22d Binary files /dev/null and b/examples/covalent_bonds/non_glycan_output.png differ diff --git a/examples/glycosylation/output.png b/examples/covalent_bonds/output.png similarity index 100% rename from examples/glycosylation/output.png rename to examples/covalent_bonds/output.png diff --git a/examples/covalent_bonds/predict_covalent_ligand.py b/examples/covalent_bonds/predict_covalent_ligand.py new file mode 100644 index 0000000..c2b5025 --- /dev/null +++ b/examples/covalent_bonds/predict_covalent_ligand.py @@ -0,0 +1,27 @@ +import logging +import shutil +from pathlib import Path + +from chai_lab.chai1 import run_inference + +logging.basicConfig(level=logging.INFO) + +# Inference expects an empty directory; enforce this +output_dir = Path("/workspaces/chai-lab/tmp/outputs") +if output_dir.exists(): + logging.warning(f"Removing old output directory: {output_dir}") + shutil.rmtree(output_dir) +output_dir.mkdir(exist_ok=True, parents=True) + +candidates = run_inference( + fasta_file=Path(__file__).with_name("8cyo.fasta"), + output_dir=output_dir, + constraint_path=Path(__file__).with_name("8cyo.restraints"), + num_trunk_recycles=3, + num_diffn_timesteps=200, + seed=42, + device="cuda:0", + use_esm_embeddings=True, +) + +cif_paths = candidates.cif_paths diff --git a/examples/glycosylation/predict_glycosylated.py b/examples/covalent_bonds/predict_glycosylated.py similarity index 91% rename from examples/glycosylation/predict_glycosylated.py rename to examples/covalent_bonds/predict_glycosylated.py index 4734b77..e469119 100644 --- a/examples/glycosylation/predict_glycosylated.py +++ b/examples/covalent_bonds/predict_glycosylated.py @@ -16,7 +16,7 @@ candidates = run_inference( fasta_file=Path(__file__).with_name("1ac5.fasta"), output_dir=output_dir, - constraint_path=Path(__file__).with_name("bonds.restraints"), + constraint_path=Path(__file__).with_name("1ac5.restraints"), num_trunk_recycles=3, num_diffn_timesteps=200, seed=42,