diff --git a/chai_lab/data/dataset/inference_dataset.py b/chai_lab/data/dataset/inference_dataset.py index 400aa8e..4b04f8f 100644 --- a/chai_lab/data/dataset/inference_dataset.py +++ b/chai_lab/data/dataset/inference_dataset.py @@ -187,8 +187,12 @@ def load_chains_from_raw( chain_id=chain_index, sym_id=sym_id, ) - except Exception: - logger.exception(f"Failed to tokenize input {entity_data=} {sym_id=}") + if tok is None: + logger.exception(f"Failed to tokenize input {entity_data=} {sym_id=}") + except Exception as e: + logger.exception( + f"Failed to tokenize input {entity_data=} {sym_id=}", exc_info=e + ) tok = None structure_contexts.append(tok) diff --git a/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py b/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py index 5078121..6041d56 100644 --- a/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py +++ b/chai_lab/data/dataset/structure/all_atom_residue_tokenizer.py @@ -183,6 +183,7 @@ def tokenize_residue( if ( residue.name in standard_residue_pdb_codes and entity_type != EntityType.LIGAND + and entity_type != EntityType.MANUAL_GLYCAN ) else self._tokenize_per_atom ) @@ -388,6 +389,9 @@ def _tokenize_entity( valid_residues = [x for x in tokenized_residues if x is not None] if len(valid_residues) == 0: + logger.warning( + f"Got no residues for entity {entity_data.entity_id} with residues {entity_data.residues}" + ) return None tokens = TokenSpan.concatenate(valid_residues) diff --git a/chai_lab/data/parsing/glycans.py b/chai_lab/data/parsing/glycans.py index 9621a89..42079c8 100644 --- a/chai_lab/data/parsing/glycans.py +++ b/chai_lab/data/parsing/glycans.py @@ -61,7 +61,7 @@ def _glycan_string_to_sugars_and_bonds( parent_sugar_idx.pop() # Remove continue chunk = glycan_string[i : i + 3] - if re.match(r"[A-Z]{3}", chunk): + if re.match(r"[0-9A-Z]{3}", chunk): # Match CCD codes (3 char, alphanumeric) sugars.append(chunk) parent_sugar_idx.append(len(sugars) - 1) # latest sugar elif re.match(r"[1-6]{1}-[1-6]{1}", chunk): @@ -81,6 +81,8 @@ def _glycan_string_to_sugars_and_bonds( def glycan_string_residues(glycan_string: str) -> list[Residue]: sugars, _bonds = _glycan_string_to_sugars_and_bonds(glycan_string) + if not sugars: + raise ValueError(f"No residues parsed from {glycan_string=}") return [ Residue( name=sugar, diff --git a/tests/test_glycans.py b/tests/test_glycans.py index 7c3d646..fd73da3 100644 --- a/tests/test_glycans.py +++ b/tests/test_glycans.py @@ -1,9 +1,18 @@ # Copyright (c) 2024 Chai Discovery, Inc. # Licensed under the Apache License, Version 2.0. # See the LICENSE file for details. +import pytest + from chai_lab.data.parsing.glycans import _glycan_string_to_sugars_and_bonds +@pytest.mark.parametrize("ccd_code", ["MAN", "99K", "FUC"]) +def test_parsing_ccd_codes(ccd_code: str): + """Test that various single CCD codes are parsed correctly.""" + res, _ = _glycan_string_to_sugars_and_bonds(ccd_code) + assert len(res) == 1 + + def test_complex_parsing(): glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))" sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)