diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/.DS_Store differ diff --git a/selfies/constants.py b/selfies/constants.py index 597ff5e..2083a4b 100644 --- a/selfies/constants.py +++ b/selfies/constants.py @@ -9,7 +9,7 @@ "Ds", "Rg", "Cn", "Fl", "Lv", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", - "No", "Lr" + "No", "Lr", "*", "[*]" } ORGANIC_SUBSET = {"B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"} diff --git a/selfies/decoder.py b/selfies/decoder.py index 7e8d1b8..db88aa7 100644 --- a/selfies/decoder.py +++ b/selfies/decoder.py @@ -154,9 +154,19 @@ def _derive_mol_from_symbols( elif "eps" in symbol: next_state = 0 if (state == 0) else None - # Case 4: regular symbol (e.g. [N], [=C], [F]) - else: + # Case 4: [*] + elif symbol == "*": + atom = mol.add_wildcard_atom() # add wildcard atom + mol.add_attribution( + atom, attribute_stack + + [Attribution(index + attribution_index, symbol)] + if attribute_stack is not None else None + ) + prev_atom = atom + next_state = 0 + # Case 5: regular symbol (e.g. [N], [=C], [F]) + else: output = process_atom_symbol(symbol) if output is None: _raise_decoder_error(selfies, symbol) @@ -185,6 +195,7 @@ def _derive_mol_from_symbols( [Attribution(index + attribution_index, symbol)] if attribute_stack is not None else None) prev_atom = atom + if next_state is None: break diff --git a/selfies/encoder.py b/selfies/encoder.py index 149fc1e..916f9c4 100644 --- a/selfies/encoder.py +++ b/selfies/encoder.py @@ -47,7 +47,7 @@ def encoder(smiles: str, strict: bool = True, attribute: bool = False) -> str: >>> import selfies as sf >>> sf.encoder("C=CF") '[C][=C][F]' - +atom .. note:: This function does not currently support SMILES with: * The wildcard symbol ``*``. 
@@ -240,3 +240,4 @@ def _atom_to_selfies(bond, atom): assert not atom.is_aromatic bond_char = "" if (bond is None) else _bond_to_selfies(bond) return "[{}{}]".format(bond_char, atom_to_smiles(atom, brackets=False)) + diff --git a/selfies/grammar_rules.py b/selfies/grammar_rules.py index 9cd354e..f837e7b 100644 --- a/selfies/grammar_rules.py +++ b/selfies/grammar_rules.py @@ -107,7 +107,7 @@ def get_selfies_from_index(index: int) -> List[str]: r"^[\[]" # opening square bracket [ r"([=#/\\]?)" # bond char r"(\d*)" # isotope number (optional, e.g. 123, 26) - r"([A-Z][a-z]?)" # element symbol + r"([A-Z][a-z]?|\*)" # element symbol or wildcard r"([@]{0,2})" # chiral_tag (optional, only @ and @@ supported) r"((?:[H]\d)?)" # H count (optional, e.g. H1, H3) r"((?:[+-][1-9]+)?)" # charge (optional, e.g. +1) @@ -115,6 +115,7 @@ def get_selfies_from_index(index: int) -> List[str]: ) + def _process_atom_selfies_no_cache(symbol): m = SELFIES_ATOM_PATTERN.match(symbol) if m is None: diff --git a/selfies/mol_graph.py b/selfies/mol_graph.py index b491333..6e70dd2 100644 --- a/selfies/mol_graph.py +++ b/selfies/mol_graph.py @@ -42,7 +42,8 @@ def __init__( isotope: Optional[int] = None, chirality: Optional[str] = None, h_count: Optional[int] = None, - charge: int = 0 + charge: int = 0, + is_wildcard: bool = False ): self.index = None self.element = element @@ -51,7 +52,8 @@ def __init__( self.chirality = chirality self.h_count = h_count self.charge = charge - + self.is_wildcard = is_wildcard + @property @functools.lru_cache() def bonding_capacity(self): @@ -142,6 +144,8 @@ def get_out_dirbonds(self, src: int) -> List[DirectedBond]: def get_bond_count(self, idx: int) -> int: return self._bond_counts[idx] + + def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom: atom.index = len(self) @@ -154,6 +158,14 @@ def add_atom(self, atom: Atom, mark_root: bool = False) -> Atom: if atom.is_aromatic: self._delocal_subgraph[atom.index] = list() return atom + + + def 
add_wildcard_atom(self, mark_root: bool = False) -> Atom: + wildcard_atom = Atom(element='*', is_aromatic=False, is_wildcard=True, h_count=0, charge=0) # add is_wildcard=True + added_atom = self.add_atom(wildcard_atom, mark_root) + return added_atom + + def add_attribution( self, diff --git a/selfies/utils/smiles_utils.py b/selfies/utils/smiles_utils.py index bd514c2..e056376 100644 --- a/selfies/utils/smiles_utils.py +++ b/selfies/utils/smiles_utils.py @@ -66,6 +66,11 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]: i = 0 while i < len(smiles): + # Added to handle the wildcard symbol * + if smiles[i] == "*" or smiles[i:i+3] == "[*]": + yield SMILESToken(None, i, i + 1, SMILESTokenTypes.ATOM, "*") + i += 1 + continue if smiles[i] == ".": yield SMILESToken(None, i, i + 1, SMILESTokenTypes.DOT, smiles[i]) @@ -127,12 +132,16 @@ def tokenize_smiles(smiles: str) -> Iterator[SMILESToken]: # ============================================================================= + + def smiles_to_atom(atom_symbol: str) -> Optional[Atom]: """Reads an atom from its SMILES representation. :param atom_symbol: a SMILES atom symbol. :return: the atom that the input symbol represents. """ + if atom_symbol == "*": + return Atom("*", False) if atom_symbol[0] == "[" and atom_symbol[-1] == "]": pass # continue below @@ -183,6 +192,7 @@ def smiles_to_atom(atom_symbol: str) -> Optional[Atom]: ) + def smiles_to_bond( bond_char: Optional[str] ) -> Tuple[Union[int, float], Optional[str]]: @@ -358,6 +368,8 @@ def atom_to_smiles(atom: Atom, brackets: bool = True) -> str: :return: a SMILES symbol representing the input atom. 
""" assert not atom.is_aromatic + if atom.element == '*': + return '*' specs = (atom.isotope, atom.chirality, atom.h_count, atom.charge) if specs == (None, None, None, 0): @@ -443,12 +455,17 @@ def _derive_smiles_from_fragment( ring_log, attribution_maps, attribution_index=0): curr_atom, curr = mol.get_atom(root), root - token = atom_to_smiles(curr_atom) + + if curr_atom.is_wildcard: # decide using the newly added is_wildcard attribute + token = "*" + else: + token = atom_to_smiles(curr_atom) + + # token = atom_to_smiles(curr_atom) derived.append(token) attribution_maps.append(AttributionMap( _strlen(derived) - 1 + attribution_index, token, mol.get_attribution(curr_atom))) - out_bonds = mol.get_out_dirbonds(curr) for i, bond in enumerate(out_bonds): if bond.ring_bond: diff --git a/test_polysf.py b/test_polysf.py new file mode 100644 index 0000000..22f1124 --- /dev/null +++ b/test_polysf.py @@ -0,0 +1,21 @@ +import selfies as sf +from rdkit import Chem + +polymer_smiles = ['*CC(*)(C)C', + 'C1=C(SC(=C1)[*])[*]', + 'CCCCC1=C(SC(=C1)[*])[*]', + 'CCCCCCC1=C(SC(=C1)[*])[*]', + 'CCCCCCCCC1=C(SC(=C1)[*])[*]', + 'C1(=CC(=C(C=C1C=C[*])OC)[*])OCC(CC)CCCC' + ] + +for i in polymer_smiles: + mol = Chem.MolFromSmiles(i) + ori_smi = Chem.MolToSmiles(mol) + selfies = sf.encoder(ori_smi) + de_smi = sf.decoder(selfies) + de_smi = Chem.MolToSmiles(Chem.MolFromSmiles(de_smi)) + print('polymer smiles:', ori_smi, 'selfies:', selfies, 'decode selfies:', de_smi, 'equal?:', ori_smi == de_smi) + + +