Merge pull request #428 from ISISNeutronMuon/chi/pdb-fix

Fix pdb conversion
ISISNeutronMuon · Apr 30, 2024 · 3694932 · 3694932
2 parents a4424f1 + 834f793
commit 3694932
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 12 deletions.
diff --git a/MDANSE/Src/MDANSE/Chemistry/__init__.py b/MDANSE/Src/MDANSE/Chemistry/__init__.py
@@ -13,6 +13,9 @@
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 #
+import os
+import json
+
 from MDANSE.Chemistry.Databases import (
     AtomsDatabase,
     MoleculesDatabase,
@@ -27,3 +30,6 @@
 RESIDUES_DATABASE = ResiduesDatabase()
 
 NUCLEOTIDES_DATABASE = NucleotidesDatabase()
+
+with open(os.path.join(os.path.dirname(__file__), "residues_alt_names.json"), "r") as f:
+    RESIDUE_ALT_NAMES = json.load(f)
diff --git a/MDANSE/Src/MDANSE/Chemistry/residues_alt_names.json b/MDANSE/Src/MDANSE/Chemistry/residues_alt_names.json
@@ -0,0 +1,4 @@
+{
+  "HIS": ["HID", "HIE", "HIP"],
+  "CYS": ["CYS", "CYX"]
+}
diff --git a/MDANSE/Src/MDANSE/IO/PDB.py b/MDANSE/Src/MDANSE/IO/PDB.py
@@ -73,12 +73,11 @@
 @undocumented: export_filters
 @undocumented: DummyChain
 """
-
 import string
 
 import numpy as np
 
-from MDANSE.Chemistry import NUCLEOTIDES_DATABASE, RESIDUES_DATABASE
+from MDANSE.Chemistry import NUCLEOTIDES_DATABASE, RESIDUES_DATABASE, RESIDUE_ALT_NAMES
 from MDANSE.IO.FortranFormat import FortranFormat, FortranLine
 from MDANSE.IO.PDBExportFilters import export_filters
 from MDANSE.IO.TextFile import TextFile
@@ -490,7 +489,11 @@ def nextResidue(self, name, number=None, terminus=None):
         name = name.upper()
         if self.export_filter is not None:
             name, number = self.export_filter.processResidue(name, number, terminus)
-        self.het_flag = not (name in RESIDUES_DATABASE or name in NUCLEOTIDES_DATABASE)
+        self.het_flag = not (
+            name in RESIDUES_DATABASE
+            or name in RESIDUE_ALT_NAMES
+            or name in NUCLEOTIDES_DATABASE
+        )
         self.data["residue_name"] = name
         self.data["residue_number"] = (self.data["residue_number"] + 1) % 10000
         self.data["insertion_code"] = ""
@@ -1052,7 +1055,10 @@ def isCompatible(self, chain_data, residue_data):
         return (
             chain_data["chain_id"] == self.chain_id
             and chain_data["segment_id"] == self.segment_id
-            and residue_data["residue_name"] in RESIDUES_DATABASE
+            and (
+                residue_data["residue_name"] in RESIDUES_DATABASE
+                or residue_data["residue_name"] in RESIDUE_ALT_NAMES
+            )
         )
 
 
@@ -1097,6 +1103,7 @@ def isCompatible(self, chain_data, residue_data):
             chain_data["chain_id"] == self.chain_id
             and chain_data["segment_id"] == self.segment_id
             and residue_data["residue_name"] not in RESIDUES_DATABASE
+            and residue_data["residue_name"] not in RESIDUE_ALT_NAMES
             and residue_data["residue_name"] not in NUCLEOTIDES_DATABASE
         )
 
@@ -1279,7 +1286,7 @@ def extractData(self, data):
     def newResidue(self, residue_data):
         name = residue_data["residue_name"]
         residue_number = residue_data["residue_number"]
-        if name in RESIDUES_DATABASE:
+        if name in RESIDUES_DATABASE or name in RESIDUE_ALT_NAMES:
             residue = PDBAminoAcidResidue(name, [], residue_number)
         elif name in NUCLEOTIDES_DATABASE:
             residue = PDBNucleotideResidue(name, [], residue_number)

diff --git a/MDANSE/Src/MDANSE/IO/PDBReader.py b/MDANSE/Src/MDANSE/IO/PDBReader.py
@@ -37,6 +37,7 @@
     MOLECULES_DATABASE,
     NUCLEOTIDES_DATABASE,
     RESIDUES_DATABASE,
+    RESIDUE_ALT_NAMES,
 )
 from MDANSE.MolecularDynamics.Configuration import RealConfiguration
 from MDANSE.IO.PDB import PDBMolecule, PDBNucleotideChain, PDBPeptideChain, Structure
@@ -473,17 +474,56 @@ def _process_residue(self, residue):
 
         pdb_atoms = [at.name for at in residue]
 
-        atoms_found = [None] * len(pdb_atoms)
-
-        for comp, pdb_atom in enumerate(pdb_atoms):
-            for at, info in RESIDUES_DATABASE[code]["atoms"].items():
-                if pdb_atom == at or pdb_atom in info["alternatives"]:
-                    atoms_found[comp] = at
+        if code in RESIDUE_ALT_NAMES:
+            # RESIDUES_DATABASE is a many-to-one map multiple codes
+            # can map to one residue in MDANSE.
+            # If the code is in RESIDUE_ALT_NAMES then it is a
+            # many-to-many map. In this case the code can map to many
+            # different residues.
+            for new_code in RESIDUE_ALT_NAMES[code]:
+                # go through each name that code could be
+                atoms_found = [None] * len(pdb_atoms)
+                for comp, pdb_atom in enumerate(pdb_atoms):
+                    if len(atoms_found) != len(
+                        RESIDUES_DATABASE[new_code]["atoms"].items()
+                    ):
+                        # different number of atoms break and try the next name
+                        break
+                    for at, info in RESIDUES_DATABASE[new_code]["atoms"].items():
+                        if pdb_atom == at or pdb_atom in info["alternatives"]:
+                            atoms_found[comp] = at
+                            break
+                    else:
+                        # unable to match the atom in the RESIDUES_DATABASE
+                        # try the next residue name
+                        break
+                if None not in atoms_found:
+                    # matched all atoms found the residue so break
                     break
             else:
+                # went through all alternative names looks like we were
+                # unable to match the code to the MDANSE residues
                 raise PDBReaderError(
-                    "The atom {}{}:{} is unknown".format(code, residue.number, pdb_atom)
+                    "Unable to find residue for {}{}".format(code, residue.number)
                 )
+            # found the match from the alternate name to the name in
+            # mdanse lets rename the code
+            code = new_code
+            residue.name = code
+
+        else:
+            atoms_found = [None] * len(pdb_atoms)
+            for comp, pdb_atom in enumerate(pdb_atoms):
+                for at, info in RESIDUES_DATABASE[code]["atoms"].items():
+                    if pdb_atom == at or pdb_atom in info["alternatives"]:
+                        atoms_found[comp] = at
+                        break
+                else:
+                    raise PDBReaderError(
+                        "The atom {}{}:{} is unknown".format(
+                            code, residue.number, pdb_atom
+                        )
+                    )
 
         resname = "{}{}".format(code, residue.number)
         new_residue = Residue(code, resname, variant=None)