Merge pull request #52 from xchem/CpKRS

Cp krs
xchem · Dec 12, 2024 · b03a3cf · b03a3cf
2 parents 79cd22e + cbd8bb6
commit b03a3cf
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 29 deletions.
diff --git a/src/ligand_neighbourhood_alignment/cli.py b/src/ligand_neighbourhood_alignment/cli.py
@@ -446,6 +446,7 @@ def _get_structures(datasets):
 
     return structures
 
+
 def _get_dataset_protein_chains(structure):
     protein_chains = []
     for model in structure:
@@ -459,6 +460,7 @@ def _get_dataset_protein_chains(structure):
 
     return protein_chains
 
+
 def _get_closest_xtalform(xtalforms: dict[str, dt.XtalForm], structure, structures):
     structure_spacegroup = structure.spacegroup_hm
     structure_cell = structure.cell
@@ -475,7 +477,8 @@ def _get_closest_xtalform(xtalforms: dict[str, dt.XtalForm], structure, structur
             continue
 
         # Check they have the same protein chains
-        xtalform_protein_chains = [_chain for xtalform_assembly in xtalform.assemblies.values() for _chain in xtalform_assembly.chains]
+        xtalform_protein_chains = [_chain for xtalform_assembly in xtalform.assemblies.values() for _chain in
+                                   xtalform_assembly.chains]
         dataset_protein_chains = _get_dataset_protein_chains(structure)
         print(f'Xtalform chains: {set(xtalform_protein_chains)}')
         print(f'Dataseet chains: {set(dataset_protein_chains)}')
@@ -524,7 +527,8 @@ def _assign_dataset(dataset, assemblies, xtalforms, structure, structures):
         logger.info(f"Deltas to closest unit cell are: {deltas}")
         logger.info(f"Structure path is: {dataset.pdb}")
 
-        raise Exception(f"No reference for dataset: {dataset.dtag}\nDeltas to closest unit cell in {closest_xtalform_id} are: {deltas}\nStructure path is: {dataset.pdb}")
+        raise Exception(
+            f"No reference for dataset: {dataset.dtag}\nDeltas to closest unit cell in {closest_xtalform_id} are: {deltas}\nStructure path is: {dataset.pdb}")
 
     return closest_xtalform_id
 
@@ -534,7 +538,7 @@ def _save_assignments(fs_model: dt.FSModel, dataset_assignments: dict[str, str])
         yaml.safe_dump(dataset_assignments, f)
 
 
-def _generate_assembly(xtalform: dt.XtalForm, structure, assemblies: dict[str, dt.Assembly], pdb):
+def _generate_assembly(xtalform: dt.XtalForm, structure, assemblies: dict[str, dt.Assembly], pdb, dataset):
     full_st = structure.clone()
     chains_to_delete = []
     for model in full_st:
@@ -544,6 +548,7 @@ def _generate_assembly(xtalform: dt.XtalForm, structure, assemblies: dict[str, d
     for model_name, chain_name in chains_to_delete:
         del full_st[model_name][chain_name]
 
+    cloned_chains = []
     for xtalform_assembly_id, xtalform_assembly in xtalform.assemblies.items():
         assembly = assemblies[xtalform_assembly.assembly]
         # chains = xtalform_assembly.chains
@@ -553,6 +558,7 @@ def _generate_assembly(xtalform: dt.XtalForm, structure, assemblies: dict[str, d
                 xtalform_assembly.chains,
                 xtalform_assembly.transforms,
         ):
+            cloned_chains.append(_chain)
 
             # for generator in assembly.generators:
             #     op = gemmi.Op(generator.triplet)
@@ -579,6 +585,22 @@ def _generate_assembly(xtalform: dt.XtalForm, structure, assemblies: dict[str, d
             chain_clone.name = f"{_chain}~{_biogen.biomol}~{_transform}"
             full_st[0].add_chain(chain_clone)
 
+    # Catch any ligand only chains
+    for lbe in dataset.ligand_binding_events:
+        _chain = lbe[1]
+        if _chain not in cloned_chains:
+            op = gemmi.Op("x,y,z")
+            chain_clone = structure[0][_chain].clone()
+            for residue in chain_clone:
+                for atom in residue:
+                    atom_frac = structure.cell.fractionalize(atom.pos)
+                    new_pos_frac = op.apply_to_xyz([atom_frac.x, atom_frac.y, atom_frac.z])
+                    new_pos_orth = structure.cell.orthogonalize(gemmi.Fractional(*new_pos_frac))
+                    atom.pos = gemmi.Position(*new_pos_orth)
+            chain_clone.name = f"{_chain}~{_chain}~x,y,z"
+            full_st[0].add_chain(chain_clone)
+            cloned_chains.append(_chain)
+
     chains = []
     num_chains = 0
     for model in full_st:
@@ -632,7 +654,7 @@ def _get_dataset_neighbourhoods(
     logger.debug(f"{structure.cell}")
 
     # Get the rest of the assembly
-    assembly = _generate_assembly(xtalform, structure, assemblies, dataset.pdb)
+    assembly = _generate_assembly(xtalform, structure, assemblies, dataset.pdb, dataset)
 
     # Get the bound fragments
     fragments: dict[tuple[str, str, str, str], gemmi.Residue] = _get_structure_fragments(dataset, assembly, version)
@@ -967,16 +989,17 @@ def _crystalform_incremental_cluster(
     }
 
     # While observations remain to be assigned
-    assignments = {xtalform_site_id: [_x for _x in xtalform_sites[xtalform_site_id].members] for xtalform_site_id in xtalform_sites}
-    assigned_observations = [observation_id for xtalform_site_id in assignments for observation_id in assignments[xtalform_site_id]]
+    assignments = {xtalform_site_id: [_x for _x in xtalform_sites[xtalform_site_id].members] for xtalform_site_id in
+                   xtalform_sites}
+    assigned_observations = [observation_id for xtalform_site_id in assignments for observation_id in
+                             assignments[xtalform_site_id]]
     observations_to_assign = [observation_id for observation_id in centroid_ca_positions]
 
     while len(observations_to_assign) > 0:
         for observation_id in observations_to_assign:
             # Assign observations near current xtalform sites
             for xtalform_site_id, xtalform_site_pos in centre_residues_positions.items():
                 if _get_dist(centroid_ca_positions[observation_id], xtalform_site_pos) < cutoff:
-
                     assignments[xtalform_site_id].append(observation_id)
                     assigned_observations.append(observation_id)
                     break
@@ -1049,12 +1072,12 @@ def _update_xtalform_sites(
         for xtalform_name in crystalform_observations
     }
 
-
     # Spatially cluster
     crystalform_observation_cluster_assignments = {
         xtalform_name: _crystalform_incremental_cluster(
             crystalform_observation_centroids[xtalform_name],
-            {xid: xs for xid, xs in xtalform_sites.items() if (xs.xtalform_id == xtalform_name) & (xs.canonical_site_id == canonical_site_id)},
+            {xid: xs for xid, xs in xtalform_sites.items() if
+             (xs.xtalform_id == xtalform_name) & (xs.canonical_site_id == canonical_site_id)},
             neighbourhoods
         )
         for xtalform_name in crystalform_observations
@@ -1360,7 +1383,6 @@ def _update(
     # Get the structures
     structures: dict = _get_structures(datasets)
 
-
     # Get the assembly alignment hierarchy
     hierarchy, biochain_priorities = alignment_heirarchy._derive_alignment_heirarchy(assemblies)
     alignment_heirarchy.save_yaml(fs_model.hierarchy, hierarchy, lambda x: x)
@@ -1533,7 +1555,7 @@ def _update(
         # Check if residues match as usual, otherwise create a new canon site for it
         debug = False
         if canonical_site_id == 'LYSRSCPZ-x0426+A+805+1':
-            debug=True
+            debug = True
         _update_xtalform_sites(
             xtalform_sites,
             canonical_site,
@@ -1623,7 +1645,8 @@ def _update(
                             canonical_site_id,
                             aligned_structure_path,
                     ) in ligand_neighbourhood_output.aligned_structures.items():
-                        if not ( (fs_model.source_dir.parent / aligned_structure_path).exists() | Path(aligned_structure_path).exists()):
+                        if not ((fs_model.source_dir.parent / aligned_structure_path).exists() | Path(
+                                aligned_structure_path).exists()):
                             # _update_aligned_structures()
                             _structure = structures[dtag].clone()
                             canonical_site = canonical_sites[canonical_site_id]
@@ -1652,9 +1675,9 @@ def _update(
                                 _xtalform_id = _xtalform_site.xtalform_id
                                 _xtalform_canonical_site_id = _xtalform_site.canonical_site_id
                                 if (_xtalform_id == site_reference_ligand_xtalform_id) \
-                                    & (_xtalform_canonical_site_id == canonical_site_id) \
-                                    & (site_reference_ligand_id in _xtalform_site.members):
-                                        xtalform_site = _xtalform_site
+                                        & (_xtalform_canonical_site_id == canonical_site_id) \
+                                        & (site_reference_ligand_id in _xtalform_site.members):
+                                    xtalform_site = _xtalform_site
                             site_chain = xtalform_site.crystallographic_chain
 
                             # Aligns to conformer site, then to the corresponding assembly, then from that assembly to
@@ -1685,15 +1708,17 @@ def _update(
                                         )
                                     ],
                                     assembly_transform=assembly_transforms[
-                                        xtalforms[dataset_assignments[conformer_site.reference_ligand_id[0]]].assemblies[
+                                        xtalforms[
+                                            dataset_assignments[conformer_site.reference_ligand_id[0]]].assemblies[
                                             alignment_heirarchy._chain_to_xtalform_assembly(
                                                 # conformer_site.reference_ligand_id[1],
                                                 site_chain,
                                                 xtalforms[dataset_assignments[conformer_site.reference_ligand_id[0]]]
                                             )
                                         ].assembly
                                     ],
-                                    )
+                                    xtalform_sites=xtalform_sites
+                                )
 
                             except:
                                 logger.info(f"Failed to generate aligned structure {aligned_structure_path}")
@@ -1710,7 +1735,8 @@ def _update(
         for canonical_site_id, alignment_info in dataset_alignment_info.items():
             aligned_structure_path = alignment_info["aligned_structures"]
             logger.info(f"Outputting reference structure: {aligned_structure_path}")
-            if not ( (fs_model.source_dir.parent / aligned_structure_path).exists() | Path(aligned_structure_path).exists()):
+            if not ((fs_model.source_dir.parent / aligned_structure_path).exists() | Path(
+                    aligned_structure_path).exists()):
                 _structure = structures[dtag].clone()
                 _align_reference_structure(
                     _structure,

diff --git a/src/ligand_neighbourhood_alignment/generate_aligned_structures.py b/src/ligand_neighbourhood_alignment/generate_aligned_structures.py
@@ -282,6 +282,7 @@ def _drop_non_binding_chains_and_symmetrize_waters(
         moving_ligand_id,
         dataset_ligand_neighbourhood_ids,
         xtalform,
+        xtalform_sites
 ):
     # Other Ligand IDs
     other_ligand_ids = [(lid[1], lid[2]) for lid in dataset_ligand_neighbourhood_ids if not ((lid[1] == moving_ligand_id[1]) & (lid[2] == moving_ligand_id[2]))]
@@ -299,7 +300,12 @@ def _drop_non_binding_chains_and_symmetrize_waters(
     }
 
     # Get the assembly the ligand is modelled as part of
-    lig_assembly = chain_assemblies[moving_ligand_id[1]]
+    for xsid, _xtalform_site in xtalform_sites.items():
+        _xtalform_id = _xtalform_site.xtalform_id
+        if moving_ligand_id in _xtalform_site.members:
+                xtalform_site = _xtalform_site
+    site_chain = xtalform_site.crystallographic_chain
+    lig_assembly = chain_assemblies[site_chain]
 
     # Determine which waters are bound near the ligand, and at what positions
     ns = gemmi.NeighborSearch(new_structure[0], new_structure.cell, 10).populate(include_h=False)
@@ -451,6 +457,7 @@ def _align_structure(
         site_reference_xform,
         chain_to_assembly_transform,
         assembly_transform,
+xtalform_sites
 ):
     shortest_path: list[tuple[str, str, str]] = nx.shortest_path(g, moving_ligand_id, reference_ligand_id)
     logger.debug(f"Shortest path: {shortest_path}")
@@ -466,7 +473,9 @@ def _align_structure(
         neighbourhood,
         moving_ligand_id,
         dataset_ligand_neighbourhood_ids,
-        xtalform)
+        xtalform,
+        xtalform_sites
+    )
 
     previous_ligand_id = moving_ligand_id
     running_transform = gemmi.Transform()

diff --git a/src/ligand_neighbourhood_alignment/generate_sites_from_components.py b/src/ligand_neighbourhood_alignment/generate_sites_from_components.py
@@ -326,15 +326,15 @@ def _update_reference_structure_transforms(
     xtalform_sites,
     canonical_site_id
 ):
-    # Get the biochain of the canonical site
+    # # Get the biochain of the canonical site
     site_reference_ligand_id = conformer_sites[canonical_site.reference_conformer_site_id].reference_ligand_id
     site_reference_ligand_xtalform_id = dataset_assignments[site_reference_ligand_id[0]]
     site_reference_ligand_xtalform = xtalforms[site_reference_ligand_xtalform_id]
     for xsid, _xtalform_site in xtalform_sites.items():
         _xtalform_id = _xtalform_site.xtalform_id
-        if _xtalform_id == site_reference_ligand_xtalform_id:
-            _xtalform_canonical_site_id = _xtalform_site.canonical_site_id
-            if _xtalform_canonical_site_id == canonical_site_id:
+        if (_xtalform_id == site_reference_ligand_xtalform_id) &\
+            (_xtalform_site.canonical_site_id == canonical_site_id) & \
+                (site_reference_ligand_id in _xtalform_site.members):
                 xtalform_site = _xtalform_site
     site_chain = xtalform_site.crystallographic_chain
     canonical_site_biochain = alignment_heirarchy._chain_to_biochain(
@@ -343,6 +343,7 @@ def _update_reference_structure_transforms(
         assemblies
     )
 
+
     # Determine whether the biochain is shared, and if not skip
     reference_structure = structures[key[0]]
     reference_structure_xtalform = xtalforms[dataset_assignments[key[0]]]
@@ -354,10 +355,21 @@ def _update_reference_structure_transforms(
     }
     reference_structure_biochains_inv = {v: k for k, v in reference_structure_biochains.items()}
 
-    if canonical_site_biochain not in reference_structure_biochains.values():
-        return None
-
-    # Align the reference to the biochain reference using the canonical site residues
+    # # Get the reference residues whose image is the canonical site biochain aligned residues
+    # alignment_residues_ref_st = []
+    # alignment_residues_mov_st = []
+    # for rid in canonical_site.residues:
+    #     chain, res = rid[0], rid[1]
+    #     biochain = alignment_heirarchy._chain_to_biochain(
+    #         chain,
+    #         site_reference_ligand_xtalform,
+    #         assemblies
+    #     )
+    #     reference_structure_chain = reference_structure_biochains_inv[biochain]
+    #     alignment_residues_ref_st.append((reference_structure_chain, res))
+    #     alignment_residues_mov_st.append((reference_structure_chain, res))
+
+    # # Align the reference to the biochain reference using the canonical site residues
     alignment_residues_ref_st = []
     alignment_residues_mov_st = []
     core_chain = reference_structure_biochains_inv[canonical_site_biochain]
@@ -373,13 +385,14 @@ def _update_reference_structure_transforms(
             continue
         alignment_residues_ref_st.append((chain, res))
         alignment_residues_mov_st.append((core_chain, res))
-
     transform = _get_transform_from_residues(
         alignment_residues_ref_st,
         structures[site_reference_ligand_id[0]],
         reference_structure,
         other_rs=alignment_residues_mov_st,
     )
+
+
     reference_structure_transforms[key] = dt.Transform(
         transform.vec.tolist(),
         transform.mat.tolist(),