From 622d7b8da486cff3bb31eda4b61515c1b83a4a13 Mon Sep 17 00:00:00 2001 From: Cam Date: Sat, 2 Jul 2022 17:36:33 +1000 Subject: [PATCH 01/20] Fix param name typo in function docstring --- graphein/utils/config_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphein/utils/config_parser.py b/graphein/utils/config_parser.py index 6f270b22d..1d3973b4a 100644 --- a/graphein/utils/config_parser.py +++ b/graphein/utils/config_parser.py @@ -24,7 +24,7 @@ def config_constructor( :param loader: Given yaml loader :param type: yaml.FullLoader - :param loader: A mapping node + :param node: A mapping node :param type: yaml.nodes.MappingNode """ arg_map = loader.construct_mapping(node, deep=True) if node.value else {} @@ -42,7 +42,7 @@ def function_constructor( :param type: yaml.FullLoader :param tag_suffix: The name after the !func: tag :param type: str - :param loader: A mapping node if function parameters are given, a scalar node if not + :param node: A mapping node if function parameters are given, a scalar node if not :param type: Union[yaml.nodes.MappingNode, yaml.nodes.ScalarNode] """ arg_map = None From 8a3f3a42b85afc1bc7236e037bc5d7e2e7b3a530 Mon Sep 17 00:00:00 2001 From: Cam Date: Sun, 3 Jul 2022 01:32:26 +1000 Subject: [PATCH 02/20] add scaling node size by "rsa" feature as well as degree --- graphein/protein/visualisation.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index bda81baa7..6cd3b0e7c 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -193,6 +193,7 @@ def plotly_protein_structure_graph( node_alpha: float = 0.7, node_size_min: float = 20.0, node_size_multiplier: float = 20.0, + node_size_feature: str = "degree", label_node_ids: bool = True, node_colour_map=plt.cm.plasma, edge_color_map=plt.cm.plasma, @@ -214,6 +215,8 @@ def plotly_protein_structure_graph( :type node_size_min: float :param node_size_multiplier: Scales node size by a constant. Node sizes reflect degree. Defaults to ``20.0``. :type node_size_multiplier: float + :param node_size_feature: Which feature to scale the node size by. Defaults to ``degree``. + :type node_size_feature: str :param label_node_ids: bool indicating whether or not to plot ``node_id`` labels. Defaults to ``True``. :type label_node_ids: bool :param node_colour_map: colour map to use for nodes. Defaults to ``plt.cm.plasma``. @@ -239,6 +242,17 @@ def plotly_protein_structure_graph( G, colour_map=edge_color_map, colour_by=colour_edges_by ) + # Get node size + def node_scale_by(G, feature): + if feature == 'degree': + return lambda k : node_size_min + node_size_multiplier * G.degree[k] + elif feature == 'rsa': + return lambda k : node_size_min + node_size_multiplier * G.nodes(data=True)[k]['rsa'] + else: + raise ValueError(f"Cannot size nodes by feature '{feature}'") + + get_node_size = node_scale_by(G, node_size_feature) + # 3D network plot x_nodes = [] y_nodes = [] @@ -251,7 +265,7 @@ def plotly_protein_structure_graph( x_nodes.append(value[0]) y_nodes.append(value[1]) z_nodes.append(value[2]) - node_sizes.append(node_size_min + node_size_multiplier * G.degree[key]) + node_sizes.append(get_node_size(key)) if label_node_ids: node_labels.append(list(G.nodes())[i]) From 9c9520b15f78d0c86fb7cccd70deca86866a7571 Mon Sep 17 00:00:00 2001 From: Cam Date: Sun, 3 Jul 2022 01:33:49 +1000 Subject: [PATCH 03/20] add option for scaling node size by meiler embedding dimensions. Takes negative values to be zero. --- graphein/protein/visualisation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index 6cd3b0e7c..366e8a509 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -6,6 +6,7 @@ # Code Repository: https://github.com/a-r-j/graphein from __future__ import annotations +import re import logging from itertools import count from typing import Dict, List, Optional, Tuple, Union @@ -248,8 +249,12 @@ def node_scale_by(G, feature): return lambda k : node_size_min + node_size_multiplier * G.degree[k] elif feature == 'rsa': return lambda k : node_size_min + node_size_multiplier * G.nodes(data=True)[k]['rsa'] + # Meiler embedding dimension + p = re.compile("meiler-([1-7])") + if dim := p.search(feature).group(1): + return lambda k : node_size_min + node_size_multiplier * max(0, G.nodes(data=True)[k]['meiler'][f'dim_{dim}']) # Meiler values may be negative else: - raise ValueError(f"Cannot size nodes by feature '{feature}'") + raise ValueError(f"Cannot size nodes by feature '{feature}'") get_node_size = node_scale_by(G, node_size_feature) From 44a0bf8b796c0b0b8ccf7a50e65550eb0a5e93e3 Mon Sep 17 00:00:00 2001 From: Cam Date: Wed, 6 Jul 2022 17:12:24 +1000 Subject: [PATCH 04/20] remove walrus operator := for compatability --- graphein/protein/visualisation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index 366e8a509..afe5e3aee 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -251,7 +251,8 @@ def node_scale_by(G, feature): return lambda k : node_size_min + node_size_multiplier * G.nodes(data=True)[k]['rsa'] # Meiler embedding dimension p = re.compile("meiler-([1-7])") - if dim := p.search(feature).group(1): + dim = p.search(feature).group(1) + if dim: return lambda k : node_size_min + node_size_multiplier * max(0, G.nodes(data=True)[k]['meiler'][f'dim_{dim}']) # Meiler values may be negative else: raise ValueError(f"Cannot size nodes by feature '{feature}'") From a2806b6a27c0022d458c5e206d9eb832c7dcedc2 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Fri, 8 Jul 2022 12:04:44 +0200 Subject: [PATCH 05/20] Add type hints --- graphein/protein/visualisation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index afe5e3aee..9a01ba94a 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -244,7 +244,7 @@ def plotly_protein_structure_graph( ) # Get node size - def node_scale_by(G, feature): + def node_scale_by(G: nx.Graph, feature: str): if feature == 'degree': return lambda k : node_size_min + node_size_multiplier * G.degree[k] elif feature == 'rsa': From 024e7a068cca84c11c15ab4b0a9a4eb751a206f0 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Fri, 8 Jul 2022 12:04:50 +0200 Subject: [PATCH 06/20] Update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b9a4d4ee..18ca0cfae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,8 @@ #### Changes -* #187 updates sequence retrieval due to UniProt API changes. +* [Feature] - [#186](https://github.com/a-r-j/graphein/pull/186) adds support for scaling node sizes in plots by a computed feature. Contribution by @cimranm +* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes. ### 1.5.0 From 75697513847a435189f6a552f5ace20ad6554239 Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 22 Jul 2022 17:22:23 +1000 Subject: [PATCH 07/20] add support for sizing nodes by RSA and colouring by hydrophobicity in asteroid_plot --- graphein/protein/visualisation.py | 68 ++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index 9a01ba94a..46ce5ef14 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -708,6 +708,7 @@ def asteroid_plot( node_id: str, k: int = 2, colour_nodes_by: str = "shell", # residue_name + size_nodes_by: str = "degree", colour_edges_by: str = "kind", edge_colour_map: plt.cm.Colormap = plt.cm.plasma, edge_alpha: float = 1.0, @@ -718,6 +719,7 @@ def asteroid_plot( use_plotly: bool = True, show_edges: bool = False, show_legend: bool = True, + node_size_min: float = 20, node_size_multiplier: float = 10, ) -> Union[plotly.graph_objects.Figure, matplotlib.figure.Figure]: """Plots a k-hop subgraph around a node as concentric shells. @@ -732,6 +734,8 @@ def asteroid_plot( :type k: int :param colour_nodes_by: Colour the nodes by this attribute. Currently only ``"shell"`` is supported. :type colour_nodes_by: str + :param size_nodes_by: Size the nodes by an attribute. + :type size_nodes_by: str :param colour_edges_by: Colour the edges by this attribute. Currently only ``"kind"`` is supported. :type colour_edges_by: str :param edge_colour_map: Colour map for edges. Defaults to ``plt.cm.plasma``. @@ -750,8 +754,10 @@ def asteroid_plot( :type show_edges: bool :param show_legend: Whether to show the legend of the edges. Fefaults to `True``. :type show_legend: bool + :param node_size_min: Specifies node minimum size. Defaults to ``20.0``. + :type node_size_min: float :param node_size_multiplier: Multiplier for the size of the nodes. Defaults to ``10``. - :type node_size_multiplier: float. + :type node_size_multiplier: float :returns: Plotly figure or matplotlib figure. :rtpye: Union[plotly.graph_objects.Figure, matplotlib.figure.Figure] """ @@ -811,9 +817,16 @@ def asteroid_plot( node_x.append(x) node_y.append(y) - degrees = [ - subgraph.degree(n) * node_size_multiplier for n in subgraph.nodes() - ] + def node_size_function(g: nx.Graph, feature: str): + if feature == 'degree': + return lambda k : g.degree(k) + elif feature == 'rsa': + return lambda k : g.nodes(data=True)[k]['rsa'] + else: + raise NotImplementedError(f"Size by {size_nodes_by} not implemented.") + + node_size = node_size_function(subgraph, size_nodes_by) + node_sizes = [node_size_min + node_size(n) * node_size_multiplier for n in subgraph.nodes()] if colour_nodes_by == "shell": node_colours = [] @@ -821,11 +834,44 @@ def asteroid_plot( for k, v in nodes.items(): if n in v: node_colours.append(k) + elif colour_nodes_by == "hydrophobicity": + + """ + TODO Does a function like this already exist somewhere? + """ + def hydrophobicity_of_residue(res: str, mapping: str = 'a'): + hmap = { + "ILE" : 4.5, + "VAL" : 4.2, + "LEU" : 3.8, + "PHE" : 2.8, + "CYS" : 2.5, + "MET" : 1.9, + "ALA" : 1.8, + "GLY" : -0.4, + "THR" : -0.7, + "SER" : -0.8, + "TRP" : -0.9, + "TYR" : -1.3, + "PRO" : -1.6, + "HIS" : -3.2, + "GLU" : -3.5, + "GLN" : -3.5, + "ASP" : -3.5, + "ASN" : -3.5, + "LYS" : -3.9, + "ARG" : -4.5, + } + return hmap[res] + + node_colours = [] + for n in subgraph.nodes(): + for k, v in nodes.items(): + if n in v: + node_colours.append(hydrophobicity_of_residue(n.split(':')[1])) else: - raise NotImplementedError( - f"Colour by {colour_nodes_by} not implemented." - ) - # TODO colour by AA type + raise NotImplementedError(f"Colour by {colour_nodes_by} not implemented.") + node_trace = go.Scatter( x=node_x, y=node_y, @@ -835,13 +881,13 @@ def asteroid_plot( textposition="bottom center", showlegend=False, marker=dict( - colorscale="YlGnBu", + colorscale="viridis", reversescale=True, color=node_colours, - size=degrees, + size=node_sizes, colorbar=dict( thickness=15, - title="Shell", + title=str.capitalize(colour_nodes_by), tickvals=list(range(k)), xanchor="left", titleside="right", From 21d496a2268d9e04fb48a2f83bcfbf0ea300607e Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 29 Jul 2022 11:16:08 +1000 Subject: [PATCH 08/20] add amino acid 3-letter code mapping to hydrophobicity scales from the literature. --- graphein/protein/resi_atoms.py | 143 +++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index 2c3d0f226..12b6abae9 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -789,6 +789,149 @@ SULPHUR_RESIS: List[str] = ["MET", "CYS"] """Residues containing sulphur atoms.""" +HYDROPHOBICITY_SCALES: Dict[str, Dict[str, float]] = { + "kd": { # kdHydrophobicity (a) + "ILE": 4.5, + "VAL": 4.2, + "LEU": 3.8, + "PHE": 2.8, + "CYS": 2.5, + "MET": 1.9, + "ALA": 1.8, + "GLY": -0.4, + "THR": -0.7, + "SER": -0.8, + "TRP": -0.9, + "TYR": -1.3, + "PRO": -1.6, + "HIS": -3.2, + "GLU": -3.5, + "GLN": -3.5, + "ASP": -3.5, + "ASN": -3.5, + "LYS": -3.9, + "ARG": -4.5, + }, + "ww": { # wwHydrophobicity (b) + "ILE": 0.31, + "VAL": -0.07, + "LEU": 0.56, + "PHE": 1.13, + "CYS": 0.24, + "MET": 0.23, + "ALA": -0.17, + "GLY": -0.01, + "THR": -0.14, + "SER": -0.13, + "TRP": 1.85, + "TYR": 0.94, + "PRO": -0.45, + "HIS": -0.96, + "GLU": -2.02, + "GLN": -0.58, + "ASP": -1.23, + "ASN": -0.42, + "LYS": -0.99, + "ARG": -0.81, + }, + "hh": { # hhHydrophobicity (c) + "ILE": -0.60, + "VAL": -0.31, + "LEU": -0.55, + "PHE": -0.32, + "CYS": -0.13, + "MET": -0.10, + "ALA": 0.11, + "GLY": 0.74, + "THR": 0.52, + "SER": 0.84, + "TRP": 0.30, + "TYR": 0.68, + "PRO": 2.23, + "HIS": 2.06, + "GLU": 2.68, + "GLN": 2.36, + "ASP": 3.49, + "ASN": 2.05, + "LYS": 2.71, + "ARG": 2.58, + }, + "mf": { # mfHydrophobicity (d) + "ILE": -1.56, + "VAL": -0.78, + "LEU": -1.81, + "PHE": -2.20, + "CYS": 0.49, + "MET": -0.76, + "ALA": 0.0, + "GLY": 1.72, + "THR": 1.78, + "SER": 1.83, + "TRP": -0.38, + "TYR": -1.09, + "PRO": -1.52, + "HIS": 4.76, + "GLU": 1.64, + "GLN": 3.01, + "ASP": 2.95, + "ASN": 3.47, + "LYS": 5.39, + "ARG": 3.71, + }, + "tt": { # ttHydrophobicity (e) + "ILE": 1.97, + "VAL": 1.46, + "LEU": 1.82, + "PHE": 1.98, + "CYS": -0.30, + "MET": 1.40, + "ALA": 0.38, + "GLY": -0.19, + "THR": -0.32, + "SER": -0.53, + "TRP": 1.53, + "TYR": 0.49, + "PRO": -1.44, + "HIS": -1.44, + "GLU": -2.90, + "GLN": -1.84, + "ASP": -3.27, + "ASN": -1.62, + "LYS": -3.46, + "ARG": -2.57, + } +} +""" +Set of (5) dictionaries that map amino acid 3-letter codes to their hydrophobicity. + +The scales included are from Chimera (UCSF) https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html +and are as follows: + + * kdHydrophobicity + (a) A simple method for displaying the hydropathic character of a protein. Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32. + https://www.ncbi.nlm.nih.gov/pubmed/7108955 + + * wwHydrophobicity + (b) Experimentally determined hydrophobicity scale for proteins at membrane interfaces. Wimley WC, White SH. Nat Struct Biol. 1996 Oct;3(10):842-8. + https://www.ncbi.nlm.nih.gov/pubmed/8836100 + + * hhHydrophobicity + (c) Recognition of transmembrane helices by the endoplasmic reticulum translocon. Hessa T, Kim H, Bihlmaier K, Lundin C, Boekel J, Andersson H, Nilsson I, White SH, von Heijne G. Nature. 2005 Jan 27;433(7024):377-81, supplementary data. + https://www.ncbi.nlm.nih.gov/pubmed/15674282 + + In this scale, negative values indicate greater hydrophobicity. + + * mfHydrophobicity + (d) Side-chain hydrophobicity scale derived from transmembrane protein folding into lipid bilayers. Moon CP, Fleming KG. Proc Natl Acad Sci USA. 2011 Jun 21;108(25):10174-7, supplementary data. + https://www.ncbi.nlm.nih.gov/pubmed/21606332 + + In this scale, negative values indicate greater hydrophobicity. + + * ttHydrophobicity + (e) An amino acid “transmembrane tendency” scale that approaches the theoretical limit to accuracy for prediction of transmembrane helices: relationship to biological hydrophobicity. Zhao G, London E. Protein Sci. 2006 Aug;15(8):1987-2001. + https://www.ncbi.nlm.nih.gov/pubmed/16877712 +""" + ISOELECTRIC_POINTS: Dict[str, float] = { "ALA": 6.11, "ARG": 10.76, From fbac7ea70aa04ea94cbffc95f5a36988bdc49448 Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 29 Jul 2022 11:40:48 +1000 Subject: [PATCH 09/20] colour by hydrophobicity implemented for different scales --- graphein/protein/visualisation.py | 41 +++++++++---------------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index cc2447a58..0d76ce6c2 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -24,6 +24,7 @@ from graphein.protein.subgraphs import extract_k_hop_subgraph from graphein.utils.utils import import_message +from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES log = logging.getLogger(__name__) @@ -835,47 +836,27 @@ def node_size_function(g: nx.Graph, feature: str): node_size = node_size_function(subgraph, size_nodes_by) node_sizes = [node_size_min + node_size(n) * node_size_multiplier for n in subgraph.nodes()] + colour_nodes_by = colour_nodes_by.lower() if colour_nodes_by == "shell": node_colours = [] for n in subgraph.nodes(): for k, v in nodes.items(): if n in v: node_colours.append(k) - elif colour_nodes_by == "hydrophobicity": - - """ - TODO Does a function like this already exist somewhere? - """ - def hydrophobicity_of_residue(res: str, mapping: str = 'a'): - hmap = { - "ILE" : 4.5, - "VAL" : 4.2, - "LEU" : 3.8, - "PHE" : 2.8, - "CYS" : 2.5, - "MET" : 1.9, - "ALA" : 1.8, - "GLY" : -0.4, - "THR" : -0.7, - "SER" : -0.8, - "TRP" : -0.9, - "TYR" : -1.3, - "PRO" : -1.6, - "HIS" : -3.2, - "GLU" : -3.5, - "GLN" : -3.5, - "ASP" : -3.5, - "ASN" : -3.5, - "LYS" : -3.9, - "ARG" : -4.5, - } - return hmap[res] + + # Hydrophobicity + p = re.compile("([a-z]{2})?-?(hydrophobicity)") # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" + match = p.search(colour_nodes_by) + if match and match.group(2): + scale: str = match.group(1) if match.group(1) else "kd" # use 'kdhydrophobicity' as default if no scale specified + try: hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] + except: raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") node_colours = [] for n in subgraph.nodes(): for k, v in nodes.items(): if n in v: - node_colours.append(hydrophobicity_of_residue(n.split(':')[1])) + node_colours.append(hydrophob[n.split(':')[1]]) else: raise NotImplementedError(f"Colour by {colour_nodes_by} not implemented.") From 5878e4f71e50ca65ae5b8fbdd07bca01f3544379 Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 29 Jul 2022 17:41:26 +1000 Subject: [PATCH 10/20] refactor `_node_feature` function; colour_by and size_by msupported with more features --- graphein/protein/visualisation.py | 175 +++++++++++++++++++++--------- 1 file changed, 123 insertions(+), 52 deletions(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index 0d76ce6c2..d0a184123 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -5,12 +5,13 @@ # Project Website: https://github.com/a-r-j/graphein # Code Repository: https://github.com/a-r-j/graphein from __future__ import annotations +from optparse import Option import re import logging import re from itertools import count -from typing import Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import matplotlib import matplotlib.pyplot as plt @@ -24,7 +25,9 @@ from graphein.protein.subgraphs import extract_k_hop_subgraph from graphein.utils.utils import import_message -from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES +from protein.resi_atoms import HYDROPHOBICITY_SCALES + +#### TODO: change to graphein.protein.resi_atoms log = logging.getLogger(__name__) @@ -49,6 +52,111 @@ ) +""" +TODO: Functino that gets ``min`` and ``max`` values in a graph for a given feature so that we can scale / offset to > 0 + +TODO: should feature `distance` actually contain the site itself i.e. in the string? +""" +def _node_feature_func( + g: nx.Graph, + feature: str, + focal_node: Optional[str] = None, + focal_point: Optional[tuple] = None, + no_negatives: bool = False, +) -> Callable: + """ + Maps a feature as described by a string to a function that can be applied on nodes from a graph. + + :param g: Protein graph. + :type g: nx.Graph + :param feature: Name of feature to extract. + :type feature: str + :param focal_node: A specific node within ``g`` to use in feature calculation; e.g. when calculating ``distance`` to a given site. + :type focal_node: Optional[str] + :param focal_point: Use specific coordinates instead of a node within the graph. + :type focal_point: tuple + :param no_negatives: Take the max of ``0`` and the feature's value. Defaults to ``False``. + :type no_negatives: bool + + :return: Function that returns a value for a given node ID. + :rtype: Callable + + TODO is there a way to wrap a lambda with another function i.e. max(0, f) for `no_negatives` ? + TODO some features do not require the graph to be supplied e.g. hydrophobicity mapping from residue 3-letter code. Handle this? + """ + if feature == "degree": + return lambda k: g.degree[k] + + if feature in ["seq-position", "seq_position"]: + return lambda k: int(k.split(':')[-1]) + + elif feature == "rsa": + return lambda k: g.nodes(data=True)[k]["rsa"] + + elif feature in ["bfac", "bfactor", "b_factor", "b-factor"]: + return lambda k: g.nodes(data=True)[k]["b_factor"] + + elif feature == "distance": # Euclidean distance to a specific node / coordinate + def get_coords(g: nx.Graph, node: str) -> np.ndarray: + return np.array(g.nodes()[node]["coords"]) + + if focal_node: + assert focal_node in g.nodes() + return lambda k: np.linalg.norm(get_coords(g, k) - get_coords(g, focal_node)) + elif focal_point: + assert len(focal_point) == 3 + return lambda k: np.linalg.norm(get_coords(g, k) - np.array(focal_point)) + else: + raise ValueError(f"Node feature 'distance' requires one of `focal_node` or `focal_point`.") + + # Meiler embedding dimension + p = re.compile("meiler-?([0-9])") + match = p.search(feature) + if match: + dim = match.group(1) + if int(dim) in range(1,8): + if no_negatives: return lambda k: max(0, g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]) + else: return lambda k: g.nodes(data=True)[k]["meiler"][f"dim_{dim}"] + else: + raise ValueError(f"Meiler embeddings have dimensions 1-7, received {dim}.") + + # Hydrophobicity + p = re.compile("([a-z]{2})?-?(hydrophobicity)") # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" + match = p.search(feature) + if match and match.group(2): + + # TODO: check if nodes actually have 'hydrophobicity' already; if they do, then use this. if not, then map to kd. + scale: str = match.group(1) if match.group(1) else "kd" # use 'kdhydrophobicity' as default if no scale specified + try: hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] + except: raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") + return lambda k: hydrophob[k.split(':')[1]] + + else: + raise NotImplementedError(f"Feature '{feature}' not implemented.") + + +def _node_size_func( + g: nx.Graph, + feature: str, + min: float, + multiplier: float +) -> Callable: + """ + Returns a function that can be use to generate node sizes for plotting. + + :param g: Protein graph + :type g: nx.Graph + :param feature: Name of feature to scale node sizes by. + :type feature: str + :param min: Number to offset size with. + :type min: float + :param multiplier: Number to scale feature values by. + :type multiplier: float + """ + get_feature = _node_feature_func(g=g, feature=feature, no_negatives=True) + return lambda k: min + multiplier * get_feature(k) + + def plot_pointcloud(mesh: Meshes, title: str = "") -> Axes3D: """ Plots pytorch3d Meshes object as pointcloud. @@ -245,27 +353,7 @@ def plotly_protein_structure_graph( G, colour_map=edge_color_map, colour_by=colour_edges_by ) - # Get node size - def node_scale_by(G: nx.Graph, feature: str): - if feature == "degree": - return lambda k: node_size_min + node_size_multiplier * G.degree[k] - elif feature == "rsa": - return ( - lambda k: node_size_min - + node_size_multiplier * G.nodes(data=True)[k]["rsa"] - ) - - # Meiler embedding dimension - p = re.compile("meiler-([1-7])") - dim = p.search(feature).group(1) - if dim: - return lambda k: node_size_min + node_size_multiplier * max( - 0, G.nodes(data=True)[k]["meiler"][f"dim_{dim}"] - ) # Meiler values may be negative - else: - raise ValueError(f"Cannot size nodes by feature '{feature}'") - - get_node_size = node_scale_by(G, node_size_feature) + size_by = _node_size_func(G, node_size_feature, min=node_size_min, multiplier=node_size_multiplier) # 3D network plot x_nodes = [] @@ -279,7 +367,7 @@ def node_scale_by(G: nx.Graph, feature: str): x_nodes.append(value[0]) y_nodes.append(value[1]) z_nodes.append(value[2]) - node_sizes.append(get_node_size(key)) + node_sizes.append(size_by(key)) if label_node_ids: node_labels.append(list(G.nodes())[i]) @@ -824,42 +912,25 @@ def asteroid_plot( x, y = subgraph.nodes[node]["pos"] node_x.append(x) node_y.append(y) - - def node_size_function(g: nx.Graph, feature: str): - if feature == 'degree': - return lambda k : g.degree(k) - elif feature == 'rsa': - return lambda k : g.nodes(data=True)[k]['rsa'] - else: - raise NotImplementedError(f"Size by {size_nodes_by} not implemented.") - - node_size = node_size_function(subgraph, size_nodes_by) - node_sizes = [node_size_min + node_size(n) * node_size_multiplier for n in subgraph.nodes()] + + size_by = _node_size_func(subgraph, size_nodes_by, min=node_size_min, multiplier=node_size_multiplier) + node_sizes = [size_by(n) for n in subgraph.nodes()] colour_nodes_by = colour_nodes_by.lower() + node_colours = [] if colour_nodes_by == "shell": - node_colours = [] for n in subgraph.nodes(): for k, v in nodes.items(): if n in v: node_colours.append(k) - - # Hydrophobicity - p = re.compile("([a-z]{2})?-?(hydrophobicity)") # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" - match = p.search(colour_nodes_by) - if match and match.group(2): - scale: str = match.group(1) if match.group(1) else "kd" # use 'kdhydrophobicity' as default if no scale specified - try: hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] - except: raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") - - node_colours = [] - for n in subgraph.nodes(): - for k, v in nodes.items(): - if n in v: - node_colours.append(hydrophob[n.split(':')[1]]) else: - raise NotImplementedError(f"Colour by {colour_nodes_by} not implemented.") - + try: get_feature = _node_feature_func(g=subgraph, feature=colour_nodes_by, no_negatives=False) + except: raise NotImplementedError(f"Colour by {colour_nodes_by} not implemented.") + + for n, d in subgraph.nodes(data=True): + node_colours.append(get_feature(n)) + print(f"value: {get_feature(n)}") + node_trace = go.Scatter( x=node_x, y=node_y, From 8948b1f447cae1e3b017e8127fdc74d79d8aed8b Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 29 Jul 2022 17:48:49 +1000 Subject: [PATCH 11/20] fix import statement to use graphein actual --- graphein/protein/visualisation.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index d0a184123..e27ed513e 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -25,9 +25,7 @@ from graphein.protein.subgraphs import extract_k_hop_subgraph from graphein.utils.utils import import_message -from protein.resi_atoms import HYDROPHOBICITY_SCALES - -#### TODO: change to graphein.protein.resi_atoms +from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES log = logging.getLogger(__name__) From 1212d5e830d24d171674c0c4f2421d8b9a2efff7 Mon Sep 17 00:00:00 2001 From: Cam Date: Fri, 29 Jul 2022 18:05:14 +1000 Subject: [PATCH 12/20] add `hydrophobicity()`. not sure if should be passed a parameter deciding which scale to use; or just return vector of values for all 5 scales. --- graphein/protein/features/nodes/amino_acid.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py index dfa109ec4..2270de278 100644 --- a/graphein/protein/features/nodes/amino_acid.py +++ b/graphein/protein/features/nodes/amino_acid.py @@ -18,6 +18,7 @@ HYDROGEN_BOND_ACCEPTORS, HYDROGEN_BOND_DONORS, RESI_THREE_TO_1, + HYDROPHOBICITY_SCALES ) from graphein.utils.utils import onek_encoding_unk @@ -248,3 +249,46 @@ def hydrogen_bond_acceptor( if not sum_features: features = np.array(features > 0).astype(int) d["hbond_acceptors"] = features + + + +""" +TODO: add a similar 'load in' function from .csv as above? +or stick to hydrophobicity as dict? + +TODO: make vector of all hydrophobicity scales instead of one chosen scale? + +TODO: sum features bool? +""" +def hydrophobicity( + n: str, + d: Dict[str, any], + mapping: str = "kd", + return_array: bool = True, + +) -> None: + """ + :param n: node ID + :type n: str + :param d: dict of node attributes + :type d: Dict[str, any] + + :param mapping: which hydrophobicity scale to use. + :type mapping: str + :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns a ``pd.Series``. Default is ``True``. + :type return_array: bool + """ + assert mapping in HYDROPHOBICITY_SCALES.keys() + hydr = HYDROPHOBICITY_SCALES[mapping] + + amino_acid = d["residue_name"] + try: + features = hydr[amino_acid] + except: + features = pd.Series(np.zeros(1)) + + if return_array: + features = np.array(features) + + d["hydrophobicity"] = features + return features From 6b29498d42cf13298bf1118b282c4adec8cebfe7 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 2 Aug 2022 01:19:53 +0200 Subject: [PATCH 13/20] "Add a utility for getting the names of node, edge and graph attributes present" --- graphein/protein/features/nodes/amino_acid.py | 40 +++++++++---------- graphein/protein/resi_atoms.py | 5 ++- graphein/protein/visualisation.py | 16 +------- graphein/utils/utils.py | 34 ++++++++++++++++ 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py index 2270de278..9f79f671d 100644 --- a/graphein/protein/features/nodes/amino_acid.py +++ b/graphein/protein/features/nodes/amino_acid.py @@ -8,7 +8,7 @@ import logging from functools import lru_cache from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Literal import numpy as np import pandas as pd @@ -18,7 +18,8 @@ HYDROGEN_BOND_ACCEPTORS, HYDROGEN_BOND_DONORS, RESI_THREE_TO_1, - HYDROPHOBICITY_SCALES + HYDROPHOBICITY_SCALES, + HYDROPHOBICITY_TYPES ) from graphein.utils.utils import onek_encoding_unk @@ -251,34 +252,29 @@ def hydrogen_bond_acceptor( d["hbond_acceptors"] = features - -""" -TODO: add a similar 'load in' function from .csv as above? -or stick to hydrophobicity as dict? - -TODO: make vector of all hydrophobicity scales instead of one chosen scale? - -TODO: sum features bool? -""" def hydrophobicity( n: str, d: Dict[str, any], - mapping: str = "kd", + mapping: HYDROPHOBICITY_TYPE = "kd", return_array: bool = True, - -) -> None: +) -> Union[np.ndarray, pd.Series]: """ - :param n: node ID + Adds hydrophobicity values for each residue to graph nodes. + See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for + values and available scales. + + :param n: Node ID. Unused - kept to maintain consistent function signature. :type n: str - :param d: dict of node attributes + :param d: Dictionary of node attributes. :type d: Dict[str, any] - - :param mapping: which hydrophobicity scale to use. - :type mapping: str - :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns a ``pd.Series``. Default is ``True``. + :param mapping: Which hydrophobicity scale to use. See + :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_TYPE` for supported types. + :type mapping: graphien.protein.resi_atoms.HYDROPHOBICITY_TYPE + :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns + a ``pd.Series``. Default is ``True``. :type return_array: bool """ - assert mapping in HYDROPHOBICITY_SCALES.keys() + assert mapping in HYDROPHOBICITY_SCALES.keys(), f"Unsupported mapping: {mapping}. Supported mappings: {HYDROPHOBICITY_SCALES.keys()}" hydr = HYDROPHOBICITY_SCALES[mapping] amino_acid = d["residue_name"] @@ -290,5 +286,5 @@ def hydrophobicity( if return_array: features = np.array(features) - d["hydrophobicity"] = features + d[f"hydrophobicity_{mapping}"] = features return features diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index b7aea3cbd..50fd5e43f 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -14,7 +14,7 @@ # Code Repository: https://github.com/a-r-j/graphein -from typing import Dict, List +from typing import Dict, List, Literal import numpy as np from sklearn.preprocessing import StandardScaler @@ -836,6 +836,9 @@ https://pubs.acs.org/doi/10.1021/j100785a001 """ +HYDROPHOBICITY_TYPE = Literal["kd", "ww", "hh", "mf", "tt"] +"""Supported hydrophobicity types. See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for further details.""" + HYDROPHOBICITY_SCALES: Dict[str, Dict[str, float]] = { "kd": { # kdHydrophobicity (a) "ILE": 4.5, diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index e27ed513e..935706a23 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -5,7 +5,6 @@ # Project Website: https://github.com/a-r-j/graphein # Code Repository: https://github.com/a-r-j/graphein from __future__ import annotations -from optparse import Option import re import logging @@ -52,7 +51,6 @@ """ TODO: Functino that gets ``min`` and ``max`` values in a graph for a given feature so that we can scale / offset to > 0 - TODO: should feature `distance` actually contain the site itself i.e. in the string? """ def _node_feature_func( @@ -75,29 +73,22 @@ def _node_feature_func( :type focal_point: tuple :param no_negatives: Take the max of ``0`` and the feature's value. Defaults to ``False``. :type no_negatives: bool - :return: Function that returns a value for a given node ID. :rtype: Callable TODO is there a way to wrap a lambda with another function i.e. max(0, f) for `no_negatives` ? - TODO some features do not require the graph to be supplied e.g. hydrophobicity mapping from residue 3-letter code. Handle this? """ if feature == "degree": return lambda k: g.degree[k] - - if feature in ["seq-position", "seq_position"]: - return lambda k: int(k.split(':')[-1]) - + elif feature in ["seq-position", "seq_position"]: + return lambda k: g.nodes(data=True)[k]["residue_number"] elif feature == "rsa": return lambda k: g.nodes(data=True)[k]["rsa"] - elif feature in ["bfac", "bfactor", "b_factor", "b-factor"]: return lambda k: g.nodes(data=True)[k]["b_factor"] - elif feature == "distance": # Euclidean distance to a specific node / coordinate def get_coords(g: nx.Graph, node: str) -> np.ndarray: return np.array(g.nodes()[node]["coords"]) - if focal_node: assert focal_node in g.nodes() return lambda k: np.linalg.norm(get_coords(g, k) - get_coords(g, focal_node)) @@ -122,13 +113,11 @@ def get_coords(g: nx.Graph, node: str) -> np.ndarray: p = re.compile("([a-z]{2})?-?(hydrophobicity)") # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" match = p.search(feature) if match and match.group(2): - # TODO: check if nodes actually have 'hydrophobicity' already; if they do, then use this. if not, then map to kd. scale: str = match.group(1) if match.group(1) else "kd" # use 'kdhydrophobicity' as default if no scale specified try: hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] except: raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") return lambda k: hydrophob[k.split(':')[1]] - else: raise NotImplementedError(f"Feature '{feature}' not implemented.") @@ -927,7 +916,6 @@ def asteroid_plot( for n, d in subgraph.nodes(data=True): node_colours.append(get_feature(n)) - print(f"value: {get_feature(n)}") node_trace = go.Scatter( x=node_x, diff --git a/graphein/utils/utils.py b/graphein/utils/utils.py index dd46e4c3d..888cadc3b 100644 --- a/graphein/utils/utils.py +++ b/graphein/utils/utils.py @@ -384,3 +384,37 @@ def ping(host: str) -> bool: command = ["ping", param, "1", host] return subprocess.call(command) == 0 + + +def get_node_attribute_names(g: nx.Graph) -> List[str]: + """Returns a list of node attribute names present within a graph. + + :param g: Networkx Graph. + :type g: nx.Graph + :returns: List of node attribute names + :rtype: List[str] + """ + + return list(set(np.array([list(g.nodes[n].keys()) for n in g.nodes()]).flatten())) + +def get_edge_attribute_names(g: nx.Graph) -> List[str]: + """Returns a list of edge attribute names present within a graph. + + :param g: Networkx Graph. + :type g: nx.Graph + :returns: List of edge attribute names + :rtype: List[str] + """ + return list(set(np.array([list(g.edges[u][v].keys()) for u, v in g.edges()]).flatten())) + + +def get_graph_attribute_names(g: nx.Graph) -> List[str]: + """Returns a list of graph attribute names present within a graph. + + :param g: Networkx Graph. + :type g: nx.Graph + :returns: List of graph attribute names + :rtype: List[str] + """ + return list(g.graph.keys()) + From 00f99a522a964e12a2d3f7a8c5510d5ba70cd12f Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 2 Aug 2022 01:28:46 +0200 Subject: [PATCH 14/20] fix edge attribute selection in util --- graphein/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/utils/utils.py b/graphein/utils/utils.py index 888cadc3b..d15091d45 100644 --- a/graphein/utils/utils.py +++ b/graphein/utils/utils.py @@ -405,7 +405,7 @@ def get_edge_attribute_names(g: nx.Graph) -> List[str]: :returns: List of edge attribute names :rtype: List[str] """ - return list(set(np.array([list(g.edges[u][v].keys()) for u, v in g.edges()]).flatten())) + return list(set(np.array([list(g.edges[u, v].keys()) for u, v in g.edges()]).flatten())) def get_graph_attribute_names(g: nx.Graph) -> List[str]: From 28b0ee18b872de67e92b38c0a343924c8e5850c5 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 2 Aug 2022 01:31:10 +0200 Subject: [PATCH 15/20] add test for attribute name selection util --- tests/utils/test_utils.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 tests/utils/test_utils.py diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py new file mode 100644 index 000000000..0ae05ff02 --- /dev/null +++ b/tests/utils/test_utils.py @@ -0,0 +1,23 @@ +from graphein.protein.graphs import construct_graph +from graphein.utils.utils import get_graph_attribute_names, get_node_attribute_names, get_edge_attribute_names + + +def test_get_graph_attribute_names(): + g = gp.construct_graph(pdb_code="3eiy") + DEFAULT_ATTRS = ["name", "pdb_code", "pdb_path", "chain_ids", "pdb_df", "raw_pdb_df", "rgroup_df", "coords", "node_type", "sequence_A", "config", "dist_mat"] + graph_attrs = get_graph_attribute_names(g) + assert set(graph_attrs) == set(DEFAULT_ATTRS), "Graph attributes do not match expected attributes." + + +def test_get_node_attribute_names(): + g = construct_graph(pdb_code="3eiy") + DEFAULT_ATTRS = ["chain_id", "residue_name", "residue_number", "atom_type", "element_symbol", "coords", "b_factor", "meiler"] + node_attrs = get_node_attribute_names(g) + assert set(node_attrs) == set(DEFAULT_ATTRS), "Node attributes do not match expected attributes." + + +def test_get_edge_attribute_names(): + g = construct_graph(pdb_code="3eiy") + DEFAULT_ATTRS = ["kind", "distance"] + edge_attrs = get_edge_attribute_names(g) + assert set(edge_attrs) == set(DEFAULT_ATTRS), "Edge attributes do not match expected attributes." From 8f22fed33954eb8324a8daeddb26fd7b150c8822 Mon Sep 17 00:00:00 2001 From: Arian Jamasb Date: Tue, 2 Aug 2022 04:28:45 +0200 Subject: [PATCH 16/20] use typing_extensions Literal for 3.7 support and update changelog --- CHANGELOG.md | 2 +- graphein/protein/features/nodes/amino_acid.py | 2 +- graphein/protein/resi_atoms.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d0d543c6..4b49fa61c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ### 1.5.1 * [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds support for sizing and colouring nodes in asteroid plots - +* [Feature] - [#197](https://github.com/a-r-j/graphein/pull/197/) adds utilities for retrieving a list of graph/node/edge attribute names in `graphein.utils.utils`. #### Protein diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py index 9f79f671d..7e6b13bd3 100644 --- a/graphein/protein/features/nodes/amino_acid.py +++ b/graphein/protein/features/nodes/amino_acid.py @@ -8,7 +8,7 @@ import logging from functools import lru_cache from pathlib import Path -from typing import Any, Dict, List, Optional, Union, Literal +from typing import Any, Dict, List, Optional, Union import numpy as np import pandas as pd diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index 50fd5e43f..013301944 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -14,7 +14,8 @@ # Code Repository: https://github.com/a-r-j/graphein -from typing import Dict, List, Literal +from typing import Dict, List +from typing_extensions import Literal import numpy as np from sklearn.preprocessing import StandardScaler From 2b69d6e5d51258f05932474dfc9d4f1a1602d96b Mon Sep 17 00:00:00 2001 From: a-r-j Date: Sun, 23 Oct 2022 19:08:50 +0200 Subject: [PATCH 17/20] docstring, black --- tests/utils/test_utils.py | 48 +++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 0ae05ff02..83a20ee05 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -1,23 +1,57 @@ +"""Tests for graphein.utils.utils""" + from graphein.protein.graphs import construct_graph -from graphein.utils.utils import get_graph_attribute_names, get_node_attribute_names, get_edge_attribute_names +from graphein.utils.utils import ( + get_edge_attribute_names, + get_graph_attribute_names, + get_node_attribute_names, +) def test_get_graph_attribute_names(): - g = gp.construct_graph(pdb_code="3eiy") - DEFAULT_ATTRS = ["name", "pdb_code", "pdb_path", "chain_ids", "pdb_df", "raw_pdb_df", "rgroup_df", "coords", "node_type", "sequence_A", "config", "dist_mat"] + g = construct_graph(pdb_code="3eiy") + DEFAULT_ATTRS = [ + "name", + "pdb_code", + "pdb_path", + "chain_ids", + "pdb_df", + "raw_pdb_df", + "rgroup_df", + "coords", + "node_type", + "sequence_A", + "config", + "dist_mat", + ] graph_attrs = get_graph_attribute_names(g) - assert set(graph_attrs) == set(DEFAULT_ATTRS), "Graph attributes do not match expected attributes." + assert set(graph_attrs) == set( + DEFAULT_ATTRS + ), "Graph attributes do not match expected attributes." def test_get_node_attribute_names(): g = construct_graph(pdb_code="3eiy") - DEFAULT_ATTRS = ["chain_id", "residue_name", "residue_number", "atom_type", "element_symbol", "coords", "b_factor", "meiler"] + DEFAULT_ATTRS = [ + "chain_id", + "residue_name", + "residue_number", + "atom_type", + "element_symbol", + "coords", + "b_factor", + "meiler", + ] node_attrs = get_node_attribute_names(g) - assert set(node_attrs) == set(DEFAULT_ATTRS), "Node attributes do not match expected attributes." + assert set(node_attrs) == set( + DEFAULT_ATTRS + ), "Node attributes do not match expected attributes." def test_get_edge_attribute_names(): g = construct_graph(pdb_code="3eiy") DEFAULT_ATTRS = ["kind", "distance"] edge_attrs = get_edge_attribute_names(g) - assert set(edge_attrs) == set(DEFAULT_ATTRS), "Edge attributes do not match expected attributes." + assert set(edge_attrs) == set( + DEFAULT_ATTRS + ), "Edge attributes do not match expected attributes." From 9743332e0e1b9ddf6b7d02aa1350dfd470680742 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Sun, 23 Oct 2022 19:09:36 +0200 Subject: [PATCH 18/20] black --- graphein/utils/utils.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/graphein/utils/utils.py b/graphein/utils/utils.py index d15091d45..4d7498ff9 100644 --- a/graphein/utils/utils.py +++ b/graphein/utils/utils.py @@ -388,33 +388,41 @@ def ping(host: str) -> bool: def get_node_attribute_names(g: nx.Graph) -> List[str]: """Returns a list of node attribute names present within a graph. - + :param g: Networkx Graph. :type g: nx.Graph :returns: List of node attribute names :rtype: List[str] """ - return list(set(np.array([list(g.nodes[n].keys()) for n in g.nodes()]).flatten())) + return list( + set(np.array([list(g.nodes[n].keys()) for n in g.nodes()]).flatten()) + ) + def get_edge_attribute_names(g: nx.Graph) -> List[str]: """Returns a list of edge attribute names present within a graph. - + :param g: Networkx Graph. :type g: nx.Graph :returns: List of edge attribute names :rtype: List[str] """ - return list(set(np.array([list(g.edges[u, v].keys()) for u, v in g.edges()]).flatten())) + return list( + set( + np.array( + [list(g.edges[u, v].keys()) for u, v in g.edges()] + ).flatten() + ) + ) def get_graph_attribute_names(g: nx.Graph) -> List[str]: """Returns a list of graph attribute names present within a graph. - + :param g: Networkx Graph. :type g: nx.Graph :returns: List of graph attribute names :rtype: List[str] """ return list(g.graph.keys()) - From 43bc240916b7b08fda99fe7e4457b242c1168d89 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Sun, 23 Oct 2022 19:21:40 +0200 Subject: [PATCH 19/20] fix type import; black --- graphein/protein/features/nodes/amino_acid.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/graphein/protein/features/nodes/amino_acid.py b/graphein/protein/features/nodes/amino_acid.py index 7e6b13bd3..91069d6da 100644 --- a/graphein/protein/features/nodes/amino_acid.py +++ b/graphein/protein/features/nodes/amino_acid.py @@ -17,9 +17,9 @@ BASE_AMINO_ACIDS, HYDROGEN_BOND_ACCEPTORS, HYDROGEN_BOND_DONORS, - RESI_THREE_TO_1, HYDROPHOBICITY_SCALES, - HYDROPHOBICITY_TYPES + HYDROPHOBICITY_TYPE, + RESI_THREE_TO_1, ) from graphein.utils.utils import onek_encoding_unk @@ -253,28 +253,30 @@ def hydrogen_bond_acceptor( def hydrophobicity( - n: str, - d: Dict[str, any], + n: str, + d: Dict[str, Any], mapping: HYDROPHOBICITY_TYPE = "kd", return_array: bool = True, ) -> Union[np.ndarray, pd.Series]: """ - Adds hydrophobicity values for each residue to graph nodes. - See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for + Adds hydrophobicity values for each residue to graph nodes. + See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for values and available scales. :param n: Node ID. Unused - kept to maintain consistent function signature. :type n: str :param d: Dictionary of node attributes. - :type d: Dict[str, any] - :param mapping: Which hydrophobicity scale to use. See + :type d: Dict[str, Any] + :param mapping: Which hydrophobicity scale to use. See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_TYPE` for supported types. :type mapping: graphien.protein.resi_atoms.HYDROPHOBICITY_TYPE - :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns + :param return_array: If ``True``, returns a ``np.ndarray``, otherwise returns a ``pd.Series``. Default is ``True``. :type return_array: bool """ - assert mapping in HYDROPHOBICITY_SCALES.keys(), f"Unsupported mapping: {mapping}. Supported mappings: {HYDROPHOBICITY_SCALES.keys()}" + assert ( + mapping in HYDROPHOBICITY_SCALES.keys() + ), f"Unsupported mapping: {mapping}. Supported mappings: {HYDROPHOBICITY_SCALES.keys()}" hydr = HYDROPHOBICITY_SCALES[mapping] amino_acid = d["residue_name"] From fd1f996569b0b771e1292f8581341be0d7cd8396 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Feb 2023 23:40:10 +0000 Subject: [PATCH 20/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- graphein/protein/resi_atoms.py | 34 ++++----- graphein/protein/visualisation.py | 113 ++++++++++++++++++++---------- graphein/utils/utils.py | 3 +- 3 files changed, 93 insertions(+), 57 deletions(-) diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index c588a1075..71ed2eb62 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -15,10 +15,10 @@ from typing import Dict, List, Union -from typing_extensions import Literal import numpy as np from sklearn.preprocessing import StandardScaler +from typing_extensions import Literal BACKBONE_ATOMS: List[str] = ["N", "CA", "C", "O"] """Atoms present in Amino Acid Backbones.""" @@ -1094,7 +1094,7 @@ """Supported hydrophobicity types. See :const:`~graphein.protein.resi_atoms.HYDROPHOBICITY_SCALES` for further details.""" HYDROPHOBICITY_SCALES: Dict[str, Dict[str, float]] = { - "kd": { # kdHydrophobicity (a) + "kd": { # kdHydrophobicity (a) "ILE": 4.5, "VAL": 4.2, "LEU": 3.8, @@ -1116,7 +1116,7 @@ "LYS": -3.9, "ARG": -4.5, }, - "ww": { # wwHydrophobicity (b) + "ww": { # wwHydrophobicity (b) "ILE": 0.31, "VAL": -0.07, "LEU": 0.56, @@ -1138,7 +1138,7 @@ "LYS": -0.99, "ARG": -0.81, }, - "hh": { # hhHydrophobicity (c) + "hh": { # hhHydrophobicity (c) "ILE": -0.60, "VAL": -0.31, "LEU": -0.55, @@ -1160,7 +1160,7 @@ "LYS": 2.71, "ARG": 2.58, }, - "mf": { # mfHydrophobicity (d) + "mf": { # mfHydrophobicity (d) "ILE": -1.56, "VAL": -0.78, "LEU": -1.81, @@ -1182,8 +1182,8 @@ "LYS": 5.39, "ARG": 3.71, }, - "tt": { # ttHydrophobicity (e) - "ILE": 1.97, + "tt": { # ttHydrophobicity (e) + "ILE": 1.97, "VAL": 1.46, "LEU": 1.82, "PHE": 1.98, @@ -1202,18 +1202,18 @@ "ASP": -3.27, "ASN": -1.62, "LYS": -3.46, - "ARG": -2.57, - } + "ARG": -2.57, + }, } """ -Set of (5) dictionaries that map amino acid 3-letter codes to their hydrophobicity. +Set of (5) dictionaries that map amino acid 3-letter codes to their hydrophobicity. -The scales included are from Chimera (UCSF) https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html +The scales included are from Chimera (UCSF) https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/midas/hydrophob.html and are as follows: * kdHydrophobicity - (a) A simple method for displaying the hydropathic character of a protein. Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32. - https://www.ncbi.nlm.nih.gov/pubmed/7108955 + (a) A simple method for displaying the hydropathic character of a protein. Kyte J, Doolittle RF. J Mol Biol. 1982 May 5;157(1):105-32. + https://www.ncbi.nlm.nih.gov/pubmed/7108955 * wwHydrophobicity (b) Experimentally determined hydrophobicity scale for proteins at membrane interfaces. Wimley WC, White SH. Nat Struct Biol. 1996 Oct;3(10):842-8. @@ -1223,17 +1223,17 @@ (c) Recognition of transmembrane helices by the endoplasmic reticulum translocon. Hessa T, Kim H, Bihlmaier K, Lundin C, Boekel J, Andersson H, Nilsson I, White SH, von Heijne G. Nature. 2005 Jan 27;433(7024):377-81, supplementary data. https://www.ncbi.nlm.nih.gov/pubmed/15674282 - In this scale, negative values indicate greater hydrophobicity. + In this scale, negative values indicate greater hydrophobicity. * mfHydrophobicity (d) Side-chain hydrophobicity scale derived from transmembrane protein folding into lipid bilayers. Moon CP, Fleming KG. Proc Natl Acad Sci USA. 2011 Jun 21;108(25):10174-7, supplementary data. - https://www.ncbi.nlm.nih.gov/pubmed/21606332 + https://www.ncbi.nlm.nih.gov/pubmed/21606332 In this scale, negative values indicate greater hydrophobicity. - + * ttHydrophobicity (e) An amino acid “transmembrane tendency” scale that approaches the theoretical limit to accuracy for prediction of transmembrane helices: relationship to biological hydrophobicity. Zhao G, London E. Protein Sci. 2006 Aug;15(8):1987-2001. - https://www.ncbi.nlm.nih.gov/pubmed/16877712 + https://www.ncbi.nlm.nih.gov/pubmed/16877712 """ ISOELECTRIC_POINTS: Dict[str, float] = { diff --git a/graphein/protein/visualisation.py b/graphein/protein/visualisation.py index 72b4fef6b..70556e232 100644 --- a/graphein/protein/visualisation.py +++ b/graphein/protein/visualisation.py @@ -21,9 +21,9 @@ from loguru import logger as log from mpl_toolkits.mplot3d import Axes3D +from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES from graphein.protein.subgraphs import extract_k_hop_subgraph from graphein.utils.utils import import_message -from graphein.protein.resi_atoms import HYDROPHOBICITY_SCALES try: from pytorch3d.ops import sample_points_from_meshes @@ -49,23 +49,25 @@ TODO: Functino that gets ``min`` and ``max`` values in a graph for a given feature so that we can scale / offset to > 0 TODO: should feature `distance` actually contain the site itself i.e. in the string? """ + + def _node_feature_func( - g: nx.Graph, - feature: str, + g: nx.Graph, + feature: str, focal_node: Optional[str] = None, focal_point: Optional[tuple] = None, no_negatives: bool = False, ) -> Callable: """ - Maps a feature as described by a string to a function that can be applied on nodes from a graph. + Maps a feature as described by a string to a function that can be applied on nodes from a graph. :param g: Protein graph. :type g: nx.Graph :param feature: Name of feature to extract. :type feature: str - :param focal_node: A specific node within ``g`` to use in feature calculation; e.g. when calculating ``distance`` to a given site. + :param focal_node: A specific node within ``g`` to use in feature calculation; e.g. when calculating ``distance`` to a given site. :type focal_node: Optional[str] - :param focal_point: Use specific coordinates instead of a node within the graph. + :param focal_point: Use specific coordinates instead of a node within the graph. :type focal_point: tuple :param no_negatives: Take the max of ``0`` and the feature's value. Defaults to ``False``. :type no_negatives: bool @@ -74,7 +76,7 @@ def _node_feature_func( TODO is there a way to wrap a lambda with another function i.e. max(0, f) for `no_negatives` ? """ - if feature == "degree": + if feature == "degree": return lambda k: g.degree[k] elif feature in ["seq-position", "seq_position"]: return lambda k: g.nodes(data=True)[k]["residue_number"] @@ -82,63 +84,82 @@ def _node_feature_func( return lambda k: g.nodes(data=True)[k]["rsa"] elif feature in ["bfac", "bfactor", "b_factor", "b-factor"]: return lambda k: g.nodes(data=True)[k]["b_factor"] - elif feature == "distance": # Euclidean distance to a specific node / coordinate + elif ( + feature == "distance" + ): # Euclidean distance to a specific node / coordinate + def get_coords(g: nx.Graph, node: str) -> np.ndarray: - return np.array(g.nodes()[node]["coords"]) + return np.array(g.nodes()[node]["coords"]) + if focal_node: assert focal_node in g.nodes() - return lambda k: np.linalg.norm(get_coords(g, k) - get_coords(g, focal_node)) + return lambda k: np.linalg.norm( + get_coords(g, k) - get_coords(g, focal_node) + ) elif focal_point: assert len(focal_point) == 3 - return lambda k: np.linalg.norm(get_coords(g, k) - np.array(focal_point)) - else: - raise ValueError(f"Node feature 'distance' requires one of `focal_node` or `focal_point`.") + return lambda k: np.linalg.norm( + get_coords(g, k) - np.array(focal_point) + ) + else: + raise ValueError( + f"Node feature 'distance' requires one of `focal_node` or `focal_point`." + ) # Meiler embedding dimension p = re.compile("meiler-?([0-9])") match = p.search(feature) if match: dim = match.group(1) - if int(dim) in range(1,8): - if no_negatives: return lambda k: max(0, g.nodes(data=True)[k]["meiler"][f"dim_{dim}"]) - else: return lambda k: g.nodes(data=True)[k]["meiler"][f"dim_{dim}"] + if int(dim) in range(1, 8): + if no_negatives: + return lambda k: max( + 0, g.nodes(data=True)[k]["meiler"][f"dim_{dim}"] + ) + else: + return lambda k: g.nodes(data=True)[k]["meiler"][f"dim_{dim}"] else: - raise ValueError(f"Meiler embeddings have dimensions 1-7, received {dim}.") - + raise ValueError( + f"Meiler embeddings have dimensions 1-7, received {dim}." + ) + # Hydrophobicity - p = re.compile("([a-z]{2})?-?(hydrophobicity)") # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" + p = re.compile( + "([a-z]{2})?-?(hydrophobicity)" + ) # e.g. "kd-hydrophobicity", "tthydrophobicity", "hydrophobicity" match = p.search(feature) if match and match.group(2): # TODO: check if nodes actually have 'hydrophobicity' already; if they do, then use this. if not, then map to kd. - scale: str = match.group(1) if match.group(1) else "kd" # use 'kdhydrophobicity' as default if no scale specified - try: hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] - except: raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") - return lambda k: hydrophob[k.split(':')[1]] + scale: str = ( + match.group(1) if match.group(1) else "kd" + ) # use 'kdhydrophobicity' as default if no scale specified + try: + hydrophob: Dict[str, float] = HYDROPHOBICITY_SCALES[scale] + except: + raise KeyError(f"'{scale}' not a valid hydrophobicity scale.") + return lambda k: hydrophob[k.split(":")[1]] else: raise NotImplementedError(f"Feature '{feature}' not implemented.") def _node_size_func( - g: nx.Graph, - feature: str, - min: float, - multiplier: float + g: nx.Graph, feature: str, min: float, multiplier: float ) -> Callable: """ Returns a function that can be use to generate node sizes for plotting. - :param g: Protein graph + :param g: Protein graph :type g: nx.Graph - :param feature: Name of feature to scale node sizes by. + :param feature: Name of feature to scale node sizes by. :type feature: str - :param min: Number to offset size with. + :param min: Number to offset size with. :type min: float :param multiplier: Number to scale feature values by. :type multiplier: float """ get_feature = _node_feature_func(g=g, feature=feature, no_negatives=True) return lambda k: min + multiplier * get_feature(k) - + def plot_pointcloud(mesh: Meshes, title: str = "") -> Axes3D: """ @@ -348,7 +369,12 @@ def plotly_protein_structure_graph( G, colour_map=edge_color_map, colour_by=colour_edges_by ) - size_by = _node_size_func(G, node_size_feature, min=node_size_min, multiplier=node_size_multiplier) + size_by = _node_size_func( + G, + node_size_feature, + min=node_size_min, + multiplier=node_size_multiplier, + ) # 3D network plot x_nodes = [] @@ -837,7 +863,7 @@ def asteroid_plot( :param colour_nodes_by: Colour the nodes by this attribute. Currently only ``"shell"`` is supported. :type colour_nodes_by: str - :param size_nodes_by: Size the nodes by an attribute. + :param size_nodes_by: Size the nodes by an attribute. :type size_nodes_by: str :param colour_edges_by: Colour the edges by this attribute. Currently only ``"kind"`` is supported. @@ -922,8 +948,13 @@ def asteroid_plot( x, y = subgraph.nodes[node]["pos"] node_x.append(x) node_y.append(y) - - size_by = _node_size_func(subgraph, size_nodes_by, min=node_size_min, multiplier=node_size_multiplier) + + size_by = _node_size_func( + subgraph, + size_nodes_by, + min=node_size_min, + multiplier=node_size_multiplier, + ) node_sizes = [size_by(n) for n in subgraph.nodes()] colour_nodes_by = colour_nodes_by.lower() @@ -934,9 +965,15 @@ def asteroid_plot( if n in v: node_colours.append(k) else: - try: get_feature = _node_feature_func(g=subgraph, feature=colour_nodes_by, no_negatives=False) - except: raise NotImplementedError(f"Colour by {colour_nodes_by} not implemented.") - + try: + get_feature = _node_feature_func( + g=subgraph, feature=colour_nodes_by, no_negatives=False + ) + except: + raise NotImplementedError( + f"Colour by {colour_nodes_by} not implemented." + ) + for n, d in subgraph.nodes(data=True): node_colours.append(get_feature(n)) diff --git a/graphein/utils/utils.py b/graphein/utils/utils.py index 5384a0aa0..29d298778 100644 --- a/graphein/utils/utils.py +++ b/graphein/utils/utils.py @@ -431,7 +431,7 @@ def get_graph_attribute_names(g: nx.Graph) -> List[str]: """ return list(g.graph.keys()) - + def parse_aggregation_type(aggregation_type: AggregationType) -> Callable: """Returns an aggregation function by name @@ -457,4 +457,3 @@ def parse_aggregation_type(aggregation_type: AggregationType) -> Callable: f" Please use min, max, mean, median, sum" ) return func -