Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/pifasta decomm #3

Merged
merged 16 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
265 changes: 265 additions & 0 deletions liabilities/liabilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
from rdkit import Chem

# Define the liabilities with SMARTS patterns, their corresponding liabilities, pH ranges, and references
liabilities = [
{
"smarts": "N[CH1]([CH2][CH0](=[OH0])[OH1,O-])[CH0](=[OH0])[NH0]1[CH2][CH2][CH2][CH1]1[CH0](=[OH0])",
"fasta_pattern":"DP",
"liability": "Asp-Pro bond cleavage",
"pH_stability_range": "6.0–7.5",
"pH_instability_range": "<5.0 or >8.0",
"references": "Manning et al. (2010); Cleland et al. (1993)"
},
{
"smarts": "[OH0]=[CH0][CH1](N)([CH2][CH2][SX2][CH3])",
"fasta_pattern":"M",
"liability": "Methionine oxidation",
"pH_stability_range": "6.0–7.5",
"pH_instability_range": "<5.0 or >8.0",
"references": "Manning et al. (2010); Cleland et al. (1993); Al Musaimi et al. (2022)"
},
{
"smarts": "N[CH1]([CH2][CH0](=[OH0])[NH2])[CH0](=[OH0])",
"fasta_pattern":"N",
"liability": "Deamidation of Asn",
"pH_stability_range": "4.5–6.0",
"pH_instability_range": "<4.0 or >6.0",
"references": "Wakankar & Borchardt (2006)"
},

{
"smarts": "N[CH1]([CH2][CH0](=[OH0])[NH2])[CH0](=[OH0])[NH1][CH2][CH0](=[OH0])",
"fasta_pattern":"NG",
"liability": "Deamidationn of Asn-Gly",
"pH_stability_range": "4.5–6.5",
"pH_instability_range": "<4.0 or >7.0",
"references": "Cleland et al. (1993); Li et al. (1995); Al Musaimi et al. (2022)"
},

{
"smarts": "C[CH2][SX2][SX2][CH2]C",
"fasta_pattern":"C",
"liability": "Disulfide bond scrambling",
"pH_stability_range": "5.5–8.0",
"pH_instability_range": "<4.0 or >9.0",
"references": "Wang & Roberts (2018)"
},

{
"smarts": "[CH3][CH1]([OH1])[CH1]([NH1][CH0](=[OH0])[CH1](N)[CH2][OH1])[CH0](=[OH0])",
"fasta_pattern":"ST",
"liability": "Hydrolysis at Ser-Thr bonds",
"pH_stability_range": "7.0–8.5",
"pH_instability_range": "<6.0 or >9.0",
"references": "Shire et al. (2004)"
},

{
"smarts": "[NH2,N+H3][CH2][CH2][CH2][CH2][CH1](N)[CH0](=[OH0])",
"liability": "Lys oxidation",
"fasta_pattern":"K",
"pH_stability_range": "4.5–6.5",
"pH_instability_range": "<4.0 or >7.0",
"references": "Cleland et al. (1993); Li et al. (1995); Al Musaimi et al. (2022)"
},

{
"smarts": "[CH0](=[OH0])[CH1]1[CH2][CH2][CH2]N1",
"liability": "Hydrolysis near Pro residues",
"fasta_pattern":"P",
"pH_stability_range": "6.0–7.0",
"pH_instability_range": "<5.0 or >8.5",
"references": "Shire et al. (2004); Nugrahadi et al. (2003)"
},

{
"smarts": "[NH2,N+H3][CH1]([CH2][cH0]1[cH1][nH,n,n+H][cH1][nH,n,n+H]1)[CH0](=[OH0])",
"liability": "N-terminal His oxidation",
"fasta_pattern":"H",
"pH_stability_range": "6.5–8.0",
"pH_instability_range": "<6.0 or >8.5",
"references": "Manning et al. (2010); Li et al. (1995); Al Musaimi et al. (2022)"
}

]


def calculate_liabilities_from_mol(mol):
"""
This function calculates the sequence liability based on the SMARTS patterns
of specific amino acid degradation pathways. The RDKit molecule is checked
for matching SMARTS patterns, and liabilities are flagged accordingly.
It also counts how many times each liability pattern is matched.

:param mol: RDKit molecule object
:return: Dictionary of sequence liabilities with SMARTS patterns, pH ranges, references, and counts
"""

# Initialize a dictionary to store detected liabilities
detected_liabilities = {}

# Check for substructure matches with each SMARTS pattern
for liability in liabilities:
smarts_pattern = liability["smarts"]
smarts_mol = Chem.MolFromSmarts(smarts_pattern)

# Get the list of all matches for the SMARTS pattern
matches = mol.GetSubstructMatches(smarts_mol)
match_count = len(matches) # Count the number of matches

# If there are matches, add the liability to the detected liabilities dictionary
if match_count > 0:
detected_liabilities[liability["liability"]] = {
"liability": liability["liability"],
"fasta_pattern": liability["fasta_pattern"],
# "SMARTS_pattern": smarts_pattern,
"match_count": match_count,
"pH_stability_range": liability["pH_stability_range"],
"pH_instability_range": liability["pH_instability_range"],
"references": liability["references"]
}

return detected_liabilities


# # Example usage with RDKit
# from rdkit import Chem

# # Example peptide molecule (you can replace this with any RDKit peptide)
# smiles_string = "CC(=O)NC(=O)C" # Simple example peptide (Acetyl-Glycine)

# mol = Chem.MolFromSmiles(smiles_string)

# # Calculate liabilities
# liabilities = calculate_liabilities_from_rdkit_mol(mol)

# # Output the identified liabilities
# if liabilities:
# for liability, details in liabilities.items():
# print(f"Liability: {liability}")
# print(f"SMARTS Pattern: {details['SMARTS_pattern']}")
# print(f"Match Count: {details['match_count']}")
# print(f"P-H Stability Range: {details['pH_stability_range']}")
# print(f"P-H Instability Range: {details['pH_instability_range']}")
# print(f"References: {details['references']}")
# else:
# print("No liabilities detected in the sequence.")



def calculate_liabilities_from_fasta(fasta_sequence):
"""
This function calculates the sequence liability based on the presence of specific amino acid sequences
in the input peptide sequence and includes pH stability and instability ranges.

:param fasta_sequence: Input peptide sequence as a string (FASTA format)
:return: Dictionary of sequence liabilities with associated pH ranges
"""

# Initialize a dictionary to store detected liabilities
detected_liabilities = {}

# Convert the sequence to uppercase to handle both L- and D- amino acids
fasta_sequence_upper = fasta_sequence.upper()

# Check each liability pattern against the sequence
for liability in liabilities:
# case of N-terminal histidine
if liability["liability"] == "N-terminal His oxidation":
if liability["fasta_pattern"].upper() == fasta_sequence_upper[0]:
detected_liabilities[liability["liability"]] = {
"liability": liability["liability"],
"fasta_pattern": liability["fasta_pattern"],
"match_count": 1,
"pH_stability_range": liability["pH_stability_range"],
"pH_instability_range": liability["pH_instability_range"],
"references": liability["references"]
}
# general case
else:
if liability["fasta_pattern"].upper() in fasta_sequence_upper:
detected_liabilities[liability["liability"]] = {
"liability": liability["liability"],
"fasta_pattern": liability["fasta_pattern"],
"match_count": fasta_sequence.count(liability["fasta_pattern"]),
"pH_stability_range": liability["pH_stability_range"],
"pH_instability_range": liability["pH_instability_range"],
"references": liability["references"]
}

return detected_liabilities


# if __name__ == "__main__"
# # Example usage
# fasta_sequence = "MKTPIALDEKDPQCVLNTG" # Example peptide sequence

# liabilities = calculate_liabilities_from_fasta(fasta_sequence)

# # Output the identified liabilities
# if liabilities:
# for liability, details in liabilities.items():
# print(f"Liability: {liability}")
# print(f"P-H Stability Range: {details['pH_stability_range']}")
# print(f"P-H Instability Range: {details['pH_instability_range']}")
# print(f"References: {details['references']}")
# else:
# print("No liabilities detected in the sequence.")




def calc_liabilities_from_dict(input_dict, input_type="structure"):
"""
Calculates liabilities from an input dictionary.

Parameters:
input_dict (dict): A dictionary containing molecular data.
input_type (str): Defines the input type, either "structure" (RDKit Molecule) or "fasta" (Amino Acid Sequence).

Returns:
dict_output (dict): A dictionary with liabilities for each molecule or peptide.
"""
dict_output = dict()

if input_type == "structure":
# If the input is a structure (RDKit Molecule)
for mol_idx, mol_data in input_dict.items():
mol_name = mol_data.get("mol_name", "Unnamed Molecule")
mol_object = mol_data.get("mol_obj")

if mol_object is None:
raise ValueError(f"Molecule object for {mol_name} is missing.")

# Calculate liabilities based on SMARTS patterns
liabilities = calculate_liabilities_from_mol(mol_object)

# Store the result in the output dictionary
dict_output[mol_idx] = {
"mol_name": mol_name,
"liabilities": liabilities
}

elif input_type == "fasta":
# If the input is a fasta sequence (peptide sequence)
for mol_idx, mol_data in input_dict.items():
mol_name = mol_data.get("mol_name", "Unnamed Sequence")
fasta = mol_data.get("fasta")

if fasta is None:
raise ValueError(f"FASTA sequence for {mol_name} is missing.")

# Calculate liabilities based on FASTA sequence
liabilities = calculate_liabilities_from_fasta(fasta)

# Store the result in the output dictionary
dict_output[mol_idx] = {
"mol_name": mol_name,
"liabilities": liabilities
}

else:
raise ValueError("Unknown input_type. It must be either 'structure' or 'fasta'.")

return dict_output

Loading