From 3bd34e63aa961de316ab381ce1fd8123167205e4 Mon Sep 17 00:00:00 2001 From: johnbraisted Date: Thu, 7 Mar 2024 11:27:37 -0500 Subject: [PATCH] rhea uniprot mods for TrEMBL, rhea and uniprot parsers --- config/external_resource_config.txt | 2 +- config/ramp_resource_version_update.txt | 14 +++--- src/parse/ChebiOwlParser.py | 4 +- src/parse/RheaParser.py | 62 ++++++++++++++++++++++++- src/parse/UniprotParser.py | 23 +++++++++ 5 files changed, 94 insertions(+), 11 deletions(-) diff --git a/config/external_resource_config.txt b/config/external_resource_config.txt index d69ced1..07f4ae9 100644 --- a/config/external_resource_config.txt +++ b/config/external_resource_config.txt @@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes -wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231110-rdf-wp.zip wikipathways-20231110-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes +wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20240210-rdf-wp.zip wikipathways-20240210-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf swissprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins diff --git a/config/ramp_resource_version_update.txt b/config/ramp_resource_version_update.txt index ea4f13a..eb48a44 100644 --- a/config/ramp_resource_version_update.txt +++ b/config/ramp_resource_version_update.txt @@ -1,8 +1,8 @@ ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version -v2.4.1 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) -v2.4.1 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023) -v2.4.1 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10) -v2.4.1 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) -v2.4.1 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01) -v2.4.1 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24 -v2.4.1 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 129 (2023-09-13) +v2.5.0 03/06/2024 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17) +v2.5.0 03/06/2024 current reactome Reactome https://reactome.org/ v87 (Dec 2023) +v2.5.0 03/06/2024 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20240210 (2024-02-10) +v2.5.0 03/06/2024 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17) +v2.5.0 03/06/2024 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 231 (2024-03-01) +v2.5.0 03/06/2024 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2024-03-06 +v2.5.0 03/06/2024 current rhea Rhea https://www.rhea-db.org/ Release 131 (2024-01-) diff --git a/src/parse/ChebiOwlParser.py b/src/parse/ChebiOwlParser.py index ce5565c..96e966a 100644 --- a/src/parse/ChebiOwlParser.py +++ b/src/parse/ChebiOwlParser.py @@ -66,7 +66,7 @@ def getChebiFiles(self): os.mkdir(self.relDir + localDir) #make an output dir for parsing - self.outputDir = self.relDir + "../misc/output/chebi" + self.outputDir = self.relDir + "../misc/output/chebi/" if not exists(self.outputDir): os.mkdir(self.outputDir) @@ -444,7 +444,7 @@ def extractHumanMetaboliteStatus(self): print(str(len(set(xlist)))) print(self.localOntoDir) - with open(self.outputDir + '\human_chebi_ids.txt', 'w') as f: + with open(self.outputDir + '/human_chebi_ids.txt', 'w') as f: for line in xlist: f.write(self.uriToChebiId(line)+"\n") diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py index 6b1001d..be4f9e2 100644 --- a/src/parse/RheaParser.py +++ b/src/parse/RheaParser.py @@ -53,6 +53,8 @@ def __init__(self, resConfig): self.rheaLocalRheaToUniprotFile = "" + self.rheaLocalRheaToSwissprotFile = "" + self.rheaLocalRheaToEcFile = "" self.rheaLocalRxnDirectionFile = "" @@ -98,6 +100,8 @@ def processRhea(self): def buildSupportingUniprotData(self): print("building uniprot data store") uniParser = UniprotParser(self.config) + + # this will parse human uniprot, including SwissProt and TrEMBL accessions. uniParser.parseUniprot() self.humanUniprotRecordDict = uniParser.uniprotRecords @@ -117,6 +121,7 @@ def buildSupportingUniprotData(self): print("length of the dict"+str(len(self.humanUniprotRecordDict.keys()))) print("length of the set"+str(len(self.humanUniprotAccSet))) + def buildSupportingChebiData(self): print("building chebi data store") @@ -140,6 +145,7 @@ def getRheaFiles(self): rdfConf = self.config.getConfig('rhea_rdf') uniprotToRheaConf = self.config.getConfig('uniprot_to_rhea') + swissprotToRheaConf = self.config.getConfig('swissprot_to_rhea') rheaToEcConf = self.config.getConfig('rhea_to_ec') rheaDirectionConf = self.config.getConfig('rhea_rxn_direction') expasyEc2EnzymeClassConf = self.config.getConfig('expasy_ec2class') @@ -179,7 +185,25 @@ def getRheaFiles(self): self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile) else: print("Using cached Rhea Uniprot-to-Rhea file.") + + + + # rhea to swissprot + rhea2SwissprotFile = swissprotToRheaConf.extractFileName + + self.rheaLocalRheaToSwissprotFile = self.relDir + localDir + rhea2SwissprotFile + + if not exists(self.relDir + localDir + rhea2SwissprotFile): + rhea2SwissprotUrl = swissprotToRheaConf.sourceURL + rhea2SwissprotRemoteFile = swissprotToRheaConf.sourceFileName + + self.download_files(rhea2SwissprotUrl, self.relDir + localDir + rhea2SwissprotRemoteFile) + else: + print("Using cached Rhea swissprot-to-Rhea file.") + + + # rhea to ec rhea2EcFile = rheaToEcConf.extractFileName @@ -710,6 +734,8 @@ def appendUniprotToReaction(self): else: unis.append('uniprot:'+row.ID) + + for rxn in r2uMap: #print('adding uniprot') #print('reaction '+rxn) @@ -721,7 +747,40 @@ def appendUniprotToReaction(self): if currRxn is not None: currRxn.proteins = uniSet #print("setting proteins, len:"+str(len(currRxn.proteins))) + + + + # swiss prot + r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0) + + print(str(r2u.shape)) + + for idx, row in r2u.iterrows(): + #print(row) + #print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ " " +str(row.ID)) + + # !!! just adding human uniprot + if ("uniprot:" + row.ID) in self.humanUniprotAccSet: + #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + unis = r2uMap.get("rhea:" + str(row.RHEA_ID)) + if unis is None: + unis = ['uniprot:'+row.ID] + r2uMap['rhea:'+str(row.RHEA_ID)] = unis + else: + unis.append('uniprot:'+row.ID) + + for rxn in r2uMap: + #print('adding uniprot') + #print('reaction '+rxn) + uniSet = r2uMap[rxn] + currRxn = self.rheaReactionDict.get(rxn, None) + if currRxn is None: + currRxn = self.rheaReactionDict.get("rhea:"+rxn, None) + if currRxn is not None: + currRxn.proteins = uniSet + #print("setting proteins, len:"+str(len(currRxn.proteins))) + def appendEcToReaction(self): @@ -753,8 +812,9 @@ def ecToEnzymeClassFromExpasy(self): with open(self.expasyLocalEc2ClassFile, 'r') as ec2c: ec2classStrings = ec2c.readlines() + # The file has a header and a footer that have to be skipped. start = 11 - end = len(ec2classStrings) - 6 + end = len(ec2classStrings) - 5 for i in range(start, end): line = ec2classStrings[i].strip() diff --git a/src/parse/UniprotParser.py b/src/parse/UniprotParser.py index ab88dd9..fe20b06 100644 --- a/src/parse/UniprotParser.py +++ b/src/parse/UniprotParser.py @@ -58,8 +58,16 @@ def parseUniprot(self): else: print("Uniprot (Human) exists. Using cached copy.") + + + print("starting to parse uniprot trembl dat file") + print(extractFile) + self.parseUniprotFile(self.relDir + localDir + extractFile) + print("number of uniprot trembl records") + tremblCount = len(self.uniprotRecords) + print(str(tremblCount)) # now add SwissProt human proteinConfig = self.resourceConfig.getConfig("swissprot_human") @@ -83,8 +91,17 @@ def parseUniprot(self): shutil.copyfileobj(f_in, f_out) else: print("Uniprot (Human) exists. Using cached copy.") + + + + print("starting to parse uniprot swissprot dat file") + print(extractFile) self.parseUniprotFile(self.relDir + localDir + extractFile) + + print("number of uniprot trembl PLUSE swissprot records") + tremblCount = len(self.uniprotRecords) + print(str(tremblCount)) self.exportUniprotIntermediatFiles() @@ -148,8 +165,14 @@ def processData(self, prefix, line, proteinDB, protein): protein.uniprotAcc = accs[0] protein.secondaryAccs = accs + if protein.uniprotAcc == 'uniprot:P19835': + print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser") + print(accs) else: for acc in accs: + if acc == 'uniprot:P19835': + print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser") + protein.secondaryAccs.append(acc)