rhea uniprot mods for TrEMBL, rhea and uniprot parsers

ncats · Mar 7, 2024 · 3bd34e6 · 3bd34e6
1 parent 98cf94d
commit 3bd34e6
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 11 deletions.
diff --git a/config/external_resource_config.txt b/config/external_resource_config.txt
@@ -4,7 +4,7 @@ hmdb_gene	http	https://hmdb.ca/system/downloads/current/hmdb_proteins.zip	hmdb_p
 hmdb_met_sdf	http	https://hmdb.ca/system/downloads/current/structures.zip	structures.zip	structures.sdf	../misc/data/chemprops/hmdb/	zip	chem_props_sdf
 reactome_met	http	http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt	ChEBI2Reactome_All_Levels.txt	ChEBI2Reactome_All_Levels.txt	../misc/data/reactome/	none	pathways_mets
 reactome_gene	http	http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt	UniProt2Reactome_All_Levels.txt	UniProt2Reactome_All_Levels.txt	../misc/data/reactome/	none	pathways_genes
-wiki_pathways_mets_genes	http	https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231110-rdf-wp.zip	wikipathways-20231110-rdf-wp.zip	./wp/	../misc/data/wikipathwaysRDF/	zip	pathways_mets_genes
+wiki_pathways_mets_genes	http	https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20240210-rdf-wp.zip	wikipathways-20240210-rdf-wp.zip	./wp/	../misc/data/wikipathwaysRDF/	zip	pathways_mets_genes
 chebi_met_sdf	ftp	https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz	ChEBI_complete_3star.sdf.gz	ChEBI_complete_3star.sdf	../misc/data/chemprops/chebi/	gzip	chem_props_sdf
 lipidmaps_met	http	https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip	LMSD.sdf.zip	structures.sdf	../misc/data/chemprops/lipidmaps/	zip	chem_props_sdf
 swissprot_human	http	https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz	uniprot_sprot_human.dat.gz	uniprot_sprot_human.dat	../misc/data/uniprot_human/	gzip	proteins

diff --git a/config/ramp_resource_version_update.txt b/config/ramp_resource_version_update.txt
@@ -1,8 +1,8 @@
 ramp_db_version	db_mod_date	status	data_source_id	data_source_name	data_source_url	data_source_version
-v2.4.1	10/24/2023	current	hmdb	HMDB	https://hmdb.ca/	v5.0 (2021-11-17)
-v2.4.1	10/24/2023	current	reactome	Reactome	https://reactome.org/	v86 (Sep 2023)
-v2.4.1	10/24/2023	current	wiki	WikiPathways	https://www.wikipathways.org/index.php/WikiPathways	v20231010 (2023-10-10)
-v2.4.1	10/24/2023	current	kegg	KEGG	https://www.genome.jp/kegg/	from HMDB (v5.0) (2021-11-17) 
-v2.4.1	10/24/2023	current	chebi	ChEBI	https://www.ebi.ac.uk/chebi/	Release 226 (2023-10-01)
-v2.4.1	10/24/2023	current	lipidmaps	Lipid Maps	https://www.lipidmaps.org/	Release 2023-10-24
-v2.4.1	10/24/2023	current	rhea	Rhea	https://www.rhea-db.org/	Release 129 (2023-09-13)
+v2.5.0	03/06/2024	current	hmdb	HMDB	https://hmdb.ca/	v5.0 (2021-11-17)
+v2.5.0	03/06/2024	current	reactome	Reactome	https://reactome.org/	v87 (Dec 2023)
+v2.5.0	03/06/2024	current	wiki	WikiPathways	https://www.wikipathways.org/index.php/WikiPathways	v20240210 (2024-02-10)
+v2.5.0	03/06/2024	current	kegg	KEGG	https://www.genome.jp/kegg/	from HMDB (v5.0) (2021-11-17) 
+v2.5.0	03/06/2024	current	chebi	ChEBI	https://www.ebi.ac.uk/chebi/	Release 231 (2024-03-01)
+v2.5.0	03/06/2024	current	lipidmaps	Lipid Maps	https://www.lipidmaps.org/	Release 2024-03-06
+v2.5.0	03/06/2024	current	rhea	Rhea	https://www.rhea-db.org/	Release 131 (2024-01-)
diff --git a/src/parse/ChebiOwlParser.py b/src/parse/ChebiOwlParser.py
@@ -66,7 +66,7 @@ def getChebiFiles(self):
             os.mkdir(self.relDir + localDir)
 
         #make an output dir for parsing
-        self.outputDir = self.relDir + "../misc/output/chebi"
+        self.outputDir = self.relDir + "../misc/output/chebi/"
         if not exists(self.outputDir):
             os.mkdir(self.outputDir)
 
@@ -444,7 +444,7 @@ def extractHumanMetaboliteStatus(self):
         print(str(len(set(xlist))))
         print(self.localOntoDir)
 
-        with open(self.outputDir + '\human_chebi_ids.txt', 'w') as f:
+        with open(self.outputDir + '/human_chebi_ids.txt', 'w') as f:
             for line in xlist:
                 f.write(self.uriToChebiId(line)+"\n")
 

diff --git a/src/parse/RheaParser.py b/src/parse/RheaParser.py
@@ -53,6 +53,8 @@ def __init__(self, resConfig):
 
         self.rheaLocalRheaToUniprotFile = ""
 
+        self.rheaLocalRheaToSwissprotFile = ""
+
         self.rheaLocalRheaToEcFile = ""
 
         self.rheaLocalRxnDirectionFile = ""
@@ -98,6 +100,8 @@ def processRhea(self):
     def buildSupportingUniprotData(self):
         print("building uniprot data store")
         uniParser = UniprotParser(self.config)
+
+        # this will parse human uniprot, including SwissProt and TrEMBL accessions.
         uniParser.parseUniprot()
         self.humanUniprotRecordDict = uniParser.uniprotRecords
 
@@ -117,6 +121,7 @@ def buildSupportingUniprotData(self):
         print("length of the dict"+str(len(self.humanUniprotRecordDict.keys())))
         print("length of the set"+str(len(self.humanUniprotAccSet)))
 
+
     def buildSupportingChebiData(self):
         print("building chebi data store")
 
@@ -140,6 +145,7 @@ def getRheaFiles(self):
 
         rdfConf = self.config.getConfig('rhea_rdf')
         uniprotToRheaConf = self.config.getConfig('uniprot_to_rhea')
+        swissprotToRheaConf = self.config.getConfig('swissprot_to_rhea')
         rheaToEcConf = self.config.getConfig('rhea_to_ec')
         rheaDirectionConf = self.config.getConfig('rhea_rxn_direction')
         expasyEc2EnzymeClassConf = self.config.getConfig('expasy_ec2class')
@@ -179,7 +185,25 @@ def getRheaFiles(self):
             self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile)            
         else:
             print("Using cached Rhea Uniprot-to-Rhea file.")
+
+
+
+        # rhea to swissprot
+        rhea2SwissprotFile = swissprotToRheaConf.extractFileName
+
+        self.rheaLocalRheaToSwissprotFile = self.relDir + localDir + rhea2SwissprotFile
+
+        if not exists(self.relDir + localDir + rhea2SwissprotFile):
+            rhea2SwissprotUrl = swissprotToRheaConf.sourceURL
+            rhea2SwissprotRemoteFile = swissprotToRheaConf.sourceFileName
+
+            self.download_files(rhea2SwissprotUrl, self.relDir + localDir + rhea2SwissprotRemoteFile)            
+        else:
+            print("Using cached Rhea swissprot-to-Rhea file.")
 
+
+
+
         # rhea to ec
         rhea2EcFile = rheaToEcConf.extractFileName
 
@@ -710,6 +734,8 @@ def appendUniprotToReaction(self):
                 else:
                     unis.append('uniprot:'+row.ID)
 
+
+
         for rxn in r2uMap:
             #print('adding uniprot')
             #print('reaction '+rxn)
@@ -721,7 +747,40 @@ def appendUniprotToReaction(self):
             if currRxn is not None:   
                 currRxn.proteins = uniSet
                 #print("setting proteins, len:"+str(len(currRxn.proteins)))
+
+
+
+        # SwissProt: merge the Rhea-to-SwissProt mappings into r2uMap (human accessions only)
+        r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0)
+
+        print(str(r2u.shape))
+
+        for idx, row in r2u.iterrows():
+            #print(row)
+            #print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ "  " +str(row.ID))
+
+            # NOTE: only accessions present in the human UniProt accession set are added
+            if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
+                #print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
+                unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
+                if unis is None:     
+                    unis = ['uniprot:'+row.ID]
+                    r2uMap['rhea:'+str(row.RHEA_ID)] = unis
+                else:
+                    unis.append('uniprot:'+row.ID)
+
+        for rxn in r2uMap:
+            #print('adding uniprot')
+            #print('reaction '+rxn)
 
+            uniSet = r2uMap[rxn]
+            currRxn = self.rheaReactionDict.get(rxn, None)
+            if currRxn is None:
+                currRxn = self.rheaReactionDict.get("rhea:"+rxn, None)
+            if currRxn is not None:   
+                currRxn.proteins = uniSet
+                #print("setting proteins, len:"+str(len(currRxn.proteins)))
+
 
 
     def appendEcToReaction(self):
@@ -753,8 +812,9 @@ def ecToEnzymeClassFromExpasy(self):
         with open(self.expasyLocalEc2ClassFile, 'r') as ec2c:
             ec2classStrings = ec2c.readlines()
 
+            # The file has a header and a footer that have to be skipped.
             start = 11
-            end = len(ec2classStrings) - 6
+            end = len(ec2classStrings) - 5
 
         for i in range(start, end):
             line = ec2classStrings[i].strip()

diff --git a/src/parse/UniprotParser.py b/src/parse/UniprotParser.py
@@ -58,8 +58,16 @@ def parseUniprot(self):
         else:
             print("Uniprot (Human) exists. Using cached copy.")
 
+
+
+        print("starting to parse uniprot trembl dat file")
+        print(extractFile)                
+
         self.parseUniprotFile(self.relDir + localDir + extractFile)
 
+        print("number of uniprot trembl records")
+        tremblCount = len(self.uniprotRecords)
+        print(str(tremblCount))
 
         # now add SwissProt human
         proteinConfig = self.resourceConfig.getConfig("swissprot_human")
@@ -83,8 +91,17 @@ def parseUniprot(self):
                     shutil.copyfileobj(f_in, f_out)
         else:
             print("Uniprot (Human) exists. Using cached copy.")
+
+
+
+        print("starting to parse uniprot swissprot dat file")
+        print(extractFile)
 
         self.parseUniprotFile(self.relDir + localDir + extractFile)
+
+        print("number of uniprot trembl PLUSE swissprot records")
+        tremblCount = len(self.uniprotRecords)
+        print(str(tremblCount))
 
         self.exportUniprotIntermediatFiles()
 
@@ -148,8 +165,14 @@ def processData(self, prefix, line, proteinDB, protein):
                 protein.uniprotAcc = accs[0]
                 protein.secondaryAccs = accs
 
+                if protein.uniprotAcc == 'uniprot:P19835':
+                    print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser")
+                    print(accs)
             else:
                 for acc in accs:
+                    if acc == 'uniprot:P19835':
+                        print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser")
+
                     protein.secondaryAccs.append(acc)