Skip to content

Commit

Permalink
rhea uniprot mods for TrEMBL, rhea and uniprot parsers
Browse files Browse the repository at this point in the history
  • Loading branch information
johnbraisted committed Mar 7, 2024
1 parent 98cf94d commit 3bd34e6
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 11 deletions.
2 changes: 1 addition & 1 deletion config/external_resource_config.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ hmdb_gene http https://hmdb.ca/system/downloads/current/hmdb_proteins.zip hmdb_p
hmdb_met_sdf http https://hmdb.ca/system/downloads/current/structures.zip structures.zip structures.sdf ../misc/data/chemprops/hmdb/ zip chem_props_sdf
reactome_met http http://www.reactome.org/download/current/ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ChEBI2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_mets
reactome_gene http http://www.reactome.org/download/current/UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt UniProt2Reactome_All_Levels.txt ../misc/data/reactome/ none pathways_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20231110-rdf-wp.zip wikipathways-20231110-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
wiki_pathways_mets_genes http https://wikipathways-data.wmcloud.org/current/rdf/wikipathways-20240210-rdf-wp.zip wikipathways-20240210-rdf-wp.zip ./wp/ ../misc/data/wikipathwaysRDF/ zip pathways_mets_genes
chebi_met_sdf ftp https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf.gz ChEBI_complete_3star.sdf ../misc/data/chemprops/chebi/ gzip chem_props_sdf
lipidmaps_met http https://www.lipidmaps.org/files/?file=LMSD&ext=sdf.zip LMSD.sdf.zip structures.sdf ../misc/data/chemprops/lipidmaps/ zip chem_props_sdf
swissprot_human http https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_human.dat.gz uniprot_sprot_human.dat.gz uniprot_sprot_human.dat ../misc/data/uniprot_human/ gzip proteins
Expand Down
14 changes: 7 additions & 7 deletions config/ramp_resource_version_update.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
ramp_db_version db_mod_date status data_source_id data_source_name data_source_url data_source_version
v2.4.1 10/24/2023 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.4.1 10/24/2023 current reactome Reactome https://reactome.org/ v86 (Sep 2023)
v2.4.1 10/24/2023 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20231010 (2023-10-10)
v2.4.1 10/24/2023 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.4.1 10/24/2023 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 226 (2023-10-01)
v2.4.1 10/24/2023 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2023-10-24
v2.4.1 10/24/2023 current rhea Rhea https://www.rhea-db.org/ Release 129 (2023-09-13)
v2.5.0 03/06/2024 current hmdb HMDB https://hmdb.ca/ v5.0 (2021-11-17)
v2.5.0 03/06/2024 current reactome Reactome https://reactome.org/ v87 (Dec 2023)
v2.5.0 03/06/2024 current wiki WikiPathways https://www.wikipathways.org/index.php/WikiPathways v20240210 (2024-02-10)
v2.5.0 03/06/2024 current kegg KEGG https://www.genome.jp/kegg/ from HMDB (v5.0) (2021-11-17)
v2.5.0 03/06/2024 current chebi ChEBI https://www.ebi.ac.uk/chebi/ Release 231 (2024-03-01)
v2.5.0 03/06/2024 current lipidmaps Lipid Maps https://www.lipidmaps.org/ Release 2024-03-06
v2.5.0 03/06/2024 current rhea Rhea https://www.rhea-db.org/ Release 131 (2024-01-)
4 changes: 2 additions & 2 deletions src/parse/ChebiOwlParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def getChebiFiles(self):
os.mkdir(self.relDir + localDir)

#make an output dir for parsing
self.outputDir = self.relDir + "../misc/output/chebi"
self.outputDir = self.relDir + "../misc/output/chebi/"
if not exists(self.outputDir):
os.mkdir(self.outputDir)

Expand Down Expand Up @@ -444,7 +444,7 @@ def extractHumanMetaboliteStatus(self):
print(str(len(set(xlist))))
print(self.localOntoDir)

with open(self.outputDir + '\human_chebi_ids.txt', 'w') as f:
with open(self.outputDir + '/human_chebi_ids.txt', 'w') as f:
for line in xlist:
f.write(self.uriToChebiId(line)+"\n")

Expand Down
62 changes: 61 additions & 1 deletion src/parse/RheaParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def __init__(self, resConfig):

self.rheaLocalRheaToUniprotFile = ""

self.rheaLocalRheaToSwissprotFile = ""

self.rheaLocalRheaToEcFile = ""

self.rheaLocalRxnDirectionFile = ""
Expand Down Expand Up @@ -98,6 +100,8 @@ def processRhea(self):
def buildSupportingUniprotData(self):
print("building uniprot data store")
uniParser = UniprotParser(self.config)

# this will parse human uniprot, including SwissProt and TrEMBL accessions.
uniParser.parseUniprot()
self.humanUniprotRecordDict = uniParser.uniprotRecords

Expand All @@ -117,6 +121,7 @@ def buildSupportingUniprotData(self):
print("length of the dict"+str(len(self.humanUniprotRecordDict.keys())))
print("length of the set"+str(len(self.humanUniprotAccSet)))


def buildSupportingChebiData(self):
print("building chebi data store")

Expand All @@ -140,6 +145,7 @@ def getRheaFiles(self):

rdfConf = self.config.getConfig('rhea_rdf')
uniprotToRheaConf = self.config.getConfig('uniprot_to_rhea')
swissprotToRheaConf = self.config.getConfig('swissprot_to_rhea')
rheaToEcConf = self.config.getConfig('rhea_to_ec')
rheaDirectionConf = self.config.getConfig('rhea_rxn_direction')
expasyEc2EnzymeClassConf = self.config.getConfig('expasy_ec2class')
Expand Down Expand Up @@ -179,7 +185,25 @@ def getRheaFiles(self):
self.download_files(rhea2UniprotUrl, self.relDir + localDir + rhea2UniprotRemoteFile)
else:
print("Using cached Rhea Uniprot-to-Rhea file.")



# rhea to swissprot
rhea2SwissprotFile = swissprotToRheaConf.extractFileName

self.rheaLocalRheaToSwissprotFile = self.relDir + localDir + rhea2SwissprotFile

if not exists(self.relDir + localDir + rhea2SwissprotFile):
rhea2SwissprotUrl = swissprotToRheaConf.sourceURL
rhea2SwissprotRemoteFile = swissprotToRheaConf.sourceFileName

self.download_files(rhea2SwissprotUrl, self.relDir + localDir + rhea2SwissprotRemoteFile)
else:
print("Using cached Rhea swissprot-to-Rhea file.")




# rhea to ec
rhea2EcFile = rheaToEcConf.extractFileName

Expand Down Expand Up @@ -710,6 +734,8 @@ def appendUniprotToReaction(self):
else:
unis.append('uniprot:'+row.ID)



for rxn in r2uMap:
#print('adding uniprot')
#print('reaction '+rxn)
Expand All @@ -721,7 +747,40 @@ def appendUniprotToReaction(self):
if currRxn is not None:
currRxn.proteins = uniSet
#print("setting proteins, len:"+str(len(currRxn.proteins)))



# SwissProt: repeat the reaction-to-protein mapping using the Rhea-to-SwissProt file
r2u = pd.read_csv(self.rheaLocalRheaToSwissprotFile, sep="\t", header=0)

print(str(r2u.shape))

for idx, row in r2u.iterrows():
#print(row)
#print("appending protein accessions to reactions..." + str(row.RHEA_ID)+ " " +str(row.ID))

# NOTE: only accessions present in the human UniProt accession set are added
if ("uniprot:" + row.ID) in self.humanUniprotAccSet:
#print("Have the human id!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
unis = r2uMap.get("rhea:" + str(row.RHEA_ID))
if unis is None:
unis = ['uniprot:'+row.ID]
r2uMap['rhea:'+str(row.RHEA_ID)] = unis
else:
unis.append('uniprot:'+row.ID)

for rxn in r2uMap:
#print('adding uniprot')
#print('reaction '+rxn)

uniSet = r2uMap[rxn]
currRxn = self.rheaReactionDict.get(rxn, None)
if currRxn is None:
currRxn = self.rheaReactionDict.get("rhea:"+rxn, None)
if currRxn is not None:
currRxn.proteins = uniSet
#print("setting proteins, len:"+str(len(currRxn.proteins)))



def appendEcToReaction(self):
Expand Down Expand Up @@ -753,8 +812,9 @@ def ecToEnzymeClassFromExpasy(self):
with open(self.expasyLocalEc2ClassFile, 'r') as ec2c:
ec2classStrings = ec2c.readlines()

# The file has a header and a footer that have to be skipped.
start = 11
end = len(ec2classStrings) - 6
end = len(ec2classStrings) - 5

for i in range(start, end):
line = ec2classStrings[i].strip()
Expand Down
23 changes: 23 additions & 0 deletions src/parse/UniprotParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,16 @@ def parseUniprot(self):
else:
print("Uniprot (Human) exists. Using cached copy.")



print("starting to parse uniprot trembl dat file")
print(extractFile)

self.parseUniprotFile(self.relDir + localDir + extractFile)

print("number of uniprot trembl records")
tremblCount = len(self.uniprotRecords)
print(str(tremblCount))

# now add SwissProt human
proteinConfig = self.resourceConfig.getConfig("swissprot_human")
Expand All @@ -83,8 +91,17 @@ def parseUniprot(self):
shutil.copyfileobj(f_in, f_out)
else:
print("Uniprot (Human) exists. Using cached copy.")



print("starting to parse uniprot swissprot dat file")
print(extractFile)

self.parseUniprotFile(self.relDir + localDir + extractFile)

print("number of uniprot trembl PLUSE swissprot records")
tremblCount = len(self.uniprotRecords)
print(str(tremblCount))

self.exportUniprotIntermediatFiles()

Expand Down Expand Up @@ -148,8 +165,14 @@ def processData(self, prefix, line, proteinDB, protein):
protein.uniprotAcc = accs[0]
protein.secondaryAccs = accs

if protein.uniprotAcc == 'uniprot:P19835':
print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 in uniprot parser")
print(accs)
else:
for acc in accs:
if acc == 'uniprot:P19835':
print("HEYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY we have P19835 AS A SECONDARY ACC in uniprot parser")

protein.secondaryAccs.append(acc)


Expand Down

0 comments on commit 3bd34e6

Please sign in to comment.