Skip to content

Commit

Permalink
lib: improve protein build (mainly protein relationships) by taking into account Reactome's xref
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Mar 17, 2021
1 parent 5983246 commit b5230be
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -165,43 +165,37 @@ public void build(Path path) throws IOException {
break;
}
case "Protein": {
// Get the protein primary ID
Protein proteinBP = (Protein) bioPAXElement;

String protPrimaryId = null;
String protName = ((Protein) bioPAXElement).getDisplayName();
String protName = proteinBP.getDisplayName();

if (StringUtils.isEmpty(protName)) {
Set<Xref> xrefs = ((Protein)bioPAXElement).getXref();
for (Xref xref: xrefs) {
if (!StringUtils.containsIgnoreCase(xref.getDb(), "Reactome")) {
protPrimaryId = csv.getProteinCache().getPrimaryId(xref.getId());
if (StringUtils.isNotEmpty(protPrimaryId)) {
break;
}
}
}
if (StringUtils.isEmpty(protPrimaryId)) {
protPrimaryId = getBioPaxId(bioPAXElement.getRDFId());
}
} else {
if (StringUtils.isNotEmpty(protName)) {
protPrimaryId = csv.getProteinCache().getPrimaryId(protName);
if (StringUtils.isEmpty(protPrimaryId)) {
Set<Xref> xrefs = ((Protein)bioPAXElement).getXref();
for (Xref xref: xrefs) {
if (!StringUtils.containsIgnoreCase(xref.getDb(), "Reactome")) {
}
if (StringUtils.isEmpty(protPrimaryId)) {
if (proteinBP.getEntityReference() != null) {
EntityReference entityReference = proteinBP.getEntityReference();
//
for (Xref xref : entityReference.getXref()) {
if ("UniProt".equals(xref.getDb())) {
protPrimaryId = csv.getProteinCache().getPrimaryId(xref.getId());
if (StringUtils.isNotEmpty(protPrimaryId)) {
break;
}
}
if (StringUtils.isNotEmpty(protPrimaryId)) {
break;
}
}
}
if (!"HSP70".equals(protName) && StringUtils.isEmpty(protPrimaryId)) {
protPrimaryId = protName;
}
}
if (StringUtils.isEmpty(protPrimaryId)) {
protPrimaryId = getBioPaxId(bioPAXElement.getRDFId());
}

// Get the protein UID
Long protUid = (protPrimaryId == null ? null : csv.getLong(protPrimaryId, Node.Label.PROTEIN.name()));
if (protUid == null) {
node = loadProtein(bioPAXElement);
node = loadProtein(bioPAXElement);
updateAuxMaps(node);

updatePhysicalEntity(bioPAXElement);
Expand Down Expand Up @@ -419,15 +413,21 @@ private Node loadDna(BioPAXElement bioPAXElement) {
addSetAttributes(entityReference.getComment(), "description", node);

// // xref
// Set<Xref> xrefs = entityReference.getXref();
// for (Xref xref : xrefs) {
Set<Xref> xrefs = entityReference.getXref();
List<String> xrefIds = new ArrayList<>();
List<String> xrefDbs = new ArrayList<>();
for (Xref xref : xrefs) {
xrefIds.add(xref.getId());
xrefDbs.add(xref.getDb());
// org.opencb.bionetdb.core.models.Xref x = new org.opencb.bionetdb.core.models.Xref();
// x.setSource(xref.getDb());
// x.setSourceVersion(xref.getDbVersion());
// x.setId(xref.getId());
// x.setIdVersion(xref.getIdVersion());
//// dna.setXref(x);
// }
// dna.setXref(x);
}
node.addAttribute("xrefIds", StringUtils.join(xrefIds, ";"));
node.addAttribute("xrefDbs", StringUtils.join(xrefDbs, ";"));
}

return node;
Expand Down Expand Up @@ -458,15 +458,21 @@ private Node loadRna(BioPAXElement bioPAXElement) {
addSetAttributes(entityReference.getComment(), "description", node);

// // xref
// Set<Xref> xrefs = entityReference.getXref();
// for (Xref xref : xrefs) {
Set<Xref> xrefs = entityReference.getXref();
List<String> xrefIds = new ArrayList<>();
List<String> xrefDbs = new ArrayList<>();
for (Xref xref : xrefs) {
xrefIds.add(xref.getId());
xrefDbs.add(xref.getDb());
// org.opencb.bionetdb.core.models.Xref x = new org.opencb.bionetdb.core.models.Xref();
// x.setSource(xref.getDb());
// x.setSourceVersion(xref.getDbVersion());
// x.setId(xref.getId());
// x.setIdVersion(xref.getIdVersion());
//// rna.setXref(x);
// }
}
node.addAttribute("xrefIds", StringUtils.join(xrefIds, ";"));
node.addAttribute("xrefDbs", StringUtils.join(xrefDbs, ";"));
}

return node;
Expand Down Expand Up @@ -496,16 +502,22 @@ private Node loadProtein(BioPAXElement bioPAXElement) {
// description
addSetAttributes(entityReference.getComment(), "description", node);
//
// // xref
// Set<Xref> xrefs = entityReference.getXref();
// for (Xref xref : xrefs) {
// xref
Set<Xref> xrefs = entityReference.getXref();
List<String> xrefIds = new ArrayList<>();
List<String> xrefDbs = new ArrayList<>();
for (Xref xref : xrefs) {
xrefIds.add(xref.getId());
xrefDbs.add(xref.getDb());
// org.opencb.bionetdb.core.models.Xref x = new org.opencb.bionetdb.core.models.Xref();
// x.setSource(xref.getDb());
// x.setSourceVersion(xref.getDbVersion());
// x.setId(xref.getId());
// x.setIdVersion(xref.getIdVersion());
// protein.setXref(x);
// }
}
node.addAttribute("xrefIds", StringUtils.join(xrefIds, ";"));
node.addAttribute("xrefDbs", StringUtils.join(xrefDbs, ";"));
}

return node;
Expand Down
115 changes: 64 additions & 51 deletions bionetdb-lib/src/main/java/org/opencb/bionetdb/lib/utils/Builder.java
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,11 @@ public void processNodes(List<Node> nodes) throws IOException {
bw = builder.getCsvInfo().getWriter(node.getLabels().get(0).name());

if (StringUtils.isNotEmpty(node.getName())) {
if (node.getLabels().contains(PROTEIN)) {
// Complete node proteins
node = builder.completeProteinNode(node);
} else if (node.getLabels().contains(DNA)) {
// if (node.getLabels().contains(PROTEIN)) {
// // Complete node proteins
// node = builder.completeProteinNode(node);
// } else
if (node.getLabels().contains(DNA)) {
// Save gene nodes to process further, in the post-processing phase
dnaNodes.add(node);
continue;
Expand Down Expand Up @@ -607,7 +608,6 @@ private Node createProteinNode(Entry protein) throws IOException {

private Node createProteinNode(Entry protein, Long uid) throws IOException {
Node n;
PrintWriter pw;

// Create protein node and save protein UID
Node proteinNode = NodeBuilder.newNode(uid, protein);
Expand Down Expand Up @@ -651,21 +651,42 @@ private Node createProteinNode(Entry protein, Long uid) throws IOException {
if (CollectionUtils.isNotEmpty(protein.getDbReference())) {
Set<String> done = new HashSet<>();
for (DbReferenceType dbRef: protein.getDbReference()) {
if ("Ensembl".equals(dbRef.getType())) {
for (PropertyType propertyType : dbRef.getProperty()) {
if ("protein sequence ID".equals(propertyType.getType())) {
String xrefId = dbRef.getType() + "." + propertyType.getValue();
// In the list, one db reference can appear multiple times
if (!done.contains(xrefId)) {
n = NodeBuilder.newNode(csv.getAndIncUid(), dbRef);
n.setId(propertyType.getValue());
updateCSVFiles(uid, n, ANNOTATION___PROTEIN___XREF.name());

done.add(xrefId);
}
break;
}
}
}
String xrefId = dbRef.getType() + "." + dbRef.getId();
if (!done.contains(xrefId)) {
Long xrefUid = csv.getLong(xrefId, XREF.name());
if (xrefUid == null) {
n = NodeBuilder.newNode(csv.getAndIncUid(), dbRef);
writeNodeLine(n);

xrefUid = n.getUid();
csv.putLong(dbRef.getType() + "." + dbRef.getId(), XREF.name(), xrefUid);
}
writeRelationLine(ANNOTATION___PROTEIN___XREF.name(), uid, xrefUid);
n = NodeBuilder.newNode(csv.getAndIncUid(), dbRef);
updateCSVFiles(uid, n, ANNOTATION___PROTEIN___XREF.name());

done.add(xrefId);
}
}
if (CollectionUtils.isNotEmpty(protein.getAccession())) {
for (String acc: protein.getAccession()) {
String xrefId = "UniProtKB." + acc;
if (!done.contains(xrefId)) {
n = new Node(csv.getAndIncUid(), acc, null, Node.Label.XREF);
n.addAttribute("dbName", "UniProtKB");
updateCSVFiles(uid, n, ANNOTATION___PROTEIN___XREF.name());

done.add(xrefId);
}
}
}
}

// Return node
Expand All @@ -684,38 +705,15 @@ private Node createTranscriptNode(Transcript transcript, Long uid) throws IOExce
Node n;

// Get protein, remember that all proteins were created before genes/transcripts
if (CollectionUtils.isNotEmpty(transcript.getXrefs())) {
Long proteinUid = null;
for (Xref xref: transcript.getXrefs()) {
String proteinId = csv.getProteinCache().getPrimaryId(xref.getId());
if (proteinId != null) {
proteinUid = csv.getLong(proteinId, PROTEIN.name());
// if (proteinUid == null) {
// Entry protein = csv.getProtein(proteinId);
// if (protein != null) {
// // Create protein node and write the CSV file
// Node proteinNode = createProteinNode(protein);
// csv.getWriter(Node.Label.PROTEIN.name()).println(csv.nodeLine(proteinNode));
// proteinUid = proteinNode.getUid();
//
// // Save protein UID
// csv.saveProteinUid(proteinId, proteinUid);
// } else {
// logger.info("Protein not found for ID {}", proteinId);
// }
// }

if (proteinUid != null) {
// Write transcript-protein relation
writeRelationLine(IS___TRANSCRIPT___PROTEIN.name(), uid, proteinUid);
break;
}
if (StringUtils.isNotEmpty(transcript.getProteinId())) {
String proteinId = csv.getProteinCache().getPrimaryId(transcript.getSource() + "." + transcript.getProteinId());
if (proteinId != null) {
Long proteinUid = csv.getLong(proteinId, PROTEIN.name());
if (proteinUid != null) {
// Write transcript-protein relation
writeRelationLine(IS___TRANSCRIPT___PROTEIN.name(), uid, proteinUid);
}
}
// if (proteinUid == null && StringUtils.isNotEmpty(transcript.getProteinId())) {
// System.out.println("Protein not found!!! Transcript " + transcript.getId() + " with proteinId = "
// + transcript.getProteinId());
// }
}

// Model exon
Expand Down Expand Up @@ -1359,17 +1357,18 @@ public void buildGenes(Path genePath) throws IOException {
geneCache.saveObject(geneId, jsonGene);

// Save xrefs for that gene
geneCache.saveXref(geneId, geneId);
geneCache.saveXref(gene.getSource() + "." + geneId, geneId);
if (StringUtils.isNotEmpty(gene.getName())) {
geneCache.saveXref(gene.getName(), geneId);
geneCache.saveXref(gene.getSource() + "." + gene.getName(), geneId);
}

if (CollectionUtils.isNotEmpty(gene.getTranscripts())) {
for (Transcript transcr : gene.getTranscripts()) {
if (CollectionUtils.isNotEmpty(transcr.getXrefs())) {
for (Xref xref: transcr.getXrefs()) {
if (StringUtils.isNotEmpty(xref.getId())) {
geneCache.saveXref(xref.getId(), geneId);
String xrefId = xref.getDbName() + "." + xref.getId();
if (StringUtils.isNotEmpty(xrefId)) {
geneCache.saveXref(xrefId, geneId);
}
}
}
Expand Down Expand Up @@ -1477,10 +1476,24 @@ public void buildProteins(Path proteinPath) throws IOException {
if (CollectionUtils.isNotEmpty(protein.getDbReference())) {
Set<String> done = new HashSet<>();
for (DbReferenceType dbRef: protein.getDbReference()) {
if ("Ensembl".equals(dbRef.getType())) {
for (PropertyType propertyType : dbRef.getProperty()) {
if ("protein sequence ID".equals(propertyType.getType())) {
String xrefId = dbRef.getType() + "." + propertyType.getValue();
// In the list, one db reference can appear multiple times
if (!done.contains(xrefId)) {
proteinCache.saveXref(xrefId, proteinAcc);
done.add(xrefId);
}
break;
}
}
}
String xrefId = dbRef.getType() + "." + dbRef.getId();
// In the list, one db reference can appear multiple times
if (!done.contains(dbRef.getId())) {
proteinCache.saveXref(dbRef.getId(), proteinAcc);
done.add(dbRef.getId());
if (!done.contains(xrefId)) {
proteinCache.saveXref(xrefId, proteinAcc);
done.add(xrefId);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,8 @@ private Map<String, List<String>> createNodeAttributes(List<File> variantFiles)
//

// Protein
attrs = Arrays.asList("protId", "id", "name", "accession", "dataset", "proteinExistence", "evidence", "object");
attrs = Arrays.asList("protId", "id", "name", "accession", "dataset", "proteinExistence", "evidence", "object",
"xrefIds", "xrefDbs");
nodeAttributes.put(Node.Label.PROTEIN.toString(), new ArrayList<>(attrs));

// Protein keyword
Expand Down Expand Up @@ -784,7 +785,7 @@ private Map<String, List<String>> createNodeAttributes(List<File> variantFiles)
nodeAttributes.put(Node.Label.SMALL_MOLECULE.toString(), new ArrayList<>(attrs));

// RNA
attrs = Arrays.asList("rnaId", "id", "name", "evidence");
attrs = Arrays.asList("rnaId", "id", "name", "evidence", "xrefIds", "xrefDbs");
nodeAttributes.put(Node.Label.RNA.toString(), new ArrayList<>(attrs));

// catalysis
Expand All @@ -800,7 +801,7 @@ private Map<String, List<String>> createNodeAttributes(List<File> variantFiles)
nodeAttributes.put(Node.Label.REACTION.toString(), new ArrayList<>(attrs));

// DNA
attrs = Arrays.asList("dnaId", "id", "name");
attrs = Arrays.asList("dnaId", "id", "name", "xrefIds", "xrefDbs");
nodeAttributes.put(Node.Label.DNA.toString(), new ArrayList<>(attrs));

// Undefined
Expand Down

0 comments on commit b5230be

Please sign in to comment.