Skip to content

Commit

Permalink
Merge branch 'develop' into store-interactor-xrefs-in-solr
Browse files Browse the repository at this point in the history
  • Loading branch information
jmedinaebi committed Sep 25, 2024
2 parents c08eca6 + f1615df commit 55fc22f
Show file tree
Hide file tree
Showing 25 changed files with 1,034 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ private static boolean associationLine(StringBuilder associationSb, IntactComple
return false;
}
}
} else {
} else if (!intactComplex.isPredictedComplex()) {
// Go xrefs are only expected for curated complexes
System.err.println("ERROR: Complex " + intactComplex.getComplexAc() + " (" + intactComplex.getAc() + ") [ " + intactComplex.getOrganism().getScientificName() + "] has empty complex GO Xref");
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ private static boolean associationLine(StringBuilder associationSb, IntactComple
return false;
}
}
} else {
} else if (!intactComplex.isPredictedComplex()) {
// Go xrefs are only expected for curated complexes
System.err.println("ERROR: Complex " + intactComplex.getComplexAc() + " (" + intactComplex.getAc() + ") [ " + intactComplex.getOrganism().getScientificName() + "] has empty complex GO Xref");
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ public void exportToDR() throws IOException {

ComplexService complexService = ApplicationContextProvider.getBean("complexService");
final String query = "select distinct i from IntactComplex i " +
"where predictedComplex is false";
"where predictedComplex is false or predictedComplex is null";
final String countQuery = "select count(distinct i.ac) from IntactComplex i " +
"where predictedComplex is false";
"where predictedComplex is false or predictedComplex is null";
Iterator<Complex> complexes = complexService.iterateAll(countQuery, query, new HashMap<>());

System.err.println("Complexes to export: " + complexService.countAll(countQuery, new HashMap<>()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,12 @@ protected void processAnnotations(Experiment objectToEnrich, Experiment objectSo
if (objectToEnrich.getPublication() != null){
Publication publication = objectToEnrich.getPublication();
if (publication.getPublicationDate() != null) {
Annotation pubYear = AnnotationUtils.collectFirstAnnotationWithTopic(objectToEnrich.getAnnotations(),
Collection<Annotation> pubYears = AnnotationUtils.collectAllAnnotationsHavingTopic(objectToEnrich.getAnnotations(),
Annotation.PUBLICATION_YEAR_MI, Annotation.PUBLICATION_YEAR);
Annotation pubYear = pubYears.stream()
.filter(ann -> IntactUtils.YEAR_FORMAT_REGEX.matcher(ann.getValue()).matches())
.findFirst()
.orElse(null);
if (pubYear != null){
pubYear.setValue(IntactUtils.YEAR_FORMAT.format(publication.getPublicationDate()));
}
Expand Down
25 changes: 25 additions & 0 deletions intact-orthology-import/importOrtholgy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

#SBATCH --time=02-00:00:00 # walltime
#SBATCH --ntasks=1 # number of tasks
#SBATCH --cpus-per-task=5 # number of CPUs Per Task i.e if your code is multi-threaded
#SBATCH -p research # partition(s)
#SBATCH --mem=32G # memory per node
#SBATCH -J "ORTHOLOG_IMPORT" # job name
#SBATCH -o "/nfs/production/hhe/intact/data/panther/logs/ortholog-import-%j.out" # job output file
#SBATCH --mail-type=ALL
#SBATCH [email protected] # email address
export JAVA_HOME=/hps/software/users/hhe/intact/third-party-softwares/latest_intact_jdk11

# Launches the orthology import batch job through Maven.
# Usage: importOrtholgy.sh PROFILE
#   PROFILE  Maven profile id that is activated together with "import-orthology".

# Exactly one argument (the profile id) is required; diagnostics go to stderr.
if [ $# -ne 1 ]; then
    echo "" >&2
    echo "ERROR: wrong number of parameters ($#)." >&2
    echo "usage: importOrtholgy.sh PROFILE" >&2
    echo "" >&2
    exit 1
fi

PROFILE=$1

echo "Profile: $PROFILE"

# The profile list is quoted so that a value containing whitespace or glob
# characters reaches Maven as a single -P argument.
mvn clean -U install -P "import-orthology,${PROFILE}" -Djob.name=orthologyImport -Dmaven.test.skip
132 changes: 132 additions & 0 deletions intact-orthology-import/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>uk.ac.ebi.intact.dataexchange</groupId>
<artifactId>intact-dataexchange-master</artifactId>
<version>4.1.0-SNAPSHOT</version>
</parent>

<artifactId>intact-orthology-import</artifactId>
<packaging>jar</packaging>
<name>intact-orthology-import</name>


<properties>
<!-- JDBC-style URL assembled from db.* properties that are not defined in this file
     (presumably supplied by the parent POM or the selected build profile; verify). -->
<db.url>${db.protocol}:${db.subprotocol}${db.separator}${db.alias}</db.url>
<!-- NOTE(review): looks like the Hibernate hbm2ddl mode; "none" would mean no schema
     generation or validation. Confirm where this property is consumed. -->
<db.hbm2ddl>none</db.hbm2ddl>
</properties>

<profiles>
<!-- Profile "import-orthology": binds exec-maven-plugin's "exec" goal to the install
     phase so that building with this profile also launches the batch job in a
     separate java process. -->
<profile>
<id>import-orthology</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<!-- No plugin version is declared here; presumably it is managed by the parent POM (verify). -->
<executions>
<execution>
<phase>install</phase>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<executable>java</executable>
<arguments>
<!-- JVM heap settings for the forked process: 12 GiB maximum, 2 GiB initial. -->
<argument>-Xmx12288m</argument>
<argument>-Xms2048m</argument>
<!-- The empty classpath element is expanded by the plugin to the project's classpath. -->
<argument>-classpath</argument>
<classpath/>
<!-- Main class, followed by its two program arguments: the Spring context location
     and the job name taken from the job.name property (e.g. -Djob.name=... on the
     Maven command line). -->
<argument>psidev.psi.mi.jami.batch.MIBatchJobManager</argument>
<argument>classpath*:/META-INF/orthology-import-spring.xml</argument>
<argument>${job.name}</argument>
</arguments>
</configuration>
</plugin>
</plugins>
</build>
</profile>
</profiles>


<dependencies>

<!-- base dependencies -->
<!-- The psi.jami.version, intact.jami.version and spring.version properties used below
     are not defined in this file; presumably they come from the parent POM (verify). -->

<dependency>
<groupId>psidev.psi.mi.jami</groupId>
<artifactId>jami-core</artifactId>
<version>${psi.jami.version}</version>
</dependency>

<dependency>
<groupId>psidev.psi.mi.jami</groupId>
<artifactId>jami-batch</artifactId>
<version>${psi.jami.version}</version>
</dependency>

<dependency>
<groupId>psidev.psi.mi.jami.bridges</groupId>
<artifactId>jami-uniprot</artifactId>
<version>${psi.jami.version}</version>
<!-- jcl-over-slf4j is excluded from this dependency's transitive graph;
     NOTE(review): presumably to avoid a clashing commons-logging bridge; confirm. -->
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>uk.ac.ebi.intact.jami</groupId>
<artifactId>intact-jami</artifactId>
<version>${intact.jami.version}</version>
</dependency>

<!-- Version is pinned in this module rather than taken from a property. -->
<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
<version>1.3.4</version>
</dependency>

<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aspects</artifactId>
<version>${spring.version}</version>
</dependency>

<!-- NOTE(review): Lombok is declared with the default (compile) scope, so it ends up on
     the runtime classpath; it is commonly declared as "provided". Confirm intent. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
</dependency>

<!-- Added dependencies -->

<!-- Provides the tar archive reader (TarArchiveInputStream). -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.21</version>
<scope>compile</scope>
</dependency>

<!-- Provides FileUtils and IOUtils.
     NOTE(review): 2.4 is an old release; consider aligning with a newer version
     (CVE-2021-29425 was fixed in 2.7) after checking the parent's dependency management. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>jakarta.persistence</groupId>
<artifactId>jakarta.persistence-api</artifactId>
<version>2.2.3</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package uk.ac.ebi.intact.ortholog;

import lombok.extern.log4j.Log4j;
import org.apache.commons.io.FileUtils;

import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts (UniProtKB accession, PANTHER identifier) pairs from a text file and stores them
 * as one small file per UniProtKB accession.
 */
@Log4j
public class OrthologsFileParser {

    /** Captures the accession that follows a {@code UniProtKB=} token. */
    private static final Pattern UNIPROT_KB_REGEX = Pattern.compile("UniProtKB=([A-Z0-9]+)");
    /** Matches {@code PTHR} followed by one or more digits. */
    private static final Pattern PANTHER_REGEX = Pattern.compile("PTHR\\d+");

    /** Number of input lines between two parsing progress messages. */
    private static final long LINES_LOG_INTERVAL = 250_000;
    /** Number of written protein files between two saving progress messages. */
    private static final long PROTEINS_LOG_INTERVAL = 25_000;

    /**
     * Parses the input file line by line and writes the pairs found to the output directory.
     * <p>
     * Every UniProtKB accession found on a line is associated with every PANTHER identifier found
     * on the same line; lines without a PANTHER identifier contribute nothing. The output
     * directory is deleted (if present) and recreated, then one file named after each accession
     * is written, containing one {@code accession,pantherId} line per distinct identifier.
     *
     * @param inputFilePath path of the text file to parse
     * @param outputDirPath directory to (re)create and fill with one file per accession
     * @throws IOException if the input cannot be read, or the output directory or files cannot
     *                     be created or written
     */
    public static void parseFileAndSave(String inputFilePath, String outputDirPath) throws IOException {
        log.info("Parsing file...");

        File outputDir = new File(outputDirPath);
        // First, we empty the directory to start clean
        if (outputDir.exists()) {
            FileUtils.deleteDirectory(outputDir);
        }
        // mkdirs() reports failure through its return value only; fail early instead of
        // failing later on the first file write.
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new IOException("Could not create output directory " + outputDir);
        }

        // All matches are collected in a map first to ensure there is no duplication
        Map<String, Set<String>> uniprotAndPTHR = new HashMap<>();
        long linesRead = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(inputFilePath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                linesRead++;
                collectPairs(line, uniprotAndPTHR);

                if (linesRead % LINES_LOG_INTERVAL == 0) {
                    log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
                }
            }
        }

        log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
        log.info("File parsed.");

        log.info("Saving map to files...");

        // Then, we write all the files: one per protein, opened once
        long proteinsSaved = 0;
        long uniprotAndPantherCount = 0;
        for (Map.Entry<String, Set<String>> entry : uniprotAndPTHR.entrySet()) {
            writePairs(outputDir.toPath(), entry.getKey(), entry.getValue());
            uniprotAndPantherCount += entry.getValue().size();
            proteinsSaved++;
            // Progress is counted in proteins (incremented by exactly one per iteration) so that
            // the modulo check fires reliably and the message matches what is counted.
            if (proteinsSaved % PROTEINS_LOG_INTERVAL == 0) {
                log.info(proteinsSaved + " proteins saved");
            }
        }

        log.info("All protein files saved.");
        log.info("Number of Panther identifiers: " + uniprotAndPantherCount);
    }

    /** Adds every (accession, PANTHER identifier) combination found on one line to the map. */
    private static void collectPairs(String line, Map<String, Set<String>> uniprotAndPTHR) {
        Set<String> pantherIds = new HashSet<>();
        Matcher pantherMatcher = PANTHER_REGEX.matcher(line);
        while (pantherMatcher.find()) {
            pantherIds.add(pantherMatcher.group());
        }
        if (pantherIds.isEmpty()) {
            // No identifier on this line: accessions must not be registered with an empty set.
            return;
        }

        Matcher uniprotMatcher = UNIPROT_KB_REGEX.matcher(line);
        while (uniprotMatcher.find()) {
            uniprotAndPTHR.computeIfAbsent(uniprotMatcher.group(1), key -> new HashSet<>())
                    .addAll(pantherIds);
        }
    }

    /** Writes one {@code uniprotId,pantherId} line per identifier to the file named after the accession. */
    private static void writePairs(Path dirPath, String uniprotId, Set<String> pantherIds) throws IOException {
        Path filePath = dirPath.resolve(uniprotId);
        try (FileWriter fileWriter = new FileWriter(filePath.toFile(), true);
             BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) {
            for (String pantherId : pantherIds) {
                bufferedWriter.write(uniprotId + "," + pantherId);
                bufferedWriter.newLine();
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package uk.ac.ebi.intact.ortholog;

import lombok.extern.log4j.Log4j;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.IOUtils;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.zip.GZIPInputStream;

/**
 * Downloads a gzip-compressed tar archive over HTTP and extracts its content to a local file.
 */
@Log4j
public class OrthologsFileReader {

    /**
     * Downloads the archive at {@code url}, decompresses it and writes the archived file to
     * {@code filePath}, replacing any existing content.
     * <p>
     * Directory entries are skipped. If the archive holds several file entries, each one
     * overwrites the previous one, so the last entry is what remains in {@code filePath}.
     *
     * @param url      location of the {@code .tar.gz} resource; must be an HTTP(S) URL
     * @param filePath path of the file that receives the extracted data
     * @throws IOException if the server does not answer with HTTP 200, or if downloading,
     *                     decompressing or writing fails
     */
    public static void decompressGzip(String url, String filePath) throws IOException {
        URL gzipUrl = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) gzipUrl.openConnection();
        // The connection is released on every path, including error responses.
        try {
            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                // Returning normally here would let callers continue with missing or stale data.
                throw new IOException("GZIP returned unexpected response: " + responseCode + " (" + url + ")");
            }

            log.info("Connected to URL.");
            try (InputStream in = connection.getInputStream();
                 GZIPInputStream gis = new GZIPInputStream(in);
                 TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {
                log.info("Decompressing...");
                File outputFile = new File(filePath);
                int filesExtracted = 0;
                TarArchiveEntry entry;
                while ((entry = tis.getNextTarEntry()) != null) {
                    if (entry.isDirectory()) {
                        // A directory entry carries no data; copying it would only truncate
                        // what a previous file entry wrote.
                        continue;
                    }
                    // append = false: write over existing data
                    try (FileOutputStream fos = new FileOutputStream(outputFile, false)) {
                        IOUtils.copy(tis, fos);
                    }
                    filesExtracted++;
                }

                if (filesExtracted == 0) {
                    log.warn("Archive contained no file entry, nothing written to " + filePath);
                } else {
                    if (filesExtracted > 1) {
                        log.warn("Archive contained " + filesExtracted
                                + " file entries, only the last one was kept in " + filePath);
                    }
                    log.info("File decompressed, data in " + filePath);
                }
            }
        } finally {
            connection.disconnect();
            log.info("Disconnected from URL.");
        }
    }
}
Loading

0 comments on commit 55fc22f

Please sign in to comment.