-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' into store-interactor-xrefs-in-solr
- Loading branch information
Showing
25 changed files
with
1,034 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/bin/bash

# SLURM batch job that builds the module and runs the orthology import through Maven.
# Takes exactly one argument: the Maven profile to activate alongside "import-orthology".

#SBATCH --time=02-00:00:00 # walltime
#SBATCH --ntasks=1 # number of tasks
#SBATCH --cpus-per-task=5 # number of CPUs Per Task i.e if your code is multi-threaded
#SBATCH -p research # partition(s)
#SBATCH --mem=32G # memory per node
#SBATCH -J "ORTHOLOG_IMPORT" # job name
#SBATCH -o "/nfs/production/hhe/intact/data/panther/logs/ortholog-import-%j.out" # job output file
#SBATCH --mail-type=ALL
# NOTE(review): the directive below has no option name (presumably --mail-user=<address>,
# mangled by e-mail obfuscation) - confirm against the original script before submitting.
#SBATCH [email protected] # email address
export JAVA_HOME=/hps/software/users/hhe/intact/third-party-softwares/latest_intact_jdk11

# Refuse to run without exactly one argument rather than launching Maven with an empty profile.
if [ "$#" -ne 1 ]; then
  echo ""
  echo "ERROR: wrong number of parameters ($#)."
  echo "Expected exactly one parameter: the Maven profile to activate."
  echo ""
  exit 1
fi

PROFILE="$1"

echo "Profile: $PROFILE"

# The profile list is quoted so the value reaches Maven as a single word, whatever it contains.
mvn clean -U install -P "import-orthology,${PROFILE}" -Djob.name=orthologyImport -Dmaven.test.skip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>uk.ac.ebi.intact.dataexchange</groupId>
        <artifactId>intact-dataexchange-master</artifactId>
        <version>4.1.0-SNAPSHOT</version>
    </parent>

    <artifactId>intact-orthology-import</artifactId>
    <packaging>jar</packaging>
    <name>intact-orthology-import</name>

    <properties>
        <!-- The db.* pieces are not defined in this POM; they must be supplied from outside it. -->
        <db.url>${db.protocol}:${db.subprotocol}${db.separator}${db.alias}</db.url>
        <db.hbm2ddl>none</db.hbm2ddl>
    </properties>

    <profiles>
        <!-- Runs the batch job named by ${job.name} in a forked JVM during the install phase. -->
        <profile>
            <id>import-orthology</id>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.codehaus.mojo</groupId>
                        <artifactId>exec-maven-plugin</artifactId>
                        <executions>
                            <execution>
                                <phase>install</phase>
                                <goals>
                                    <goal>exec</goal>
                                </goals>
                            </execution>
                        </executions>
                        <configuration>
                            <executable>java</executable>
                            <arguments>
                                <argument>-Xmx12288m</argument>
                                <argument>-Xms2048m</argument>
                                <argument>-classpath</argument>
                                <classpath/>
                                <argument>psidev.psi.mi.jami.batch.MIBatchJobManager</argument>
                                <argument>classpath*:/META-INF/orthology-import-spring.xml</argument>
                                <argument>${job.name}</argument>
                            </arguments>
                        </configuration>
                    </plugin>
                </plugins>
            </build>
        </profile>
    </profiles>

    <dependencies>

        <!-- base dependencies -->

        <dependency>
            <groupId>psidev.psi.mi.jami</groupId>
            <artifactId>jami-core</artifactId>
            <version>${psi.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>psidev.psi.mi.jami</groupId>
            <artifactId>jami-batch</artifactId>
            <version>${psi.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>psidev.psi.mi.jami.bridges</groupId>
            <artifactId>jami-uniprot</artifactId>
            <version>${psi.jami.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>jcl-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>uk.ac.ebi.intact.jami</groupId>
            <artifactId>intact-jami</artifactId>
            <version>${intact.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.retry</groupId>
            <artifactId>spring-retry</artifactId>
            <version>1.3.4</version>
        </dependency>

        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aspects</artifactId>
            <version>${spring.version}</version>
        </dependency>

        <!-- Lombok only generates code at compile time, so it is kept off the runtime classpath. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.30</version>
            <scope>provided</scope>
        </dependency>

        <!-- Added dependencies -->

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.21</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
        <dependency>
            <groupId>jakarta.persistence</groupId>
            <artifactId>jakarta.persistence-api</artifactId>
            <version>2.2.3</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
</project>
89 changes: 89 additions & 0 deletions
89
intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package uk.ac.ebi.intact.ortholog; | ||
|
||
import lombok.extern.log4j.Log4j;
import org.apache.commons.io.FileUtils;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
|
||
@Log4j | ||
public class OrthologsFileParser { | ||
|
||
private static final Pattern UNIPROT_KB_REGEX = Pattern.compile("UniProtKB=([A-Z0-9]+)"); | ||
private static final Pattern PANTHER_REGEX = Pattern.compile("PTHR\\d+"); | ||
|
||
public static void parseFileAndSave(String inputFilePath, String outputDirPath) throws IOException { | ||
log.info("Parsing file..."); | ||
|
||
File outputDir = new File(outputDirPath); | ||
// First, we empty de directory to start clean | ||
if (outputDir.exists()) { | ||
FileUtils.deleteDirectory(outputDir); | ||
} | ||
outputDir.mkdirs(); | ||
long linesRead = 0; | ||
|
||
// First we store all matches in a map to ensure there's no duplication | ||
Map<String, Set<String>> uniprotAndPTHR = new HashMap<>(); | ||
try (BufferedReader reader = new BufferedReader(new FileReader(inputFilePath))) { | ||
String line; | ||
while ((line = reader.readLine()) != null) { | ||
linesRead++; | ||
ArrayList<String> uniprotMatches = new ArrayList<>(); | ||
|
||
Matcher uniprotMatcher = UNIPROT_KB_REGEX.matcher(line); | ||
Matcher pantherMatcher = PANTHER_REGEX.matcher(line); | ||
|
||
while (uniprotMatcher.find()) { | ||
uniprotMatches.add(uniprotMatcher.group(1)); | ||
} | ||
while (pantherMatcher.find()) { | ||
for (String uniprotMatch : uniprotMatches) { | ||
uniprotAndPTHR.putIfAbsent(uniprotMatch, new HashSet<>()); | ||
uniprotAndPTHR.get(uniprotMatch).add(pantherMatcher.group()); | ||
} | ||
} | ||
|
||
if (linesRead % 250_000 == 0) { | ||
log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read"); | ||
} | ||
} | ||
} | ||
|
||
log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read"); | ||
log.info("File parsed."); | ||
|
||
log.info("Saving map to files..."); | ||
|
||
// Then, we write all the files | ||
long uniprotAndPantherCount = 0; | ||
for (String uniprotMatch : uniprotAndPTHR.keySet()) { | ||
for (String pantherMatch : uniprotAndPTHR.get(uniprotMatch)) { | ||
writePair(outputDir.toPath(), uniprotMatch, pantherMatch); | ||
} | ||
uniprotAndPantherCount += uniprotAndPTHR.get(uniprotMatch).size(); | ||
if (uniprotAndPantherCount % 25_000 == 0) { | ||
log.info(uniprotAndPantherCount + " proteins saved"); | ||
} | ||
} | ||
|
||
log.info("All protein files saved."); | ||
log.info("Number of Panther identifiers: " + uniprotAndPantherCount); | ||
} | ||
|
||
private static void writePair(Path dirPath, String uniprotId, String pantherId) throws IOException { | ||
Path filePath = dirPath.resolve(uniprotId); | ||
try (FileWriter fileWriter = new FileWriter(filePath.toFile(), true); | ||
BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) { | ||
bufferedWriter.write(uniprotId + "," + pantherId); | ||
bufferedWriter.newLine(); | ||
} | ||
} | ||
} |
43 changes: 43 additions & 0 deletions
43
intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package uk.ac.ebi.intact.ortholog; | ||
|
||
import lombok.extern.log4j.Log4j; | ||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; | ||
import org.apache.commons.io.IOUtils; | ||
import java.io.*; | ||
import java.net.HttpURLConnection; | ||
import java.net.URL; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
@Log4j | ||
public class OrthologsFileReader{ | ||
|
||
public static void decompressGzip(String url, String filePath) throws IOException { | ||
URL gzipUrl = new URL(url); | ||
HttpURLConnection connection = (HttpURLConnection) gzipUrl.openConnection(); | ||
int responseCode = connection.getResponseCode(); | ||
|
||
if (responseCode == HttpURLConnection.HTTP_OK) { | ||
log.info("Connected to URL."); | ||
try (InputStream in = connection.getInputStream(); | ||
GZIPInputStream gis = new GZIPInputStream(in); | ||
TarArchiveInputStream tis = new TarArchiveInputStream(gis)) { | ||
log.info("Decompressing..."); | ||
while (tis.getNextTarEntry() != null) { | ||
File outputFile = new File(filePath); | ||
try (FileOutputStream fos = new FileOutputStream(outputFile, false)) { | ||
// the false make it write over existing data | ||
IOUtils.copy(tis, fos); | ||
} | ||
} | ||
log.info("File decompressed, data in " + filePath); | ||
} | ||
finally { | ||
connection.disconnect(); | ||
log.info("Disconnected from URL."); | ||
} | ||
} | ||
else { | ||
log.info("GZIP returned unexpected response: " + responseCode); | ||
} | ||
} | ||
} |
Oops, something went wrong.