-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' into export-datasets-on-mitab-format
- Loading branch information
Showing
21 changed files
with
1,064 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#!/bin/bash
#
# SLURM batch job that runs the orthology import through Maven.
#
# Takes exactly one argument: the Maven profile to activate alongside
# the `import-orthology` profile.

#SBATCH --time=02-00:00:00   # walltime
#SBATCH --ntasks=1   # number of tasks
#SBATCH --cpus-per-task=5   # number of CPUs Per Task i.e if your code is multi-threaded
#SBATCH -p research   # partition(s)
#SBATCH --mem=32G   # memory per node
#SBATCH -J "ORTHOLOG_IMPORT"   # job name
#SBATCH -o "/nfs/production/hhe/intact/data/panther/logs/ortholog-import-%j.out"   # job output file
#SBATCH --mail-type=ALL
# NOTE(review): the next directive looks redacted in this copy (presumably
# `--mail-user=<address>`); confirm against the real script before submitting.
#SBATCH [email protected] # email address
export JAVA_HOME=/hps/software/users/hhe/intact/third-party-softwares/latest_intact_jdk11

# Exactly one argument (the Maven profile) is required; report problems on stderr.
if [ $# -ne 1 ]; then
    echo "" >&2
    echo "ERROR: wrong number of parameters ($#)." >&2
    echo "usage: $(basename "$0") <maven-profile>" >&2
    echo "" >&2
    exit 1
fi

PROFILE=$1

echo "Profile: $PROFILE"

# Quote the profile list so the argument reaches Maven as a single word
# (no word splitting or globbing of the user-supplied value).
mvn clean -U install -P "import-orthology,${PROFILE}" -Djob.name=orthologyImport -Dmaven.test.skip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>uk.ac.ebi.intact.dataexchange</groupId>
        <artifactId>intact-dataexchange-master</artifactId>
        <version>4.2.0-SNAPSHOT</version>
    </parent>

    <artifactId>intact-orthology-import</artifactId>
    <packaging>jar</packaging>
    <name>intact-orthology-import</name>

    <properties>
        <!-- The db.* pieces are not defined in this file; they are expected to come from
             the parent POM or from the profile selected on the command line. -->
        <db.url>${db.protocol}:${db.subprotocol}${db.separator}${db.alias}</db.url>
        <db.hbm2ddl>none</db.hbm2ddl>
    </properties>

    <profiles>
        <!-- Runs the batch job named by -Djob.name during the install phase. -->
        <profile>
            <id>import-orthology</id>
            <build>
                <plugins>
                    <plugin>
                        <groupId>org.codehaus.mojo</groupId>
                        <artifactId>exec-maven-plugin</artifactId>
                        <executions>
                            <execution>
                                <phase>install</phase>
                                <goals>
                                    <goal>exec</goal>
                                </goals>
                            </execution>
                        </executions>
                        <configuration>
                            <executable>java</executable>
                            <arguments>
                                <argument>-Xmx12288m</argument>
                                <argument>-Xms2048m</argument>
                                <argument>-classpath</argument>
                                <classpath/>
                                <!-- main class, Spring context location, job to launch -->
                                <argument>psidev.psi.mi.jami.batch.MIBatchJobManager</argument>
                                <argument>classpath*:/META-INF/orthology-import-spring.xml</argument>
                                <argument>${job.name}</argument>
                            </arguments>
                        </configuration>
                    </plugin>
                </plugins>
            </build>
        </profile>
    </profiles>

    <dependencies>

        <!-- base dependencies -->

        <dependency>
            <groupId>psidev.psi.mi.jami</groupId>
            <artifactId>jami-core</artifactId>
            <version>${psi.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>psidev.psi.mi.jami</groupId>
            <artifactId>jami-batch</artifactId>
            <version>${psi.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>psidev.psi.mi.jami.bridges</groupId>
            <artifactId>jami-uniprot</artifactId>
            <version>${psi.jami.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>jcl-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>uk.ac.ebi.intact.jami</groupId>
            <artifactId>intact-jami</artifactId>
            <version>${intact.jami.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.retry</groupId>
            <artifactId>spring-retry</artifactId>
            <version>1.3.4</version>
        </dependency>

        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-aspects</artifactId>
            <version>${spring.version}</version>
        </dependency>

        <!-- Lombok only generates code at compile time, so it is kept off the runtime
             classpath and is not propagated to modules depending on this one. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.30</version>
            <scope>provided</scope>
        </dependency>

        <!-- Added dependencies -->

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-compress</artifactId>
            <version>1.21</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
        <dependency>
            <groupId>jakarta.persistence</groupId>
            <artifactId>jakarta.persistence-api</artifactId>
            <version>2.2.3</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
</project>
89 changes: 89 additions & 0 deletions
89
intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package uk.ac.ebi.intact.ortholog; | ||
|
||
import lombok.extern.log4j.Log4j; | ||
import org.apache.commons.io.FileUtils; | ||
|
||
import java.io.*; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
@Log4j | ||
public class OrthologsFileParser { | ||
|
||
private static final Pattern UNIPROT_KB_REGEX = Pattern.compile("UniProtKB=([A-Z0-9]+)"); | ||
private static final Pattern PANTHER_REGEX = Pattern.compile("PTHR\\d+"); | ||
|
||
public static void parseFileAndSave(String inputFilePath, String outputDirPath) throws IOException { | ||
log.info("Parsing file..."); | ||
|
||
File outputDir = new File(outputDirPath); | ||
// First, we empty de directory to start clean | ||
if (outputDir.exists()) { | ||
FileUtils.deleteDirectory(outputDir); | ||
} | ||
outputDir.mkdirs(); | ||
long linesRead = 0; | ||
|
||
// First we store all matches in a map to ensure there's no duplication | ||
Map<String, Set<String>> uniprotAndPTHR = new HashMap<>(); | ||
try (BufferedReader reader = new BufferedReader(new FileReader(inputFilePath))) { | ||
String line; | ||
while ((line = reader.readLine()) != null) { | ||
linesRead++; | ||
ArrayList<String> uniprotMatches = new ArrayList<>(); | ||
|
||
Matcher uniprotMatcher = UNIPROT_KB_REGEX.matcher(line); | ||
Matcher pantherMatcher = PANTHER_REGEX.matcher(line); | ||
|
||
while (uniprotMatcher.find()) { | ||
uniprotMatches.add(uniprotMatcher.group(1)); | ||
} | ||
while (pantherMatcher.find()) { | ||
for (String uniprotMatch : uniprotMatches) { | ||
uniprotAndPTHR.putIfAbsent(uniprotMatch, new HashSet<>()); | ||
uniprotAndPTHR.get(uniprotMatch).add(pantherMatcher.group()); | ||
} | ||
} | ||
|
||
if (linesRead % 250_000 == 0) { | ||
log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read"); | ||
} | ||
} | ||
} | ||
|
||
log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read"); | ||
log.info("File parsed."); | ||
|
||
log.info("Saving map to files..."); | ||
|
||
// Then, we write all the files | ||
long uniprotAndPantherCount = 0; | ||
for (String uniprotMatch : uniprotAndPTHR.keySet()) { | ||
for (String pantherMatch : uniprotAndPTHR.get(uniprotMatch)) { | ||
writePair(outputDir.toPath(), uniprotMatch, pantherMatch); | ||
} | ||
uniprotAndPantherCount += uniprotAndPTHR.get(uniprotMatch).size(); | ||
if (uniprotAndPantherCount % 25_000 == 0) { | ||
log.info(uniprotAndPantherCount + " proteins saved"); | ||
} | ||
} | ||
|
||
log.info("All protein files saved."); | ||
log.info("Number of Panther identifiers: " + uniprotAndPantherCount); | ||
} | ||
|
||
private static void writePair(Path dirPath, String uniprotId, String pantherId) throws IOException { | ||
Path filePath = dirPath.resolve(uniprotId); | ||
try (FileWriter fileWriter = new FileWriter(filePath.toFile(), true); | ||
BufferedWriter bufferedWriter = new BufferedWriter(fileWriter)) { | ||
bufferedWriter.write(uniprotId + "," + pantherId); | ||
bufferedWriter.newLine(); | ||
} | ||
} | ||
} |
43 changes: 43 additions & 0 deletions
43
intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package uk.ac.ebi.intact.ortholog; | ||
|
||
import lombok.extern.log4j.Log4j; | ||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; | ||
import org.apache.commons.io.IOUtils; | ||
import java.io.*; | ||
import java.net.HttpURLConnection; | ||
import java.net.URL; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
@Log4j | ||
public class OrthologsFileReader{ | ||
|
||
public static void decompressGzip(String url, String filePath) throws IOException { | ||
URL gzipUrl = new URL(url); | ||
HttpURLConnection connection = (HttpURLConnection) gzipUrl.openConnection(); | ||
int responseCode = connection.getResponseCode(); | ||
|
||
if (responseCode == HttpURLConnection.HTTP_OK) { | ||
log.info("Connected to URL."); | ||
try (InputStream in = connection.getInputStream(); | ||
GZIPInputStream gis = new GZIPInputStream(in); | ||
TarArchiveInputStream tis = new TarArchiveInputStream(gis)) { | ||
log.info("Decompressing..."); | ||
while (tis.getNextTarEntry() != null) { | ||
File outputFile = new File(filePath); | ||
try (FileOutputStream fos = new FileOutputStream(outputFile, false)) { | ||
// the false make it write over existing data | ||
IOUtils.copy(tis, fos); | ||
} | ||
} | ||
log.info("File decompressed, data in " + filePath); | ||
} | ||
finally { | ||
connection.disconnect(); | ||
log.info("Disconnected from URL."); | ||
} | ||
} | ||
else { | ||
log.info("GZIP returned unexpected response: " + responseCode); | ||
} | ||
} | ||
} |
60 changes: 60 additions & 0 deletions
60
...orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsProteinAssociation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package uk.ac.ebi.intact.ortholog; | ||
|
||
import lombok.extern.log4j.Log4j; | ||
import uk.ac.ebi.intact.jami.dao.IntactDao; | ||
import uk.ac.ebi.intact.jami.model.extension.IntactProtein; | ||
import javax.annotation.Resource; | ||
import javax.persistence.Query; | ||
import java.io.*; | ||
import java.nio.file.Path; | ||
import java.util.*; | ||
|
||
@Log4j | ||
public class OrthologsProteinAssociation { | ||
|
||
@Resource(name="intactDao") | ||
private final IntactDao intactDao; | ||
|
||
public OrthologsProteinAssociation(IntactDao intactDao) { | ||
this.intactDao = intactDao; | ||
} | ||
|
||
public List<Integer> getProteinAcs() { | ||
String sqlQuery = "select CAST(REPLACE(ac,'EBI-','') as integer) as numberAC from intact.ia_interactor p where category = 'protein' order by numberAC asc"; | ||
Query query = intactDao.getEntityManager().createNativeQuery(sqlQuery); | ||
return query.getResultList(); | ||
} | ||
|
||
public List<IntactProtein> fetchProteins(Integer startAc, Integer endAc) { | ||
String sqlQuery = "select p FROM IntactProtein p where CAST(REPLACE(ac,'EBI-','') as integer) BETWEEN :startAc and :endAc"; | ||
Query query = intactDao.getEntityManager().createQuery(sqlQuery); | ||
query.setParameter("startAc", startAc); | ||
query.setParameter("endAc", endAc); | ||
return query.getResultList(); | ||
} | ||
|
||
public static Collection<String> associateOneProteinToPantherIds(String dirPath, IntactProtein protein) throws IOException { | ||
String proteinAc = protein.getUniprotkb(); | ||
List<String> pantherIds = new ArrayList<>(); | ||
if (proteinAc != null) { | ||
Path filePath = Path.of(dirPath).resolve(protein.getUniprotkb()); | ||
if (filePath.toFile().exists()) { | ||
try (BufferedReader reader = new BufferedReader(new FileReader(filePath.toFile()))) { | ||
String line; | ||
while ((line = reader.readLine()) != null) { | ||
String[] parts = line.split(","); | ||
if (parts.length == 2) { | ||
String proteinId = parts[0]; | ||
if (proteinId.equals(protein.getUniprotkb())) { | ||
pantherIds.add(parts[1]); | ||
} | ||
|
||
} | ||
} | ||
} | ||
return pantherIds; | ||
} | ||
} | ||
return pantherIds; | ||
} | ||
} |
Oops, something went wrong.