Skip to content

Commit

Permalink
Merge branch 'develop' into export-datasets-on-mitab-format
Browse files Browse the repository at this point in the history
  • Loading branch information
jmedinaebi committed Sep 26, 2024
2 parents a7a1f98 + cd95a06 commit 6a38dd7
Show file tree
Hide file tree
Showing 21 changed files with 1,064 additions and 4 deletions.
25 changes: 25 additions & 0 deletions intact-orthology-import/importOrtholgy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

#SBATCH --time=02-00:00:00 # walltime
#SBATCH --ntasks=1 # number of tasks
#SBATCH --cpus-per-task=5 # number of CPUs Per Task i.e if your code is multi-threaded
#SBATCH -p research # partition(s)
#SBATCH --mem=32G # memory per node
#SBATCH -J "ORTHOLOG_IMPORT" # job name
#SBATCH -o "/nfs/production/hhe/intact/data/panther/logs/ortholog-import-%j.out" # job output file
#SBATCH --mail-type=ALL
#SBATCH [email protected] # email address

# Runs the orthology import job through Maven.
# Takes exactly one argument: the Maven profile to activate in addition to
# import-orthology.

# JDK picked up by Maven and by the java process it launches.
export JAVA_HOME=/hps/software/users/hhe/intact/third-party-softwares/latest_intact_jdk11

# Refuse to run without exactly one argument; otherwise Maven would be started
# with an empty or ambiguous profile list.
if [ $# -ne 1 ]; then
    echo ""
    echo "ERROR: wrong number of parameters ($#)."
    echo "usage: importOrtholgy.sh <profile>"
    echo ""
    exit 1
fi

PROFILE=$1

echo "Profile: $PROFILE"

# The profile list is quoted so that a value containing whitespace or glob
# characters reaches Maven as a single -P argument.
mvn clean -U install -P "import-orthology,${PROFILE}" -Djob.name=orthologyImport -Dmaven.test.skip
132 changes: 132 additions & 0 deletions intact-orthology-import/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>uk.ac.ebi.intact.dataexchange</groupId>
<artifactId>intact-dataexchange-master</artifactId>
<version>4.2.0-SNAPSHOT</version>
</parent>

<artifactId>intact-orthology-import</artifactId>
<packaging>jar</packaging>
<name>intact-orthology-import</name>


<properties>
<!-- JDBC URL assembled from db.* properties that are not defined in this POM
     (presumably supplied by the environment profile passed with -P; verify). -->
<db.url>${db.protocol}:${db.subprotocol}${db.separator}${db.alias}</db.url>
<!-- NOTE(review): presumably feeds Hibernate's hbm2ddl setting, where "none" leaves
     the schema untouched; confirm where db.hbm2ddl is consumed. -->
<db.hbm2ddl>none</db.hbm2ddl>
</properties>

<profiles>
<!-- Launches the orthology import batch job. Activated with -P import-orthology,
     which is how importOrtholgy.sh invokes Maven. -->
<profile>
<id>import-orthology</id>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<!-- exec:exec is bound to the install phase, so the job starts during "mvn install". -->
<phase>install</phase>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<!-- Starts an external java process with a 2 GB initial / 12 GB maximum heap. -->
<executable>java</executable>
<arguments>
<argument>-Xmx12288m</argument>
<argument>-Xms2048m</argument>
<argument>-classpath</argument>
<!-- Empty classpath element: replaced by the plugin with the project's dependency classpath. -->
<classpath/>
<!-- Main class, followed by its two arguments: the Spring context to load and the job name. -->
<argument>psidev.psi.mi.jami.batch.MIBatchJobManager</argument>
<argument>classpath*:/META-INF/orthology-import-spring.xml</argument>
<!-- job.name has no default in this POM; it must be given on the command line (-Djob.name=...). -->
<argument>${job.name}</argument>
</arguments>
</configuration>
</plugin>
</plugins>
</build>
</profile>
</profiles>


<dependencies>

<!-- base dependencies -->

<dependency>
<groupId>psidev.psi.mi.jami</groupId>
<artifactId>jami-core</artifactId>
<version>${psi.jami.version}</version>
</dependency>

<!-- Presumably provides psidev.psi.mi.jami.batch.MIBatchJobManager, the main class
     started by the import-orthology profile; verify. -->
<dependency>
<groupId>psidev.psi.mi.jami</groupId>
<artifactId>jami-batch</artifactId>
<version>${psi.jami.version}</version>
</dependency>

<dependency>
<groupId>psidev.psi.mi.jami.bridges</groupId>
<artifactId>jami-uniprot</artifactId>
<version>${psi.jami.version}</version>
<exclusions>
<!-- NOTE(review): excluded presumably to avoid a logging-bridge conflict on the
     classpath; confirm the reason before removing. -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</exclusion>
</exclusions>
</dependency>

<!-- IntAct JAMI model and DAO layer (IntactDao, IntactProtein) used by OrthologsProteinAssociation. -->
<dependency>
<groupId>uk.ac.ebi.intact.jami</groupId>
<artifactId>intact-jami</artifactId>
<version>${intact.jami.version}</version>
</dependency>

<dependency>
<groupId>org.springframework.retry</groupId>
<artifactId>spring-retry</artifactId>
<version>1.3.4</version>
</dependency>

<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-aspects</artifactId>
<version>${spring.version}</version>
</dependency>

<!-- Supplies the @Log4j annotation used on the classes of this module. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
</dependency>

<!-- Dependencies specific to the orthology import -->

<!-- TarArchiveInputStream, used by OrthologsFileReader to unpack the downloaded archive. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.21</version>
<scope>compile</scope>
</dependency>

<!-- FileUtils and IOUtils, used by OrthologsFileParser and OrthologsFileReader. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- Presumably the source of the javax.persistence.Query type imported by
     OrthologsProteinAssociation; verify against the resolved dependency tree. -->
<dependency>
<groupId>jakarta.persistence</groupId>
<artifactId>jakarta.persistence-api</artifactId>
<version>2.2.3</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package uk.ac.ebi.intact.ortholog;

import lombok.extern.log4j.Log4j;
import org.apache.commons.io.FileUtils;

import java.io.*;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Turns an orthologs text file into one small file per UniProtKB accession.
 *
 * <p>Every input line is scanned for {@code UniProtKB=<accession>} tokens and for Panther
 * identifiers ({@code PTHR} followed by digits). Each accession found on a line is associated
 * with each Panther identifier found on that same line. The output directory then receives one
 * file per accession, named after the accession, with one {@code <accession>,<pantherId>} line
 * per distinct identifier.
 */
@Log4j
public class OrthologsFileParser {

    /** Captures the accession that follows a {@code UniProtKB=} token. */
    private static final Pattern UNIPROT_KB_REGEX = Pattern.compile("UniProtKB=([A-Z0-9]+)");
    /** Matches a Panther identifier: {@code PTHR} followed by one or more digits. */
    private static final Pattern PANTHER_REGEX = Pattern.compile("PTHR\\d+");

    /** Number of input lines between two parsing progress messages. */
    private static final long PARSE_LOG_INTERVAL = 250_000;
    /** Number of written protein files between two saving progress messages. */
    private static final long SAVE_LOG_INTERVAL = 25_000;

    /**
     * Parses the orthologs file and writes the per-protein files.
     *
     * <p>The output directory is deleted first if it exists, then recreated empty.
     *
     * @param inputFilePath path of the orthologs text file to read
     * @param outputDirPath directory that receives one file per UniProtKB accession
     * @throws IOException if the input cannot be read, or the output directory or one of its
     *                     files cannot be created or written
     */
    public static void parseFileAndSave(String inputFilePath, String outputDirPath) throws IOException {
        log.info("Parsing file...");

        File outputDir = new File(outputDirPath);
        // First, we empty the directory to start clean
        if (outputDir.exists()) {
            FileUtils.deleteDirectory(outputDir);
        }
        // mkdirs() returns false on failure; fail here rather than on the first file write.
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new IOException("Could not create output directory: " + outputDir);
        }

        // All matches are collected in a map first so that no pair is written twice
        Map<String, Set<String>> uniprotAndPTHR = parseFile(inputFilePath);
        log.info("File parsed.");

        log.info("Saving map to files...");

        long proteinsSaved = 0;
        long uniprotAndPantherCount = 0;
        for (Map.Entry<String, Set<String>> entry : uniprotAndPTHR.entrySet()) {
            writeProteinFile(outputDir.toPath(), entry.getKey(), entry.getValue());
            uniprotAndPantherCount += entry.getValue().size();
            // Progress is tracked per protein: this counter grows by exactly one per iteration,
            // so the modulo test is guaranteed to hit every interval.
            proteinsSaved++;
            if (proteinsSaved % SAVE_LOG_INTERVAL == 0) {
                log.info(proteinsSaved + " proteins saved");
            }
        }

        log.info("All protein files saved.");
        log.info("Number of Panther identifiers: " + uniprotAndPantherCount);
    }

    /**
     * Reads the input file and groups the Panther identifiers by UniProtKB accession.
     *
     * @param inputFilePath path of the orthologs text file to read
     * @return accession mapped to the distinct Panther identifiers seen on the same lines;
     *         accessions on lines without any Panther identifier are not included
     * @throws IOException if the file cannot be opened or read
     */
    private static Map<String, Set<String>> parseFile(String inputFilePath) throws IOException {
        Map<String, Set<String>> uniprotAndPTHR = new HashMap<>();
        long linesRead = 0;

        try (BufferedReader reader = new BufferedReader(new FileReader(inputFilePath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                linesRead++;

                ArrayList<String> uniprotMatches = new ArrayList<>();
                Matcher uniprotMatcher = UNIPROT_KB_REGEX.matcher(line);
                while (uniprotMatcher.find()) {
                    uniprotMatches.add(uniprotMatcher.group(1));
                }

                Matcher pantherMatcher = PANTHER_REGEX.matcher(line);
                while (pantherMatcher.find()) {
                    String pantherId = pantherMatcher.group();
                    for (String uniprotMatch : uniprotMatches) {
                        uniprotAndPTHR.computeIfAbsent(uniprotMatch, key -> new HashSet<>()).add(pantherId);
                    }
                }

                if (linesRead % PARSE_LOG_INTERVAL == 0) {
                    log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
                }
            }
        }

        log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
        return uniprotAndPTHR;
    }

    /**
     * Writes the file of a single protein, one {@code <uniprotId>,<pantherId>} line per identifier.
     *
     * <p>The file is opened once per protein. The output directory has just been recreated, so
     * the file does not exist yet and there is nothing to append to.
     *
     * @param dirPath    directory in which the file is created
     * @param uniprotId  UniProtKB accession; also used as the file name
     * @param pantherIds Panther identifiers associated with the accession
     * @throws IOException if the file cannot be created or written
     */
    private static void writeProteinFile(Path dirPath, String uniprotId, Set<String> pantherIds) throws IOException {
        Path filePath = dirPath.resolve(uniprotId);
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath.toFile()))) {
            for (String pantherId : pantherIds) {
                bufferedWriter.write(uniprotId + "," + pantherId);
                bufferedWriter.newLine();
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package uk.ac.ebi.intact.ortholog;

import lombok.extern.log4j.Log4j;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.IOUtils;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.zip.GZIPInputStream;

/**
 * Downloads a gzip-compressed tar archive over HTTP and extracts its content to a local file.
 */
@Log4j
public class OrthologsFileReader{

    /**
     * Fetches the archive at {@code url} and writes its content to {@code filePath}.
     *
     * <p>Every entry of the archive is written to the same target file, each one replacing the
     * previous content, so the file ends up holding the last entry of the archive.
     *
     * <p>If the server answers with anything other than HTTP 200, a warning is logged and nothing
     * is written; no exception is thrown in that case.
     *
     * @param url      location of the {@code .tar.gz} archive
     * @param filePath path of the file that receives the extracted data
     * @throws IOException if the connection fails, or the archive cannot be read or the file written
     */
    public static void decompressGzip(String url, String filePath) throws IOException {
        URL gzipUrl = new URL(url);
        HttpURLConnection connection = (HttpURLConnection) gzipUrl.openConnection();
        // The try/finally covers the status check as well, so the connection is released on
        // every path: success, unexpected status, and I/O failure.
        try {
            int responseCode = connection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK) {
                log.warn("GZIP returned unexpected response: " + responseCode);
                return;
            }

            log.info("Connected to URL.");
            File outputFile = new File(filePath);
            try (InputStream in = connection.getInputStream();
                 GZIPInputStream gis = new GZIPInputStream(in);
                 TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {
                log.info("Decompressing...");
                while (tis.getNextTarEntry() != null) {
                    // append=false: existing data in the target file is overwritten
                    try (FileOutputStream fos = new FileOutputStream(outputFile, false)) {
                        IOUtils.copy(tis, fos);
                    }
                }
                log.info("File decompressed, data in " + filePath);
            }
        } finally {
            connection.disconnect();
            log.info("Disconnected from URL.");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package uk.ac.ebi.intact.ortholog;

import lombok.extern.log4j.Log4j;
import uk.ac.ebi.intact.jami.dao.IntactDao;
import uk.ac.ebi.intact.jami.model.extension.IntactProtein;
import javax.annotation.Resource;
import javax.persistence.Query;
import java.io.*;
import java.nio.file.Path;
import java.util.*;

/**
 * Database lookups for IntAct proteins, plus the matching of a protein against the
 * per-protein Panther files found in a directory.
 */
@Log4j
public class OrthologsProteinAssociation {

    @Resource(name="intactDao")
    private final IntactDao intactDao;

    /**
     * @param intactDao DAO whose entity manager is used for all queries of this class
     */
    public OrthologsProteinAssociation(IntactDao intactDao) {
        this.intactDao = intactDao;
    }

    /**
     * Runs a native SQL query returning, in ascending order, the numeric part of the AC
     * (the AC with {@code EBI-} removed) of every interactor whose category is {@code protein}.
     *
     * @return the result list of the query
     */
    public List<Integer> getProteinAcs() {
        return intactDao.getEntityManager()
                .createNativeQuery("select CAST(REPLACE(ac,'EBI-','') as integer) as numberAC from intact.ia_interactor p where category = 'protein' order by numberAC asc")
                .getResultList();
    }

    /**
     * Loads the proteins whose numeric AC (the AC with {@code EBI-} removed) lies between the
     * two bounds, using the {@code BETWEEN} operator.
     *
     * @param startAc lower bound of the numeric AC range
     * @param endAc   upper bound of the numeric AC range
     * @return the result list of the query
     */
    public List<IntactProtein> fetchProteins(Integer startAc, Integer endAc) {
        return intactDao.getEntityManager()
                .createQuery("select p FROM IntactProtein p where CAST(REPLACE(ac,'EBI-','') as integer) BETWEEN :startAc and :endAc")
                .setParameter("startAc", startAc)
                .setParameter("endAc", endAc)
                .getResultList();
    }

    /**
     * Collects the Panther identifiers recorded for a protein.
     *
     * <p>The identifiers are read from the file named after the protein's UniProtKB value inside
     * {@code dirPath}. Only lines made of exactly two comma-separated fields, the first of which
     * equals that value, contribute their second field.
     *
     * @param dirPath directory holding the per-protein files
     * @param protein protein to look up
     * @return the identifiers found; empty when the protein has no UniProtKB value or no file exists
     * @throws IOException if the file exists but cannot be read
     */
    public static Collection<String> associateOneProteinToPantherIds(String dirPath, IntactProtein protein) throws IOException {
        List<String> matches = new ArrayList<>();

        String uniprotAc = protein.getUniprotkb();
        if (uniprotAc == null) {
            return matches;
        }

        File proteinFile = Path.of(dirPath).resolve(uniprotAc).toFile();
        if (!proteinFile.exists()) {
            return matches;
        }

        try (BufferedReader reader = new BufferedReader(new FileReader(proteinFile))) {
            for (String row = reader.readLine(); row != null; row = reader.readLine()) {
                String[] fields = row.split(",");
                if (fields.length == 2 && fields[0].equals(uniprotAc)) {
                    matches.add(fields[1]);
                }
            }
        }
        return matches;
    }
}
Loading

0 comments on commit 6a38dd7

Please sign in to comment.