Merge branch 'develop' into export-datasets-on-mitab-format

EBI-IntAct · Sep 26, 2024 · 6a38dd7 · 6a38dd7
2 parents a7a1f98 + cd95a06
commit 6a38dd7
Show file tree

Hide file tree

Showing 21 changed files with 1,064 additions and 4 deletions.
diff --git a/intact-orthology-import/importOrtholgy.sh b/intact-orthology-import/importOrtholgy.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+#SBATCH --time=02-00:00:00   # walltime
+#SBATCH --ntasks=1   # number of tasks
+#SBATCH --cpus-per-task=5   # number of CPUs Per Task i.e if your code is multi-threaded
+#SBATCH -p research   # partition(s)
+#SBATCH --mem=32G   # memory per node
+#SBATCH -J "ORTHOLOG_IMPORT"   # job name
+#SBATCH -o "/nfs/production/hhe/intact/data/panther/logs/ortholog-import-%j.out"   # job output file
+#SBATCH --mail-type=ALL
+#SBATCH [email protected]   # email address
+export JAVA_HOME=/hps/software/users/hhe/intact/third-party-softwares/latest_intact_jdk11
+
+if [ $# -ne 1 ]; then
+      echo ""
+      echo "ERROR: wrong number of parameters ($#)."
+      echo ""
+      exit 1
+fi
+
+PROFILE=$1;
+
+echo "Profile: $PROFILE"
+
+mvn clean -U install -P import-orthology,${PROFILE} -Djob.name=orthologyImport -Dmaven.test.skip
diff --git a/intact-orthology-import/pom.xml b/intact-orthology-import/pom.xml
@@ -0,0 +1,132 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Maven module for the IntAct orthology import.
+    The `import-orthology` profile launches the batch job manager during the install phase.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>uk.ac.ebi.intact.dataexchange</groupId>
+        <artifactId>intact-dataexchange-master</artifactId>
+        <version>4.2.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>intact-orthology-import</artifactId>
+    <packaging>jar</packaging>
+    <name>intact-orthology-import</name>
+
+
+    <properties>
+        <!-- Database URL assembled from db.* properties that are not defined in this file;
+             presumably they come from the additional profile passed on the command line. -->
+        <db.url>${db.protocol}:${db.subprotocol}${db.separator}${db.alias}</db.url>
+        <db.hbm2ddl>none</db.hbm2ddl>
+    </properties>
+
+    <profiles>
+        <!-- Runs the import: binds exec:exec to the install phase and starts a JVM on the
+             module classpath with the batch job manager, the Spring configuration location
+             and the job name given through -Djob.name. -->
+        <profile>
+            <id>import-orthology</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.codehaus.mojo</groupId>
+                        <artifactId>exec-maven-plugin</artifactId>
+                        <executions>
+                            <execution>
+                                <phase>install</phase>
+                                <goals>
+                                    <goal>exec</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                        <configuration>
+                            <executable>java</executable>
+                            <arguments>
+                                <!-- JVM heap: 12288 MB maximum, 2048 MB initial -->
+                                <argument>-Xmx12288m</argument>
+                                <argument>-Xms2048m</argument>
+                                <argument>-classpath</argument>
+                                <classpath/>
+                                <argument>psidev.psi.mi.jami.batch.MIBatchJobManager</argument>
+                                <argument>classpath*:/META-INF/orthology-import-spring.xml</argument>
+                                <argument>${job.name}</argument>
+                            </arguments>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+
+
+    <dependencies>
+
+        <!--        base dependencies -->
+
+        <dependency>
+            <groupId>psidev.psi.mi.jami</groupId>
+            <artifactId>jami-core</artifactId>
+            <version>${psi.jami.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>psidev.psi.mi.jami</groupId>
+            <artifactId>jami-batch</artifactId>
+            <version>${psi.jami.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>psidev.psi.mi.jami.bridges</groupId>
+            <artifactId>jami-uniprot</artifactId>
+            <version>${psi.jami.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>jcl-over-slf4j</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+
+        <dependency>
+            <groupId>uk.ac.ebi.intact.jami</groupId>
+            <artifactId>intact-jami</artifactId>
+            <version>${intact.jami.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework.retry</groupId>
+            <artifactId>spring-retry</artifactId>
+            <version>1.3.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-aspects</artifactId>
+            <version>${spring.version}</version>
+        </dependency>
+
+        <!-- Lombok only generates code at compile time, so it is kept off the
+             runtime classpath and is not passed on to dependants. -->
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+            <version>1.18.30</version>
+            <scope>provided</scope>
+        </dependency>
+
+        <!--        Added dependencies -->
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-compress</artifactId>
+            <version>1.21</version>
+            <scope>compile</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.4</version>
+        </dependency>
+        <dependency>
+            <groupId>jakarta.persistence</groupId>
+            <artifactId>jakarta.persistence-api</artifactId>
+            <version>2.2.3</version>
+            <scope>compile</scope>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileParser.java b/intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileParser.java
@@ -0,0 +1,89 @@
+package uk.ac.ebi.intact.ortholog;
+
+import lombok.extern.log4j.Log4j;
+import org.apache.commons.io.FileUtils;
+
+import java.io.*;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Extracts UniProtKB accession / Panther family identifier pairs from an orthologs file
+ * and stores them as one small file per UniProtKB accession.
+ */
+@Log4j
+public class OrthologsFileParser {
+
+    private static final Pattern UNIPROT_KB_REGEX = Pattern.compile("UniProtKB=([A-Z0-9]+)");
+    private static final Pattern PANTHER_REGEX = Pattern.compile("PTHR\\d+");
+
+    /**
+     * Parses the input file and writes, for every UniProtKB accession found, a file named
+     * after the accession that holds one {@code accession,pantherId} line per Panther identifier.
+     * The output directory is deleted first if it exists, then created again.
+     *
+     * @param inputFilePath path of the orthologs file to read
+     * @param outputDirPath directory that receives the per-protein files
+     * @throws IOException if the input cannot be read, the output directory cannot be
+     *                     created, or a protein file cannot be written
+     */
+    public static void parseFileAndSave(String inputFilePath, String outputDirPath) throws IOException {
+        log.info("Parsing file...");
+
+        File outputDir = new File(outputDirPath);
+        // First, we empty the directory to start clean
+        if (outputDir.exists()) {
+            FileUtils.deleteDirectory(outputDir);
+        }
+        // Fail here with a clear message instead of on the first protein file written.
+        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
+            throw new IOException("Could not create output directory: " + outputDir);
+        }
+
+        // All matches are first gathered in a map so that no pair is written twice
+        Map<String, Set<String>> uniprotAndPTHR = readAssociations(inputFilePath);
+        log.info("File parsed.");
+
+        log.info("Saving map to files...");
+
+        long proteinsSaved = 0;
+        long uniprotAndPantherCount = 0;
+        for (Map.Entry<String, Set<String>> entry : uniprotAndPTHR.entrySet()) {
+            writeProteinFile(outputDir.toPath(), entry.getKey(), entry.getValue());
+            uniprotAndPantherCount += entry.getValue().size();
+            proteinsSaved++;
+            // Progress is tracked per protein so that every 25,000th protein is reported.
+            if (proteinsSaved % 25_000 == 0) {
+                log.info(proteinsSaved + " proteins saved");
+            }
+        }
+
+        log.info("All protein files saved.");
+        log.info("Number of Panther identifiers: " + uniprotAndPantherCount);
+    }
+
+    /**
+     * Reads the input file line by line and associates every UniProtKB accession found
+     * on a line with every Panther identifier found on that same line.
+     */
+    private static Map<String, Set<String>> readAssociations(String inputFilePath) throws IOException {
+        Map<String, Set<String>> uniprotAndPTHR = new HashMap<>();
+        long linesRead = 0;
+        try (BufferedReader reader = new BufferedReader(new FileReader(inputFilePath))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                linesRead++;
+
+                ArrayList<String> uniprotMatches = new ArrayList<>();
+                Matcher uniprotMatcher = UNIPROT_KB_REGEX.matcher(line);
+                while (uniprotMatcher.find()) {
+                    uniprotMatches.add(uniprotMatcher.group(1));
+                }
+
+                Matcher pantherMatcher = PANTHER_REGEX.matcher(line);
+                while (pantherMatcher.find()) {
+                    String pantherMatch = pantherMatcher.group();
+                    for (String uniprotMatch : uniprotMatches) {
+                        uniprotAndPTHR.computeIfAbsent(uniprotMatch, key -> new HashSet<>()).add(pantherMatch);
+                    }
+                }
+
+                if (linesRead % 250_000 == 0) {
+                    log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
+                }
+            }
+        }
+        log.info(linesRead + " lines read, " + uniprotAndPTHR.size() + " proteins read");
+        return uniprotAndPTHR;
+    }
+
+    /**
+     * Writes the file of one protein: one {@code uniprotId,pantherId} line per Panther identifier.
+     * The file is opened once for all of the protein's identifiers.
+     */
+    private static void writeProteinFile(Path dirPath, String uniprotId, Set<String> pantherIds) throws IOException {
+        Path filePath = dirPath.resolve(uniprotId);
+        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath.toFile()))) {
+            for (String pantherId : pantherIds) {
+                bufferedWriter.write(uniprotId + "," + pantherId);
+                bufferedWriter.newLine();
+            }
+        }
+    }
+}
diff --git a/intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileReader.java b/intact-orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsFileReader.java
@@ -0,0 +1,43 @@
+package uk.ac.ebi.intact.ortholog;
+
+import lombok.extern.log4j.Log4j;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.io.IOUtils;
+import java.io.*;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.zip.GZIPInputStream;
+
+@Log4j
+public class OrthologsFileReader{
+
+    public static void decompressGzip(String url, String filePath) throws IOException {
+        URL gzipUrl = new URL(url);
+        HttpURLConnection connection = (HttpURLConnection) gzipUrl.openConnection();
+        int responseCode = connection.getResponseCode();
+
+        if (responseCode == HttpURLConnection.HTTP_OK) {
+            log.info("Connected to URL.");
+            try (InputStream in = connection.getInputStream();
+                 GZIPInputStream gis = new GZIPInputStream(in);
+                 TarArchiveInputStream tis = new TarArchiveInputStream(gis)) {
+                log.info("Decompressing...");
+                while (tis.getNextTarEntry() != null) {
+                    File outputFile = new File(filePath);
+                    try (FileOutputStream fos = new FileOutputStream(outputFile, false)) {
+                        // the false make it write over existing data
+                        IOUtils.copy(tis, fos);
+                    }
+                }
+                log.info("File decompressed, data in " + filePath);
+            }
+            finally {
+                connection.disconnect();
+                log.info("Disconnected from URL.");
+            }
+        }
+        else {
+            log.info("GZIP returned unexpected response: " + responseCode);
+        }
+    }
+}
diff --git a/...orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsProteinAssociation.java b/...orthology-import/src/main/java/uk/ac/ebi/intact/ortholog/OrthologsProteinAssociation.java
@@ -0,0 +1,60 @@
+package uk.ac.ebi.intact.ortholog;
+
+import lombok.extern.log4j.Log4j;
+import uk.ac.ebi.intact.jami.dao.IntactDao;
+import uk.ac.ebi.intact.jami.model.extension.IntactProtein;
+import javax.annotation.Resource;
+import javax.persistence.Query;
+import java.io.*;
+import java.nio.file.Path;
+import java.util.*;
+
+/**
+ * Database lookups of IntAct proteins and retrieval of the Panther identifiers
+ * stored on disk for a protein's UniProtKB accession.
+ */
+@Log4j
+public class OrthologsProteinAssociation {
+
+    @Resource(name="intactDao")
+    private final IntactDao intactDao;
+
+    public OrthologsProteinAssociation(IntactDao intactDao) {
+        this.intactDao = intactDao;
+    }
+
+    /**
+     * Runs a native query returning the numeric part of the accession (the accession with
+     * {@code EBI-} removed) of every interactor of category {@code protein}, in ascending order.
+     *
+     * @return the result list of the query
+     */
+    public List<Integer> getProteinAcs() {
+        Query acQuery = intactDao.getEntityManager().createNativeQuery(
+                "select CAST(REPLACE(ac,'EBI-','') as integer) as numberAC from intact.ia_interactor p where category = 'protein' order by numberAC asc");
+        return acQuery.getResultList();
+    }
+
+    /**
+     * Loads the proteins whose numeric accession lies between the two bounds.
+     *
+     * @param startAc lower bound of the numeric accession range
+     * @param endAc   upper bound of the numeric accession range
+     * @return the result list of the query
+     */
+    public List<IntactProtein> fetchProteins(Integer startAc, Integer endAc) {
+        Query rangeQuery = intactDao.getEntityManager().createQuery(
+                "select p FROM IntactProtein p where CAST(REPLACE(ac,'EBI-','') as integer) BETWEEN :startAc and :endAc");
+        rangeQuery.setParameter("startAc", startAc);
+        rangeQuery.setParameter("endAc", endAc);
+        return rangeQuery.getResultList();
+    }
+
+    /**
+     * Reads the file named after the protein's UniProtKB accession in {@code dirPath} and
+     * collects the second column of every two-column line whose first column equals that accession.
+     *
+     * @param dirPath directory holding one file per UniProtKB accession
+     * @param protein protein whose Panther identifiers are looked up
+     * @return the identifiers found; empty when the protein has no UniProtKB accession
+     *         or no file exists for it
+     * @throws IOException if the file cannot be read
+     */
+    public static Collection<String> associateOneProteinToPantherIds(String dirPath, IntactProtein protein) throws IOException {
+        List<String> matches = new ArrayList<>();
+
+        String uniprotId = protein.getUniprotkb();
+        if (uniprotId == null) {
+            return matches;
+        }
+
+        File proteinFile = Path.of(dirPath).resolve(uniprotId).toFile();
+        if (!proteinFile.exists()) {
+            return matches;
+        }
+
+        try (BufferedReader in = new BufferedReader(new FileReader(proteinFile))) {
+            for (String row = in.readLine(); row != null; row = in.readLine()) {
+                String[] columns = row.split(",");
+                if (columns.length == 2 && columns[0].equals(uniprotId)) {
+                    matches.add(columns[1]);
+                }
+            }
+        }
+        return matches;
+    }
+}