diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 32a7b269b..32da2e778 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -37,7 +37,6 @@ public class DownloadProperties { private URLProperties geneUniprotXref; private URLProperties geneExpressionAtlas; private URLProperties mirbase; - private URLProperties mirbaseReadme; private URLProperties targetScan; private URLProperties miRTarBase; private URLProperties uniprot; @@ -134,15 +133,6 @@ public DownloadProperties setMirbase(URLProperties mirbase) { return this; } - public URLProperties getMirbaseReadme() { - return mirbaseReadme; - } - - public DownloadProperties setMirbaseReadme(URLProperties mirbaseReadme) { - this.mirbaseReadme = mirbaseReadme; - return this; - } - public URLProperties getTargetScan() { return targetScan; } diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 58cebca84..88b2ac039 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -92,9 +92,8 @@ download: goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz mirbase: - host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz - mirbaseReadme: - host: ftp://mirbase.org/pub/mirbase/CURRENT/README + host: https://www.mirbase.org/download/miRNA.dat + version: "22.1" targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 46d64ee65..896322370 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -35,6 +35,8 @@ */ public class EtlCommons { + public static final String ENSEMBL_NAME = "ENSEMBL"; + public static final String HOMO_SAPIENS_NAME = "Homo sapiens"; public static final String SUFFIX_VERSION_FILENAME = "Version.json"; @@ -184,6 +186,12 @@ public class EtlCommons { public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz"; public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz"; + public static final String MIRBASE_NAME = "miRBase"; + public static final String MIRBASE_VERSION_FILENAME = MIRBASE_NAME + SUFFIX_VERSION_FILENAME; + + public static final String MIRTARBASE_NAME = "miRTarBase"; + public static final String MIRTARBASE_VERSION_FILENAME = MIRTARBASE_NAME + SUFFIX_VERSION_FILENAME; + public static boolean runCommandLineProcess(File workingDirectory, String binPath, List args, String logFilePath) throws IOException, InterruptedException { // This small hack allow to configure the appropriate Logger level from the command line, this is done diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 51152e478..8b0cf01ab 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -32,20 +32,19 @@ import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.*; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import static org.opencb.cellbase.lib.EtlCommons.*; + public class RegulationDownloadManager extends AbstractDownloadManager { private Path regulationFolder; - private static final String ENSEMBL_NAME = "ENSEMBL"; - 
private static final String MIRBASE_NAME = "miRBase"; - private static final String MIRTARBASE_NAME = "miRTarBase"; - public RegulationDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration) throws IOException, CellBaseException { super(species, assembly, outdir, configuration); @@ -54,7 +53,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C @Override public List download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException { if (!speciesHasInfoToDownload(speciesConfiguration, "regulation")) { - return null; + return Collections.emptyList(); } this.regulationFolder = downloadFolder.resolve("regulation"); Files.createDirectories(regulationFolder); @@ -108,22 +107,24 @@ private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFo logger.info("regulatory_pfm.json.gz is already built"); return; } - Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); - Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile); - Gff2 tfbsMotifFeature; Set motifIds = new HashSet<>(); - Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); - while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { - String pfmId = getMatrixId(filePattern, tfbsMotifFeature); - if (StringUtils.isNotEmpty(pfmId)) { - motifIds.add(pfmId); + Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE); + try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) { + Gff2 tfbsMotifFeature; + Pattern filePattern = Pattern.compile("ENSPFM(\\d+)"); + while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) { + String pfmId = getMatrixId(filePattern, tfbsMotifFeature); + if (StringUtils.isNotEmpty(pfmId)) { + motifIds.add(pfmId); + } } } - motifsFeatureReader.close(); ObjectMapper mapper = new ObjectMapper(); CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true); - 
logger.info("Looking up " + motifIds.size() + " pfms"); + if (logger.isInfoEnabled()) { + logger.info("Looking up {} pfms", motifIds.size()); + } for (String pfmId : motifIds) { String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId + "?unit=frequencies;content-type=application/json"; @@ -145,22 +146,26 @@ private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) { } private DownloadFile downloadMirna() throws IOException, InterruptedException { + logger.info("Downloading {} ...", MIRBASE_NAME); String url = configuration.getDownload().getMirbase().getHost(); - String readmeUrl = configuration.getDownload().getMirbaseReadme().getHost(); - downloadFile(readmeUrl, regulationFolder.resolve("mirbaseReadme.txt").toString()); - saveVersionData(EtlCommons.REGULATION_DATA, MIRBASE_NAME, - getLine(regulationFolder.resolve("mirbaseReadme.txt"), 1), getTimeStamp(), - Collections.singletonList(url), regulationFolder.resolve("mirbaseVersion.json")); - Path outputPath = regulationFolder.resolve("miRNA.xls.gz"); - DownloadFile downloadFile = downloadFile(url, regulationFolder.resolve("miRNA.xls.gz").toString()); - EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null); - return downloadFile; + + saveVersionData(EtlCommons.REGULATION_DATA, MIRBASE_NAME, configuration.getDownload().getMirbase().getVersion(), getTimeStamp(), + Collections.singletonList(url), regulationFolder.resolve(MIRBASE_VERSION_FILENAME)); + /* Derive the file name from the URL path: Paths.get() on the raw URL string is rejected on Windows (':' is illegal). 
*/ + Path outputPath = regulationFolder.resolve(Paths.get(new URL(url).getPath()).getFileName()); + logger.info("Downloading from {} to {} ...", url, outputPath); + return downloadFile(url, outputPath.toString()); } private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException { + logger.info("Downloading {} ...", MIRTARBASE_NAME); String url = configuration.getDownload().getMiRTarBase().getHost(); - saveVersionData(EtlCommons.REGULATION_DATA, MIRTARBASE_NAME, null, getTimeStamp(), 
Collections.singletonList(url), - regulationFolder.resolve("miRTarBaseVersion.json")); - return downloadFile(url, regulationFolder.resolve("hsa_MTI.xlsx").toString()); + + saveVersionData(EtlCommons.REGULATION_DATA, MIRTARBASE_NAME, configuration.getDownload().getMiRTarBase().getVersion(), + getTimeStamp(), Collections.singletonList(url), regulationFolder.resolve(MIRTARBASE_VERSION_FILENAME)); + /* Derive the file name from the URL path: Paths.get() on the raw URL string is rejected on Windows (':' is illegal). */ + Path outputPath = regulationFolder.resolve(Paths.get(new URL(url).getPath()).getFileName()); + logger.info("Downloading from {} to {} ...", url, outputPath); + return downloadFile(url, outputPath.toString()); } }