Skip to content

Commit

Permalink
lib: update regulation download manager, and the configuration file, …
Browse files Browse the repository at this point in the history
…to take into account the version for miRBase and miRTarBase, #TASK-5775, #TASK-5564
  • Loading branch information
jtarraga committed Mar 28, 2024
1 parent f308f25 commit 2e6e895
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ public class DownloadProperties {
private URLProperties geneUniprotXref;
private URLProperties geneExpressionAtlas;
private URLProperties mirbase;
private URLProperties mirbaseReadme;
private URLProperties targetScan;
private URLProperties miRTarBase;
private URLProperties uniprot;
Expand Down Expand Up @@ -134,15 +133,6 @@ public DownloadProperties setMirbase(URLProperties mirbase) {
return this;
}

/**
 * Returns the URL properties held for the miRBase README download source.
 *
 * @return the current {@code mirbaseReadme} value, or {@code null} if it was never assigned
 */
public URLProperties getMirbaseReadme() {
    return this.mirbaseReadme;
}

/**
 * Stores the URL properties for the miRBase README download source.
 *
 * @param mirbaseReadme the value to store in the {@code mirbaseReadme} field
 * @return this instance, so that setter calls can be chained
 */
public DownloadProperties setMirbaseReadme(URLProperties mirbaseReadme) {
this.mirbaseReadme = mirbaseReadme;
return this;
}

/**
 * Returns the URL properties held for the TargetScan download source.
 *
 * @return the current {@code targetScan} value, or {@code null} if it was never assigned
 */
public URLProperties getTargetScan() {
    return this.targetScan;
}
Expand Down
5 changes: 2 additions & 3 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,8 @@ download:
goAnnotation:
host: http://geneontology.org/gene-associations/goa_human.gaf.gz
mirbase:
host: ftp://mirbase.org/pub/mirbase/CURRENT/miRNA.xls.gz
mirbaseReadme:
host: ftp://mirbase.org/pub/mirbase/CURRENT/README
host: https://www.mirbase.org/download/miRNA.dat
version: "22.1"
targetScan:
host: http://hgdownload.cse.ucsc.edu/goldenPath/
miRTarBase:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
*/
public class EtlCommons {

public static final String ENSEMBL_NAME = "ENSEMBL";

public static final String HOMO_SAPIENS_NAME= "Homo sapiens";

public static final String SUFFIX_VERSION_FILENAME = "Version.json";
Expand Down Expand Up @@ -184,6 +186,12 @@ public class EtlCommons {
public static final String REGULATORY_FEATURES_FILE = "Regulatory_Build.regulatory_features.gff.gz";
public static final String MOTIF_FEATURES_FILE = "motif_features.gff.gz";

public static final String MIRBASE_NAME = "miRBase";
public static final String MIRBASE_VERSION_FILENAME = MIRBASE_NAME + SUFFIX_VERSION_FILENAME;

public static final String MIRTARBASE_NAME = "miRTarBase";
public static final String MIRTARBASE_VERSION_FILENAME = MIRTARBASE_NAME + SUFFIX_VERSION_FILENAME;

public static boolean runCommandLineProcess(File workingDirectory, String binPath, List<String> args, String logFilePath)
throws IOException, InterruptedException {
// This small hack allow to configure the appropriate Logger level from the command line, this is done
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,19 @@
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.opencb.cellbase.lib.EtlCommons.*;


public class RegulationDownloadManager extends AbstractDownloadManager {

private Path regulationFolder;

private static final String ENSEMBL_NAME = "ENSEMBL";
private static final String MIRBASE_NAME = "miRBase";
private static final String MIRTARBASE_NAME = "miRTarBase";

public RegulationDownloadManager(String species, String assembly, Path outdir, CellBaseConfiguration configuration)
throws IOException, CellBaseException {
super(species, assembly, outdir, configuration);
Expand All @@ -54,7 +53,7 @@ public RegulationDownloadManager(String species, String assembly, Path outdir, C
@Override
public List<DownloadFile> download() throws IOException, InterruptedException, NoSuchMethodException, FileFormatException {
if (!speciesHasInfoToDownload(speciesConfiguration, "regulation")) {
return null;
return Collections.emptyList();
}
this.regulationFolder = downloadFolder.resolve("regulation");
Files.createDirectories(regulationFolder);
Expand Down Expand Up @@ -108,22 +107,24 @@ private void loadPfmMatrices() throws IOException, NoSuchMethodException, FileFo
logger.info("regulatory_pfm.json.gz is already built");
return;
}
Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE);
Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile);
Gff2 tfbsMotifFeature;
Set<String> motifIds = new HashSet<>();
Pattern filePattern = Pattern.compile("ENSPFM(\\d+)");
while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) {
String pfmId = getMatrixId(filePattern, tfbsMotifFeature);
if (StringUtils.isNotEmpty(pfmId)) {
motifIds.add(pfmId);
Path motifGffFile = regulationFolder.resolve(EtlCommons.MOTIF_FEATURES_FILE);
try (Gff2Reader motifsFeatureReader = new Gff2Reader(motifGffFile)) {
Gff2 tfbsMotifFeature;
Pattern filePattern = Pattern.compile("ENSPFM(\\d+)");
while ((tfbsMotifFeature = motifsFeatureReader.read()) != null) {
String pfmId = getMatrixId(filePattern, tfbsMotifFeature);
if (StringUtils.isNotEmpty(pfmId)) {
motifIds.add(pfmId);
}
}
}
motifsFeatureReader.close();

ObjectMapper mapper = new ObjectMapper();
CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "regulatory_pfm", true);
logger.info("Looking up " + motifIds.size() + " pfms");
if (logger.isInfoEnabled()) {
logger.info("Looking up {} pfms", motifIds.size());
}
for (String pfmId : motifIds) {
String urlString = "https://rest.ensembl.org/species/homo_sapiens/binding_matrix/" + pfmId
+ "?unit=frequencies;content-type=application/json";
Expand All @@ -145,22 +146,24 @@ private String getMatrixId(Pattern pattern, Gff2 tfbsMotifFeature) {
}

/**
 * Downloads the miRBase data file and records its version metadata.
 *
 * <p>Both the download URL and the version string are read from the {@code mirbase} entry of the download
 * configuration. The version metadata is passed to {@code saveVersionData} with {@code MIRBASE_VERSION_FILENAME}
 * (resolved inside the regulation folder) as its target, and the data file is stored in the same folder under
 * the file name found in the URL path.
 *
 * @return the value returned by {@code downloadFile} for the miRBase file
 * @throws IOException if the configured URL is malformed, no file name can be derived from it, or a callee
 *                     raises it
 * @throws InterruptedException if raised by {@code downloadFile}
 */
private DownloadFile downloadMirna() throws IOException, InterruptedException {
    logger.info("Downloading {} ...", MIRBASE_NAME);
    String url = configuration.getDownload().getMirbase().getHost();

    saveVersionData(EtlCommons.REGULATION_DATA, MIRBASE_NAME, configuration.getDownload().getMirbase().getVersion(), getTimeStamp(),
            Collections.singletonList(url), regulationFolder.resolve(MIRBASE_VERSION_FILENAME));

    // Derive the local file name from the URL's path component only: parsing the whole URL as a file-system
    // path is platform-dependent (e.g. the ':' after the scheme is rejected on Windows).
    Path fileName = Paths.get(new URL(url).getPath()).getFileName();
    if (fileName == null || fileName.toString().isEmpty()) {
        throw new IOException("Cannot derive a file name from download URL: " + url);
    }
    Path outputPath = regulationFolder.resolve(fileName);
    logger.info("Downloading from {} to {} ...", url, outputPath);
    return downloadFile(url, outputPath.toString());
}

/**
 * Downloads the miRTarBase data file and records its version metadata.
 *
 * <p>Both the download URL and the version string are read from the {@code miRTarBase} entry of the download
 * configuration. The version metadata is passed to {@code saveVersionData} with
 * {@code MIRTARBASE_VERSION_FILENAME} (resolved inside the regulation folder) as its target, and the data file
 * is stored in the same folder under the file name found in the URL path.
 *
 * @return the value returned by {@code downloadFile} for the miRTarBase file
 * @throws IOException if the configured URL is malformed, no file name can be derived from it, or a callee
 *                     raises it
 * @throws InterruptedException if raised by {@code downloadFile}
 */
private DownloadFile downloadMiRTarBase() throws IOException, InterruptedException {
    logger.info("Downloading {} ...", MIRTARBASE_NAME);
    String url = configuration.getDownload().getMiRTarBase().getHost();

    saveVersionData(EtlCommons.REGULATION_DATA, MIRTARBASE_NAME, configuration.getDownload().getMiRTarBase().getVersion(),
            getTimeStamp(), Collections.singletonList(url), regulationFolder.resolve(MIRTARBASE_VERSION_FILENAME));

    // Derive the local file name from the URL's path component only: parsing the whole URL as a file-system
    // path is platform-dependent (e.g. the ':' after the scheme is rejected on Windows).
    Path fileName = Paths.get(new URL(url).getPath()).getFileName();
    if (fileName == null || fileName.toString().isEmpty()) {
        throw new IOException("Cannot derive a file name from download URL: " + url);
    }
    Path outputPath = regulationFolder.resolve(fileName);
    logger.info("Downloading from {} to {} ...", url, outputPath);
    return downloadFile(url, outputPath.toString());
}
}

0 comments on commit 2e6e895

Please sign in to comment.