Skip to content

Commit

Permalink
lib: improve repeat downloader by updating versions from config file,…
Browse files Browse the repository at this point in the history
… adding log messages, removing hard-coded filenames, fixing Sonar issues, ..., #TASK-5775, #TASK-5564
  • Loading branch information
jtarraga committed Mar 8, 2024
1 parent 4cdd046 commit 3cea3f3
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 69 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,9 @@ public void execute() {

private CellBaseBuilder buildRepeats() {
Path repeatsFilesDir = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER);
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILE)));
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILE)));
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILE)));
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.TRF_VERSION_FILENAME)));
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.GSD_VERSION_FILENAME)));
copyVersionFiles(Arrays.asList(repeatsFilesDir.resolve(EtlCommons.WM_VERSION_FILENAME)));
// TODO: chunk size is not really used in ConvervedRegionParser, remove?
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, EtlCommons.REPEATS_JSON);
return new RepeatsBuilder(repeatsFilesDir, serializer);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -484,9 +484,9 @@ private void loadRepeats() {

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
input.resolve(EtlCommons.TRF_VERSION_FILE),
input.resolve(EtlCommons.GSD_VERSION_FILE),
input.resolve(EtlCommons.WM_VERSION_FILE)
input.resolve(EtlCommons.TRF_VERSION_FILENAME),
input.resolve(EtlCommons.GSD_VERSION_FILENAME),
input.resolve(EtlCommons.WM_VERSION_FILENAME)
));
dataReleaseManager.update(dataRelease, "repeats", EtlCommons.REPEATS_DATA, sources);
} catch (ClassNotFoundException | NoSuchMethodException | InstantiationException | InvocationTargetException
Expand Down
9 changes: 6 additions & 3 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,14 @@ download:
dgv:
host: http://dgv.tcag.ca/v106/docs
simpleRepeats:
host: http://hgdownload.cse.ucsc.edu/goldenPath
## The CellBase downloader replaces put_assembly_here with the assembly name, e.g. hg38
host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/simpleRepeat.txt.gz
windowMasker:
host: http://hgdownload.cse.ucsc.edu/goldenPath
## The CellBase downloader replaces put_assembly_here with the assembly name, e.g. hg38
host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/windowmaskerSdust.txt.gz
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
## The CellBase downloader replaces put_assembly_here with the assembly name, e.g. hg38
host: http://hgdownload.cse.ucsc.edu/goldenPath/put_assembly_here/database/genomicSuperDups.txt.gz

## Variant Pathogenic Prediction
revel:
Expand Down
25 changes: 17 additions & 8 deletions cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
*/
public class EtlCommons {

public static final String HOMO_SAPIENS_NAME= "Homo sapiens";

public static final String SUFFIX_VERSION_FILENAME = "Version.json";

public static final String GENOME_DATA = "genome";
Expand Down Expand Up @@ -145,19 +147,26 @@ public class EtlCommons {
public static final String DGV_FILE = "dgv.txt";
public static final String DGV_VERSION_FILE = "dgvVersion.json";
public static final String STRUCTURAL_VARIANTS_JSON = "structuralVariants";

@Deprecated
public static final String TRF_FILE = "simpleRepeat.txt.gz";
public static final String TRF_VERSION_FILE = "simpleRepeat.json";
@Deprecated
public static final String GSD_FILE = "genomicSuperDups.txt.gz";
public static final String GSD_VERSION_FILE = "genomicSuperDups.json";
public static final String WM_FILE = "windowMasker.txt.gz";
public static final String WM_VERSION_FILE = "windowMasker.json";
@Deprecated
public static final String WM_FILE = "windowmaskerSdust.txt.gz";

public static final String TRF_VERSION_FILENAME = "simpleRepeat" + SUFFIX_VERSION_FILENAME;
public static final String GSD_VERSION_FILENAME = "genomicSuperDups" + SUFFIX_VERSION_FILENAME;
public static final String WM_VERSION_FILENAME = "windowMasker" + SUFFIX_VERSION_FILENAME;
public static final String REPEATS_FOLDER = "genome";
public static final String REPEATS_JSON = "repeats";

public static final String OBO_JSON = "ontology";
public static final String HPO_VERSION_FILE = "hpoVersion.json";
public static final String GO_VERSION_FILE = "goVersion.json";
public static final String DO_VERSION_FILE = "doVersion.json";
public static final String MONDO_VERSION_FILE = "mondoVersion.json";
public static final String HPO_VERSION_FILE = "hpo" + SUFFIX_VERSION_FILENAME;
public static final String GO_VERSION_FILE = "go" + SUFFIX_VERSION_FILENAME;
public static final String DO_VERSION_FILE = "do" + SUFFIX_VERSION_FILENAME;
public static final String MONDO_VERSION_FILE = "mondo" + SUFFIX_VERSION_FILENAME;

public static final String HGMD_FILE = "hgmd.vcf";
public static final String PUBMED_VERSION_FILENAME = "pubmedVersion.json";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class AbstractDownloadManager {

private static final String GNOMAD_NAME = "gnomAD";

protected static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ...";

protected String species;
protected String assembly;
protected Path outdir;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@

public class GeneDownloadManager extends AbstractDownloadManager {

private static final String HOMO_SAPIENS_NAME= "Homo sapiens";
private static final String ENSEMBL_NAME = "ENSEMBL";
private static final String REFSEQ_NAME = "RefSeq";
private static final String UNIPROT_NAME = "UniProt";
Expand All @@ -49,8 +48,6 @@ public class GeneDownloadManager extends AbstractDownloadManager {

private static final Map<String, String> GENE_UNIPROT_XREF_FILES;

private static final String DOWNLOADING_LOG_MESSAGE = "Downloading {} to {} ...";

static {
GENE_UNIPROT_XREF_FILES = new HashMap<>();
GENE_UNIPROT_XREF_FILES.put(HOMO_SAPIENS_NAME, "HUMAN_9606_idmapping_selected.tab.gz");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.commons.utils.DockerUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.GERP_SUBDIRECTORY;
import static org.opencb.cellbase.lib.EtlCommons.HOMO_SAPIENS_NAME;

public class GenomeDownloadManager extends AbstractDownloadManager {

Expand All @@ -38,6 +42,8 @@ public class GenomeDownloadManager extends AbstractDownloadManager {
private static final String GSD_NAME = "Genomic super duplications";
private static final String WM_NAME = "WindowMasker";

private static final String PUT_ASSEMBLY_HERE_MARK = "put_assembly_here";

public GenomeDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
throws IOException, CellBaseException {
super(species, assembly, targetDirectory, configuration);
Expand All @@ -50,8 +56,6 @@ public List<DownloadFile> download() throws IOException, InterruptedException {
downloadFiles.addAll(downloadConservation());
downloadFiles.addAll(downloadRepeats());

// cytobands
// runGenomeInfo();
return downloadFiles;
}

Expand Down Expand Up @@ -83,7 +87,7 @@ public List<DownloadFile> downloadReferenceGenome() throws IOException, Interrup
saveVersionData(EtlCommons.GENOME_DATA, ENSEMBL_NAME, ensemblVersion, getTimeStamp(),
Collections.singletonList(url), sequenceFolder.resolve("genomeVersion.json"));
List<DownloadFile> downloadFiles = Collections.singletonList(downloadFile(url, outputPath.toString()));
logger.info("Unzipping file: " + outputFileName);
logger.info("Unzipping file: {}", outputFileName);
EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(outputPath.toString()), null);
return downloadFiles;
}
Expand All @@ -96,14 +100,14 @@ public List<DownloadFile> downloadReferenceGenome() throws IOException, Interrup
*/
public List<DownloadFile> downloadConservation() throws IOException, InterruptedException {
if (!speciesHasInfoToDownload(speciesConfiguration, "conservation")) {
return null;
return Collections.emptyList();
}
logger.info("Downloading conservation information ...");
Path conservationFolder = downloadFolder.resolve("conservation");
List<DownloadFile> downloadFiles = new ArrayList<>();
if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
Files.createDirectories(conservationFolder);
Files.createDirectories(conservationFolder.resolve("gerp"));
Files.createDirectories(conservationFolder.resolve(GERP_SUBDIRECTORY));
Files.createDirectories(conservationFolder.resolve("phastCons"));
Files.createDirectories(conservationFolder.resolve("phylop"));

Expand All @@ -128,7 +132,7 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted
phyloPUrls.add(phyloPUrl);
}
String gerpUrl = configuration.getDownload().getGerp().getHost();
downloadFiles.add(downloadFile(gerpUrl, conservationFolder.resolve(EtlCommons.GERP_SUBDIRECTORY)
downloadFiles.add(downloadFile(gerpUrl, conservationFolder.resolve(GERP_SUBDIRECTORY)
.resolve(EtlCommons.GERP_FILE).toString()));

saveVersionData(EtlCommons.CONSERVATION_DATA, GERP_NAME, null, getTimeStamp(), Collections.singletonList(gerpUrl),
Expand Down Expand Up @@ -170,9 +174,9 @@ public List<DownloadFile> downloadConservation() throws IOException, Interrupted

public List<DownloadFile> downloadRepeats() throws IOException, InterruptedException {
if (!speciesHasInfoToDownload(speciesConfiguration, "repeats")) {
return null;
return Collections.emptyList();
}
if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
if (speciesConfiguration.getScientificName().equals(HOMO_SAPIENS_NAME)) {
logger.info("Downloading repeats data ...");
Path repeatsFolder = downloadFolder.resolve(EtlCommons.REPEATS_FOLDER);
Files.createDirectories(repeatsFolder);
Expand All @@ -187,51 +191,35 @@ public List<DownloadFile> downloadRepeats() throws IOException, InterruptedExcep
}

// Download tandem repeat finder
String url = configuration.getDownload().getSimpleRepeats().getHost() + "/" + pathParam
+ "/database/simpleRepeat.txt.gz";
downloadFiles.add(downloadFile(url, repeatsFolder.resolve(EtlCommons.TRF_FILE).toString()));
saveVersionData(EtlCommons.REPEATS_DATA, TRF_NAME, null, getTimeStamp(), Collections.singletonList(url),
repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILE));
String url = configuration.getDownload().getSimpleRepeats().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam);
saveVersionData(EtlCommons.REPEATS_DATA, TRF_NAME, configuration.getDownload().getSimpleRepeats().getVersion(), getTimeStamp(),
Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.TRF_VERSION_FILENAME));

Path outputPath = repeatsFolder.resolve(getUrlFilename(url));
logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath);
downloadFiles.add(downloadFile(url, outputPath.toString()));

// Download genomic super duplications
url = configuration.getDownload().getGenomicSuperDups().getHost() + "/" + pathParam
+ "/database/genomicSuperDups.txt.gz";
downloadFiles.add(downloadFile(url, repeatsFolder.resolve(EtlCommons.GSD_FILE).toString()));
saveVersionData(EtlCommons.REPEATS_DATA, GSD_NAME, null, getTimeStamp(), Collections.singletonList(url),
repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILE));
url = configuration.getDownload().getGenomicSuperDups().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam);
saveVersionData(EtlCommons.REPEATS_DATA, GSD_NAME, configuration.getDownload().getGenomicSuperDups().getVersion(),
getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.GSD_VERSION_FILENAME));

outputPath = repeatsFolder.resolve(getUrlFilename(url));
logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath);
downloadFiles.add(downloadFile(url, outputPath.toString()));

// Download WindowMasker
if (!pathParam.equalsIgnoreCase("hg19")) {
url = configuration.getDownload().getWindowMasker().getHost() + "/" + pathParam
+ "/database/windowmaskerSdust.txt.gz";
downloadFiles.add(downloadFile(url, repeatsFolder.resolve(EtlCommons.WM_FILE).toString()));
saveVersionData(EtlCommons.REPEATS_DATA, WM_NAME, null, getTimeStamp(), Collections.singletonList(url),
repeatsFolder.resolve(EtlCommons.WM_VERSION_FILE));
url = configuration.getDownload().getWindowMasker().getHost().replace(PUT_ASSEMBLY_HERE_MARK, pathParam);
saveVersionData(EtlCommons.REPEATS_DATA, WM_NAME, configuration.getDownload().getWindowMasker().getVersion(),
getTimeStamp(), Collections.singletonList(url), repeatsFolder.resolve(EtlCommons.WM_VERSION_FILENAME));

outputPath = repeatsFolder.resolve(getUrlFilename(url));
logger.info(DOWNLOADING_LOG_MESSAGE, url, outputPath);
downloadFiles.add(downloadFile(url, outputPath.toString()));
}
return downloadFiles;
}
return null;
}

public void runGenomeInfo() throws IOException, InterruptedException {
logger.info("Downloading genome info ...");

// TODO don't run this if file already exists

String outputFolder = downloadFolder.getParent().toAbsolutePath().toString() + "/generated_json/";

if ("true".equals(System.getenv("CELLBASE_BUILD_DOCKER"))) {
String outputLog = downloadLogFolder + "/genome_info.log";
EtlCommons.runCommandLineProcess(null, "/opt/cellbase/genome_info.pl",
Arrays.asList("--outdir", outputFolder),
outputLog);
} else {
String dockerImage = "opencb/cellbase-builder:" + configuration.getApiVersion();

AbstractMap.SimpleEntry<String, String> outputBinding = new AbstractMap.SimpleEntry(outputFolder, "/ensembl-data");
String ensemblScriptParams = "/opt/cellbase/genome_info.pl";

DockerUtils.run(dockerImage, null, outputBinding, ensemblScriptParams, null);
}
return Collections.emptyList();
}
}

0 comments on commit 3cea3f3

Please sign in to comment.