Skip to content

Commit

Permalink
lib: update Ensembl/RefSeq gene builder to gunzip FASTA files before b…
Browse files Browse the repository at this point in the history
…uilding, #TASK-5576, #TASK-5564
  • Loading branch information
jtarraga committed May 27, 2024
1 parent 100d6f3 commit 0cd4b80
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ private Path getFastaReferenceGenome() throws CellBaseException {
throw new CellBaseException("Error executing gunzip in FASTA file " + fastaPath, e);
}
}
fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(".gz", ""));
fastaPath = downloadFolder.resolve(GENOME_DATA).resolve(fastaFilename.replace(GZ_EXTENSION, ""));
if (!fastaPath.toFile().exists()) {
throw new CellBaseException("FASTA file " + fastaPath + " does not exist after executing gunzip");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public final class EtlCommons {
public static final String CSV_EXTENSION = ".csv";
public static final String TBI_EXTENSION = ".tbi";
public static final String FAI_EXTENSION = ".fai";
public static final String GZ_EXTENSION = ".gz";

public static final String OK_LOG_MESSAGE = "Ok.";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataSource;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.rocksdb.RocksDBException;

import java.io.File;
Expand Down Expand Up @@ -185,8 +186,30 @@ public void check() throws Exception {
+ " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh");
}

// Check genome fasta file
genomeSequenceFilePath = checkFiles(GENOME_DATA, downloadPath.getParent().getParent().resolve(GENOME_DATA), 1).get(0).toPath();
// Check genome FASTA file
Path genomeDownloadPath = downloadPath.getParent().getParent().resolve(GENOME_DATA);
String genomeGzFilename = Paths.get(((DataSource) dataSourceReader.readValue(genomeDownloadPath
.resolve(getDataVersionFilename(GENOME_DATA)).toFile())).getUrls().get(0)).getFileName().toString();
genomeSequenceFilePath = genomeDownloadPath.resolve(genomeGzFilename);
if (Files.exists(genomeSequenceFilePath)) {
// Need to be gunzip-ed
logger.info("Gunzip file: {}", genomeSequenceFilePath);
try {
EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(genomeSequenceFilePath.toString()), null);
} catch (IOException e) {
throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e);
} catch (InterruptedException e) {
// Restore interrupted state...
Thread.currentThread().interrupt();
throw new CellBaseException("Error executing gunzip in FASTA file " + genomeSequenceFilePath, e);
}
}
String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, "");
genomeSequenceFilePath = genomeDownloadPath.resolve(genomeFilename);
if (!Files.exists(genomeSequenceFilePath)) {
throw new CellBaseException("Genome FASTA file " + genomeSequenceFilePath.getFileName() + " does not exist at "
+ genomeSequenceFilePath.getParent());
}

logger.info(CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE, ensemblGeneLabel);
checked = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataSource;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.rocksdb.RocksDBException;

import java.io.IOException;
Expand Down Expand Up @@ -106,7 +107,28 @@ public void check() throws Exception {
gtfFile = checkFile(props, REFSEQ_GENOMIC_GTF_FILE_ID, downloadPath, "RefSeq GTF").toPath();
proteinFastaFile = checkFile(props, REFSEQ_PROTEIN_FAA_FILE_ID, downloadPath, "RefSeq Protein FAA").toPath();
cdnaFastaFile = checkFile(props, REFSEQ_RNA_FNA_FILE_ID, downloadPath, "RefSeq RNA FNA").toPath();
fastaFile = checkFile(props, REFSEQ_GENOMIC_FNA_FILE_ID, downloadPath, "RefSeq Genomic FNA").toPath();

// Check genome FASTA file
String genomeGzFilename = Paths.get(props.getFiles().get(REFSEQ_GENOMIC_FNA_FILE_ID)).getFileName().toString();
fastaFile = downloadPath.resolve(genomeGzFilename);
if (Files.exists(fastaFile)) {
// Need to be gunzip-ed
logger.info("Gunzip file: {}", fastaFile);
try {
EtlCommons.runCommandLineProcess(null, "gunzip", Collections.singletonList(fastaFile.toString()), null);
} catch (IOException e) {
throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e);
} catch (InterruptedException e) {
// Restore interrupted state...
Thread.currentThread().interrupt();
throw new CellBaseException("Error executing gunzip in FASTA file " + fastaFile, e);
}
}
String genomeFilename = genomeGzFilename.replace(GZ_EXTENSION, "");
fastaFile = downloadPath.resolve(genomeFilename);
if (!Files.exists(fastaFile)) {
throw new CellBaseException("Genome FASTA file " + fastaFile.getFileName() + " does not exist at " + fastaFile.getParent());
}

// Check common files
props = configuration.getDownload().getEnsembl().getUrl();
Expand Down

0 comments on commit 0cd4b80

Please sign in to comment.