Skip to content

Commit

Permalink
lib: update gene builder (Ensembl/RefSeq) according to last changes, …
Browse files Browse the repository at this point in the history
…#TASK-5576, #TASK-5564
  • Loading branch information
jtarraga committed May 17, 2024
1 parent a8a047c commit 100d6f3
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,8 @@ private CellBaseBuilder buildGenomeSequence() throws CellBaseException {
}

private CellBaseBuilder buildGene() throws CellBaseException {
return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing);
return new GeneBuilder(downloadFolder.resolve(GENE_DATA), buildFolder.resolve(GENE_DATA), speciesConfiguration, flexibleGTFParsing,
configuration);
}

private CellBaseBuilder buildCadd() throws CellBaseException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ public abstract class CellBaseBuilder {
public static final String CHECKING_DONE_BEFORE_BUILDING_LOG_MESSAGE = "Checking {} done!";

public static final String BUILDING_LOG_MESSAGE = "Building {} ...";
public static final String BUILDING_DONE_LOG_MESSAGE = "Building done!";
public static final String BUILDING_DONE_LOG_MESSAGE = "Building done.";

public static final String CATEGORY_BUILDING_LOG_MESSAGE = "Building {}/{} ...";
public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done!";
public static final String CATEGORY_BUILDING_DONE_LOG_MESSAGE = "Building done.";

public static final String PARSING_LOG_MESSAGE = "Parsing {} ...";
public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done!";
public static final String PARSING_DONE_LOG_MESSAGE = "Parsing done.";

public CellBaseBuilder(CellBaseSerializer serializer) {
logger = LoggerFactory.getLogger(this.getClass());
Expand All @@ -82,8 +82,39 @@ public void disconnect() {
}
}

/**
 * Checks that the file registered under {@code fileId} in the given URL properties is present in {@code targetPath}.
 * <p>
 * The expected file name is the last path segment of the configured entry, with {@code MANUAL_PREFIX} removed. If that
 * exact file is not found and the name contains the species placeholder mark, the placeholder prefix is stripped and the
 * target folder is scanned for a file whose name ends with the remaining suffix.
 *
 * @param props      URL properties holding the file entries of the configuration
 * @param fileId     ID of the file entry in the configuration
 * @param targetPath Folder where the file is expected to be
 * @param name       Human-readable name of the file, used in log and error messages
 * @return The file found in {@code targetPath}
 * @throws CellBaseException if the file ID is not configured or the file cannot be found in {@code targetPath}
 */
protected File checkFile(DownloadProperties.URLProperties props, String fileId, Path targetPath, String name) throws CellBaseException {
    logger.info("Checking file {} (file ID {} in config.) ...", name, fileId);

    // Fail with a meaningful error instead of a NullPointerException in Paths.get() when the file ID is not configured
    if (props.getFiles().get(fileId) == null) {
        throw new CellBaseException("File ID " + fileId + " (" + name + ") does not exist in the configuration file");
    }

    String filename = Paths.get(props.getFiles().get(fileId)).getFileName().toString();
    if (filename.contains(MANUAL_PREFIX)) {
        filename = filename.replace(MANUAL_PREFIX, "");
    }

    Path filePath = targetPath.resolve(filename);
    if (!Files.exists(filePath)) {
        if (filename.contains(PUT_CAPITAL_SPECIES_HERE_MARK)) {
            // The configured name still contains placeholders: remove the placeholder prefix and search the target
            // folder for a file whose name ends with the remaining suffix
            filename = filename.replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK + "." + PUT_RELEASE_HERE_MARK, "")
                    .replace(PUT_CAPITAL_SPECIES_HERE_MARK + "." + PUT_ASSEMBLY_HERE_MARK, "");
            boolean found = false;
            // listFiles() returns null when the folder does not exist or cannot be read; handle it as "not found"
            File[] candidates = targetPath.toFile().listFiles();
            if (candidates != null) {
                for (File file : candidates) {
                    if (file.getName().endsWith(filename)) {
                        filePath = file.toPath();
                        found = true;
                    }
                }
            }
            if (!found) {
                throw new CellBaseException("Expected " + name + " file (configuration file ID = " + fileId + ") does not exist at "
                        + targetPath);
            }
        } else {
            throw new CellBaseException("Expected " + name + " file: " + filename + " does not exist at " + targetPath);
        }
    }
    logger.info("Ok.");
    return filePath.toFile();
}

protected File checkFile(String data, DownloadProperties.URLProperties props, String fileId, Path targetPath) throws CellBaseException {
logger.info("Checking file {}/{} ...", getDataName(data), fileId);
logger.info("Checking file {} (file ID {} in config.) ...", getDataName(data), fileId);
if (!props.getFiles().containsKey(fileId)) {
throw new CellBaseException("File ID " + fileId + " does not exist in the configuration file in the section '" + data + "'");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.opencb.biodata.tools.sequence.FastaIndex;
import org.opencb.cellbase.core.ParamConstants;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.config.DownloadProperties;
import org.opencb.cellbase.core.config.SpeciesConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataSource;
Expand Down Expand Up @@ -92,12 +93,13 @@ public class EnsemblGeneBuilder extends CellBaseBuilder {
private Gtf nextGtfToReturn;

public EnsemblGeneBuilder(Path downloadPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing,
CellBaseSerializer serializer) {
CellBaseConfiguration configuration, CellBaseSerializer serializer) {
super(serializer);

this.downloadPath = downloadPath;
this.speciesConfiguration = speciesConfiguration;
this.flexibleGTFParsing = flexibleGTFParsing;
this.configuration = configuration;

transcriptDict = new HashMap<>(250000);
exonDict = new HashMap<>(8000000);
Expand All @@ -122,33 +124,34 @@ public void check() throws Exception {
}

// Check Ensembl files
List<File> files = checkFiles(ensemblGeneLabel, ENSEMBL_DATA, downloadPath, 3);
gtfFile = files.stream().filter(f -> f.getName().contains(".gtf")).findFirst().get().toPath();
proteinFastaFile = files.stream().filter(f -> f.getName().contains(".pep.all.fa")).findFirst().get().toPath();
cDnaFastaFile = files.stream().filter(f -> f.getName().contains(".cdna.all.fa")).findFirst().get().toPath();

// Check common files
// geneDescriptionFile =
// xrefsFile =
DownloadProperties.URLProperties props = configuration.getDownload().getEnsembl().getUrl();
gtfFile = checkFile(props, ENSEMBL_GTF_FILE_ID, downloadPath, "Ensembl GTF").toPath();
proteinFastaFile = checkFile(props, ENSEMBL_PEP_FA_FILE_ID, downloadPath, "Ensembl Protein Fasta").toPath();
cDnaFastaFile = checkFile(props, ENSEMBL_CDNA_FA_FILE_ID, downloadPath, "Ensembl CDNA Fasta").toPath();

// Commons
geneDescriptionFile = checkFile(props, ENSEMBL_DESCRIPTION_FILE_ID, downloadPath.getParent(), "Ensembl Description").toPath();
xrefsFile = checkFile(props, ENSEMBL_XREFS_FILE_ID, downloadPath.getParent(), "Ensembl Xrefs").toPath();
ensemblCanonicalFile = checkFile(props, ENSEMBL_CANONICAL_FILE_ID, downloadPath.getParent(), "Ensembl Canonical").toPath();
tso500File = checkFile(props, ENSEMBL_TSO500_FILE_ID, downloadPath.getParent(), "Ensembl TSO 500").toPath();
eglhHaemOncFile = checkFile(props, ENSEMBL_HAEM_ONC_TRANSCRIPTS_FILE_ID, downloadPath.getParent(), "EGLH Haem Onc").toPath();
maneFile = checkFiles(MANE_SELECT_DATA, downloadPath.getParent(), 1).get(0).toPath();
lrgFile = checkFiles(LRG_DATA, downloadPath.getParent(), 1).get(0).toPath();
hgncFile = checkFiles(HGNC_DATA, downloadPath.getParent(), 1).get(0).toPath();
cancerHostpotFile = checkFiles(CANCER_HOTSPOT_DATA, downloadPath.getParent(), 1).get(0).toPath();
geneDrugFile = checkFiles(DGIDB_DATA, downloadPath.getParent(), 1).get(0).toPath();
uniprotIdMappingFile = checkFiles(UNIPROT_XREF_DATA, downloadPath.getParent(), 1).get(0).toPath();
geneExpressionFile = checkFiles(GENE_EXPRESSION_ATLAS_DATA, downloadPath.getParent(), 1).get(0).toPath();
// hpoFile = checkFiles(HPO_DATA, downloadPath.getParent(), 1);
hpoFile = checkFiles(HPO_DISEASE_DATA, downloadPath.getParent(), 1).get(0).toPath();
disgenetFile = checkFiles(DISGENET_DATA, downloadPath.getParent(), 1).get(0).toPath();
gnomadFile = checkFiles(GNOMAD_CONSTRAINTS_DATA, downloadPath.getParent(), 1).get(0).toPath();
geneOntologyAnnotationFile = checkFiles(GO_ANNOTATION_DATA, downloadPath.getParent(), 1).get(0).toPath();
// ensemblCanonicalFile = ;
// cancerGeneCensus =
// tso500File =
// eglhHaemOncFile =
cancerGeneCensusFile = checkFiles(CANCER_GENE_CENSUS_DATA, downloadPath.getParent(), 1).get(0).toPath();

// Check regulation files
// Motif features
files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA), 2);
List<File> files = checkFiles(ensemblGeneLabel, MOTIF_FEATURES_DATA, downloadPath.getParent().getParent().resolve(REGULATION_DATA),
2);
if (files.get(0).getName().endsWith("tbi")) {
tabixFile = files.get(0).toPath();
tfbsFile = files.get(1).toPath();
Expand Down Expand Up @@ -177,7 +180,9 @@ public void check() throws Exception {
}
miRTarBaseFile = downloadRegulationPath.resolve(mirTarBaseFiles.get(0).replace(XLSX_EXTENSION, CSV_EXTENSION));
if (!Files.exists(miRTarBaseFile)) {
throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist");
throw new CellBaseException("The " + getDataName(MIRTARBASE_DATA) + " fixed file " + miRTarBaseFile + " does not exist. You"
+ " have to export the file " + mirTarBaseFiles.get(0) + " to " + miRTarBaseFile.getFileName() + " format separated by"
+ " tabs and then execute the script cellbase-app/app/scripts/mirtarbase/fix-gene-symbols.sh");
}

// Check genome fasta file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path
Path disgenetFile, Path gnomadFile, Path geneOntologyAnnotationFile, Path miRBaseFile, Path miRTarBaseFile,
Path cancerGeneGensusFile, Path cancerHostpotFile, Path canonicalFile, Path tso500File, Path eglhHaemOncFile)
throws IOException, RocksDBException, FileFormatException, CellBaseException {
// indexDescriptions(geneDescriptionFile);
// indexXrefs(xrefsFile, uniprotIdMappingFile);
indexDescriptions(geneDescriptionFile);
indexXrefs(xrefsFile, uniprotIdMappingFile);
indexHgncIdMapping(hgncFile);
indexManeMapping(maneFile, ENSEMBL_DATA);
indexLrgMapping(lrgFile, ENSEMBL_DATA);
Expand All @@ -88,15 +88,15 @@ public void index(Path geneDescriptionFile, Path xrefsFile, Path hgncFile, Path
indexOntologyAnnotations(geneOntologyAnnotationFile);
indexMiRBase(species, miRBaseFile);
indexMiRTarBase(miRTarBaseFile);
// indexCancerGeneCensus(cancerGeneGensusFile);
indexCancerGeneCensus(cancerGeneGensusFile);
indexCancerHotspot(cancerHostpotFile);
// indexCanonical(canonicalFile);
// indexTSO500(tso500File);
// indexEGLHHaemOnc(eglhHaemOncFile);
indexCanonical(canonicalFile);
indexTSO500(tso500File);
indexEGLHHaemOnc(eglhHaemOncFile);
}

private void indexDescriptions(Path geneDescriptionFile) throws IOException, RocksDBException {
logger.info("Loading gene description data...");
logger.info(PARSING_LOG_MESSAGE, geneDescriptionFile);
String[] fields;
if (geneDescriptionFile != null && Files.exists(geneDescriptionFile) && Files.size(geneDescriptionFile) > 0) {
List<String> lines = Files.readAllLines(geneDescriptionFile, StandardCharsets.ISO_8859_1);
Expand All @@ -108,6 +108,7 @@ private void indexDescriptions(Path geneDescriptionFile) throws IOException, Roc
logger.warn("Gene description file " + geneDescriptionFile + " not found");
logger.warn("Gene description data not loaded");
}
logger.info(PARSING_DONE_LOG_MESSAGE);
}

public String getDescription(String id) throws RocksDBException {
Expand All @@ -120,7 +121,7 @@ public String getDescription(String id) throws RocksDBException {
}

private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOException, RocksDBException {
logger.info("Loading xref data...");
logger.info(PARSING_LOG_MESSAGE, xrefsFile);
String[] fields;
if (xrefsFile != null && Files.exists(xrefsFile) && Files.size(xrefsFile) > 0) {
List<String> lines = Files.readAllLines(xrefsFile, StandardCharsets.ISO_8859_1);
Expand Down Expand Up @@ -182,6 +183,7 @@ private void indexXrefs(Path xrefsFile, Path uniprotIdMappingFile) throws IOExce
logger.warn("Uniprot if mapping file " + uniprotIdMappingFile + " not found");
logger.warn("Protein mapping into xref data not loaded");
}
logger.info(PARSING_DONE_LOG_MESSAGE);
}

public List<Xref> getXrefs(String id) throws RocksDBException, IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package org.opencb.cellbase.lib.builders;

import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.config.SpeciesConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
Expand All @@ -29,20 +30,22 @@ public class GeneBuilder extends CellBaseBuilder {
private EnsemblGeneBuilder ensemblGeneBuilder;
private RefSeqGeneBuilder refSeqGeneBuilder;

public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing)
public GeneBuilder(Path downloadPath, Path buildPath, SpeciesConfiguration speciesConfiguration, boolean flexibleGTFParsing,
CellBaseConfiguration configuration)
throws CellBaseException {
super(null);

// Create Ensembl gene builder
CellBaseJsonFileSerializer ensemblGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(ENSEMBL_DATA),
ENSEMBL_GENE_BASENAME);
this.ensemblGeneBuilder = new EnsemblGeneBuilder(downloadPath.resolve(ENSEMBL_DATA), speciesConfiguration, flexibleGTFParsing,
ensemblGeneSerializer);
configuration, ensemblGeneSerializer);

// Create RefSeq gene builder
CellBaseJsonFileSerializer refSeqGeneSerializer = new CellBaseJsonFileSerializer(buildPath.resolve(REFSEQ_DATA),
REFSEQ_GENE_BASENAME);
this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, refSeqGeneSerializer);
this.refSeqGeneBuilder = new RefSeqGeneBuilder(downloadPath.resolve(REFSEQ_DATA), speciesConfiguration, configuration,
refSeqGeneSerializer);
}

public void check() throws Exception {
Expand All @@ -60,7 +63,7 @@ public void parse() throws Exception {
// Check folders and files before building
check();

// Build Ensembl/RefSeq genes
// // Build Ensembl/RefSeq genes
ensemblGeneBuilder.parse();
refSeqGeneBuilder.parse();

Expand Down
Loading

0 comments on commit 100d6f3

Please sign in to comment.