Skip to content

Commit

Permalink
lib: update CellBase builder for clinical variants, #TASK-5776, #TASK…
Browse files Browse the repository at this point in the history
…-5564
  • Loading branch information
jtarraga committed Apr 3, 2024
1 parent a6688d0 commit c2345d4
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@
import java.io.File;
import java.io.IOException;
import java.nio.file.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA;
import static org.opencb.cellbase.lib.EtlCommons.*;

/**
* Created by imedina on 03/02/15.
Expand Down Expand Up @@ -347,16 +348,26 @@ private CellBaseBuilder buildConservation() {
return new ConservationBuilder(conservationFilesDir, conservationChunkSize, serializer);
}

private CellBaseBuilder buildClinicalVariants() {
private CellBaseBuilder buildClinicalVariants() throws CellBaseException {
Path clinicalVariantFolder = downloadFolder.resolve(EtlCommons.CLINICAL_VARIANTS_FOLDER);
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("clinvarVersion.json")));
copyVersionFiles(Arrays.asList(clinicalVariantFolder.resolve("gwasVersion.json")));

List<Path> versionFiles = new ArrayList<>();
List<String> versionFilenames = Arrays.asList(CLINVAR_VERSION_FILENAME, COSMIC_VERSION_FILENAME, GWAS_VERSION_FILENAME,
HGMD_VERSION_FILENAME);
for (String versionFilename : versionFilenames) {
Path versionFile = clinicalVariantFolder.resolve(versionFilename);
if (!versionFile.toFile().exists()) {
throw new CellBaseException("Could not build clinical variants because of the file " + versionFilename + " does not exist");
}
versionFiles.add(versionFile);
}
copyVersionFiles(versionFiles);

CellBaseSerializer serializer = new CellBaseJsonFileSerializer(buildFolder,
EtlCommons.CLINICAL_VARIANTS_JSON_FILE.replace(".json.gz", ""), true);
return new ClinicalVariantBuilder(clinicalVariantFolder, normalize, getFastaReferenceGenome(),
buildCommandOptions.assembly == null ? getDefaultHumanAssembly() : buildCommandOptions.assembly,
serializer);
configuration, serializer);
}

private String getDefaultHumanAssembly() {
Expand Down
2 changes: 2 additions & 0 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ download:
## Download file from https://www.ebi.ac.uk/gwas/docs/file-downloads to find the real version, which is 'e110_r2023-12-20'
host: https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/02/12/gwas-catalog-associations_ontology-annotated.tsv
version: "2024-02-12"
files:
- All.vcf.gz
hpo:
## NOTE: Download manually from here now
host: https://hpo.jax.org/app/data/annotations
Expand Down
20 changes: 0 additions & 20 deletions cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,26 +87,6 @@ public class EtlCommons {
public static final String GWAS_NAME = "GWAS catalog";
public static final String GWAS_VERSION_FILENAME = "gwas" + SUFFIX_VERSION_FILENAME;

@Deprecated
public static final String CLINVAR_VERSION = "2022.11";
@Deprecated
public static final String CLINVAR_DATE = "2022-11";
@Deprecated
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
@Deprecated
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
@Deprecated
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
@Deprecated
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
public static final String IARCTP53_FILE = "IARC-TP53.zip";
@Deprecated
public static final String GWAS_FILE = "gwas-catalog-associations_ontology-annotated.tsv";
@Deprecated
public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz";
@Deprecated
public static final String DBSNP_FILE = "All.vcf.gz";

public static final String STRUCTURAL_VARIANTS_DATA = "svs";
public static final String REPEATS_DATA = "repeats";
public static final String OBO_DATA = "ontology";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,6 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_DATE;
import static org.opencb.cellbase.lib.EtlCommons.CLINVAR_VERSION;

//import org.opencb.biodata.formats.variant.clinvar.v24jaxb.*;

/**
* Created by fjlopez on 28/09/16.
*/
Expand Down Expand Up @@ -78,11 +73,15 @@ public class ClinVarIndexer extends ClinicalIndexer {
private static final String DIPLOTYPE = "Diplotype";
private static final String VARIANT = "Variant";
private static final char CLINICAL_SIGNIFICANCE_SEPARATOR = '/';

private final Path clinvarXMLFiles;
private final Path clinvarSummaryFile;
private final Path clinvarVariationAlleleFile;
private final Path clinvarEFOFile;

private final String version;
private final String assembly;

private int numberSomaticRecords = 0;
private int numberGermlineRecords = 0;
private int numberNoDiseaseTrait = 0;
Expand All @@ -94,15 +93,15 @@ public class ClinVarIndexer extends ClinicalIndexer {
private static final Set<ModeOfInheritance> RECESSIVE_TERM_SET
= new HashSet<>(Arrays.asList(ModeOfInheritance.biallelic));

public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile,
Path clinvarEFOFile, boolean normalize, Path genomeSequenceFilePath, String assembly,
RocksDB rdb) throws IOException {
public ClinVarIndexer(Path clinvarXMLFiles, Path clinvarSummaryFile, Path clinvarVariationAlleleFile, Path clinvarEFOFile,
String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException {
super(genomeSequenceFilePath);
this.rdb = rdb;
this.clinvarXMLFiles = clinvarXMLFiles;
this.clinvarSummaryFile = clinvarSummaryFile;
this.clinvarVariationAlleleFile = clinvarVariationAlleleFile;
this.clinvarEFOFile = clinvarEFOFile;
this.version = version;
this.normalize = normalize;
this.genomeSequenceFilePath = genomeSequenceFilePath;
this.assembly = assembly;
Expand Down Expand Up @@ -310,7 +309,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, String variation
String mateVariantString, String clinicalHaplotypeString,
Map<String, EFO> traitsToEfoTermsMap) {

EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, CLINVAR_VERSION, CLINVAR_DATE);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null);
// Create a set to avoid situations like germline;germline;germline
List<AlleleOrigin> alleleOrigin = null;
if (!EtlCommons.isMissing(lineFields[VARIANT_SUMMARY_ORIGIN_COLUMN])) {
Expand Down Expand Up @@ -391,7 +390,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
throws JsonProcessingException {

List<Property> additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, CLINVAR_VERSION, CLINVAR_DATE);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_NAME, version, null);
// String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();

VariantClassification variantClassification = getVariantClassification(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.builders.CellBaseBuilder;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
Expand All @@ -38,108 +39,82 @@
*/
public class ClinicalVariantBuilder extends CellBaseBuilder {

private final Path clinvarXMLFile;
private final Path clinvarSummaryFile;
private final Path clinvarVariationAlleleFile;
private final Path clinvarEFOFile;
private final Path cosmicFile;
private final Path gwasFile;
private final Path dbsnpFile;
private final Path clinicalVariantFolder;
private final String assembly;
private final Path genomeSequenceFilePath;
private final Path hgmdFile;
private boolean normalize;

public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath,
String assembly, CellBaseSerializer serializer) {
this(clinicalVariantFolder.resolve(EtlCommons.CLINVAR_XML_FILE),
clinicalVariantFolder.resolve(EtlCommons.CLINVAR_SUMMARY_FILE),
clinicalVariantFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE),
clinicalVariantFolder.resolve(EtlCommons.CLINVAR_EFO_FILE),
clinicalVariantFolder.resolve(EtlCommons.COSMIC_FILE),
clinicalVariantFolder.resolve(EtlCommons.GWAS_FILE),
clinicalVariantFolder.resolve(EtlCommons.DBSNP_FILE),
clinicalVariantFolder.resolve(EtlCommons.HGMD_FILE),
normalize,
genomeSequenceFilePath, assembly, serializer);
}
private final CellBaseConfiguration configuration;

public ClinicalVariantBuilder(Path clinvarXMLFile, Path clinvarSummaryFile, Path clinvarVariationAlleleFile,
Path clinvarEFOFile, Path cosmicFile, Path gwasFile, Path dbsnpFile, Path hgmdFile,
boolean normalize, Path genomeSequenceFilePath, String assembly,
CellBaseSerializer serializer) {
public ClinicalVariantBuilder(Path clinicalVariantFolder, boolean normalize, Path genomeSequenceFilePath,
String assembly, CellBaseConfiguration configuration, CellBaseSerializer serializer) {
super(serializer);
this.clinvarXMLFile = clinvarXMLFile;
this.clinvarSummaryFile = clinvarSummaryFile;
this.clinvarVariationAlleleFile = clinvarVariationAlleleFile;
this.clinvarEFOFile = clinvarEFOFile;
this.cosmicFile = cosmicFile;
this.gwasFile = gwasFile;
this.dbsnpFile = dbsnpFile;
this.hgmdFile = hgmdFile;
this.clinicalVariantFolder = clinicalVariantFolder;
this.normalize = normalize;
this.genomeSequenceFilePath = genomeSequenceFilePath;
this.assembly = assembly;
this.configuration = configuration;
}

public void parse() throws IOException, RocksDBException {

public void parse() throws IOException, RocksDBException, CellBaseException {
RocksDB rdb = null;
Options dbOption = null;
String dbLocation = null;

try {
Object[] dbConnection = getDBConnection(clinvarXMLFile.getParent().toString() + "/integration.idx", true);
Object[] dbConnection = getDBConnection(clinicalVariantFolder.toString() + "/integration.idx", true);
rdb = (RocksDB) dbConnection[0];
dbOption = (Options) dbConnection[1];
dbLocation = (String) dbConnection[2];

// COSMIC
// IMPORTANT: COSMIC must be indexed first (before ClinVar, IARC TP53, DOCM, HGMD,...)!!!
if (this.cosmicFile != null && Files.exists(this.cosmicFile)) {
CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, normalize, genomeSequenceFilePath, assembly, rdb);
Path cosmicFile = clinicalVariantFolder.resolve(configuration.getDownload().getCosmic().getFiles().get(0));
if (cosmicFile != null && Files.exists(cosmicFile)) {
CosmicIndexer cosmicIndexer = new CosmicIndexer(cosmicFile, configuration.getDownload().getCosmic().getVersion(),
normalize, genomeSequenceFilePath, assembly, rdb);
cosmicIndexer.index();
} else {
logger.warn("Cosmic file {} missing. Skipping Cosmic data", cosmicFile);
throw new CellBaseException("Could not build clinical variants: the COSMIC file " + cosmicFile + " is missing");
}

// ClinVar
if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null
&& this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile)
&& Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) {
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile,
clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);
clinvarIndexer.index();
} else {
logger.warn("One or more of required ClinVar files are missing. Skipping ClinVar data.\n"
+ "Please, ensure that these two files exist:\n"
+ "{}\n"
+ "{}", this.clinvarXMLFile.toString(), this.clinvarSummaryFile.toString());
}
Path clinvarXMLFile = getPathFromHost(configuration.getDownload().getClinvar().getHost());
Path clinvarSummaryFile = getPathFromHost(configuration.getDownload().getClinvarSummary().getHost());
Path clinvarVariationAlleleFile = getPathFromHost(configuration.getDownload().getClinvarVariationAllele().getHost());
Path clinvarEFOFile = getPathFromHost(configuration.getDownload().getClinvarEfoTerms().getHost());
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile,
clinvarVariationAlleleFile, clinvarEFOFile, configuration.getDownload().getClinvar().getVersion(), normalize,
genomeSequenceFilePath, assembly, rdb);
clinvarIndexer.index();

// HGMD
if (this.hgmdFile != null && Files.exists(hgmdFile)) {
HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, normalize, genomeSequenceFilePath, assembly, rdb);
Path hgmdFile = clinicalVariantFolder.resolve(configuration.getDownload().getHgmd().getFiles().get(0));
if (hgmdFile != null && Files.exists(hgmdFile)) {
HGMDIndexer hgmdIndexer = new HGMDIndexer(hgmdFile, configuration.getDownload().getHgmd().getVersion(), normalize,
genomeSequenceFilePath, assembly, rdb);
hgmdIndexer.index();
} else {
logger.warn("The HGMD file {} is missing. Skipping HGMD data.", hgmdFile);
throw new CellBaseException("Could not build clinical variants: the HGMD file " + hgmdFile + " is missing");
}

// GWAS catalog
// The GWAS catalog file name is the last segment of the configured download host URL.
Path gwasFile = clinicalVariantFolder.resolve(Paths.get(configuration.getDownload().getGwasCatalog().getHost()).getFileName());
if (gwasFile != null && Files.exists(gwasFile)) {
    // The dbSNP VCF belongs to the GWAS catalog download entry (its 'files' list in configuration.yml),
    // so it must be read from getGwasCatalog(), not from the HGMD entry
    Path dbsnpFile = clinicalVariantFolder.resolve(configuration.getDownload().getGwasCatalog().getFiles().get(0));
    if (dbsnpFile != null && Files.exists(dbsnpFile)) {
        // The indexer is only run when the tabix index (.tbi) sits next to the dbSNP file
        Path tabixFile = Paths.get(dbsnpFile.toAbsolutePath() + ".tbi");
        if (tabixFile != null && Files.exists(tabixFile)) {
            GwasIndexer gwasIndexer = new GwasIndexer(gwasFile, dbsnpFile, genomeSequenceFilePath, assembly, rdb);
            gwasIndexer.index();
        } else {
            logger.warn("The dbSNP tabix file {} is missing. Skipping GWAS catalog data.", tabixFile);
            throw new CellBaseException("Could not build clinical variants: the dbSNP tabix file " + tabixFile + " is missing");
        }
    } else {
        logger.warn("The dbSNP file {} is missing. Skipping GWAS catalog data.", dbsnpFile);
        throw new CellBaseException("Could not build clinical variants: the dbSNP file " + dbsnpFile + " is missing");
    }
} else {
    logger.warn("The GWAS catalog file {} is missing. Skipping GWAS catalog data.", gwasFile);
    throw new CellBaseException("Could not build clinical variants: the GWAS catalog file " + gwasFile + " is missing");
}

serializeRDB(rdb);
Expand All @@ -153,6 +128,14 @@ public void parse() throws IOException, RocksDBException {

}

/**
 * Resolves, inside the clinical variant folder, the file whose name is the last segment of a download host URL,
 * and checks that the file exists.
 *
 * @param host Download URL taken from the configuration; only its last path segment is used
 * @return Path of the existing file within the clinical variant folder
 * @throws CellBaseException if the host is null or blank, has no file name segment, or the resolved file does
 *                           not exist
 */
private Path getPathFromHost(String host) throws CellBaseException {
    if (host == null || host.trim().isEmpty()) {
        throw new CellBaseException("Could not build clinical variants. The download host is missing in the configuration");
    }

    // Extract the file name with plain string handling instead of Paths.get(host): a URL is not a file system
    // path, and Paths.get() throws an unchecked InvalidPathException where ':' is not a legal path character
    String trimmedHost = host;
    while (trimmedHost.endsWith("/")) {
        trimmedHost = trimmedHost.substring(0, trimmedHost.length() - 1);
    }
    String filename = trimmedHost.substring(trimmedHost.lastIndexOf('/') + 1);
    if (filename.isEmpty()) {
        throw new CellBaseException("Could not build clinical variants. No file name found in the download host " + host);
    }

    Path path = clinicalVariantFolder.resolve(filename);
    if (!Files.exists(path)) {
        throw new CellBaseException("Could not build clinical variants. The file " + path + " is missing");
    }
    return path;
}

private void serializeRDB(RocksDB rdb) throws IOException {
// DO NOT change the name of the rocksIterator variable - for some unexplainable reason Java VM crashes if it's
// named "iterator"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,12 @@
public class CosmicIndexer extends ClinicalIndexer {

private final Path cosmicFile;
private final String version;
private final String assembly;

private Pattern mutationGRCh37GenomePositionPattern;
private Pattern snvPattern;

@Deprecated
private static final String COSMIC_VERSION = "v95";

private static final int GENE_NAMES_COLUMN = 0;
private static final int HGNC_COLUMN = 3;
private static final int PRIMARY_SITE_COLUMN = 7;
Expand Down Expand Up @@ -85,10 +84,12 @@ public class CosmicIndexer extends ClinicalIndexer {
private int rocksDBNewVariants = 0;
private int rocksDBUpdateVariants = 0;

public CosmicIndexer(Path cosmicFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException {
public CosmicIndexer(Path cosmicFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb)
throws IOException {
super(genomeSequenceFilePath);

this.cosmicFile = cosmicFile;
this.version = version;
this.normalize = normalize;
this.assembly = assembly;
this.rdb = rdb;
Expand Down Expand Up @@ -470,7 +471,7 @@ private EvidenceEntry buildCosmic(String[] fields) {
String id = fields[ID_COLUMN];
String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id;

EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_NAME, COSMIC_VERSION, null);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.COSMIC_NAME, version, null);
SomaticInformation somaticInformation = getSomaticInformation(fields);
List<GenomicFeature> genomicFeatureList = getGenomicFeature(fields);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@
*/
public class HGMDIndexer extends ClinicalIndexer {
private final Path hgmdFile;
private final String version;
private final String assembly;

public HGMDIndexer(Path hgmdFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb)
public HGMDIndexer(Path hgmdFile, String version, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb)
throws IOException {
super(genomeSequenceFilePath);
this.rdb = rdb;
this.assembly = assembly;
this.hgmdFile = hgmdFile;
this.version = version;
this.normalize = normalize;
this.assembly = assembly;
this.rdb = rdb;
}

public void index() throws RocksDBException, IOException {
Expand Down Expand Up @@ -93,7 +95,7 @@ private void parseHgmdInfo(Variant variant) {
}

// Source
entry.setSource(new EvidenceSource(EtlCommons.HGMD_NAME, "2020.3", "2020"));
entry.setSource(new EvidenceSource(EtlCommons.HGMD_NAME, version, null));

// Assembly
entry.setAssembly(assembly);
Expand Down
Loading

0 comments on commit c2345d4

Please sign in to comment.