Skip to content

Commit

Permalink
lib: load dbSNP data in the CellBase MongoDB collection 'snp', #TASK-…
Browse files Browse the repository at this point in the history
…5817, #TASK-5789
  • Loading branch information
jtarraga committed Mar 11, 2024
1 parent b75b1f7 commit 4e68dab
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ private CellBaseBuilder buildRefSeq() {
return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer);
}

private CellBaseBuilder buildVariation() {
private CellBaseBuilder buildVariation() throws IOException {
Path downloadVariationPath = downloadFolder.resolve(VARIATION_DATA);
Path buildVariationPath = buildFolder.resolve(VARIATION_DATA);
if (!buildVariationPath.toFile().exists()) {
Expand All @@ -288,7 +288,8 @@ private CellBaseBuilder buildVariation() {
CellBaseFileSerializer variationSerializer = new CellBaseJsonFileSerializer(buildVariationPath);

// Currently, only dbSNP data
copyVersionFiles(Collections.singletonList(downloadVariationPath.resolve(DBSNP_VERSION_FILENAME)));
Files.copy(downloadVariationPath.resolve(DBSNP_VERSION_FILENAME), buildVariationPath.resolve(DBSNP_VERSION_FILENAME),
StandardCopyOption.REPLACE_EXISTING);
return new VariationBuilder(downloadVariationPath, variationSerializer, configuration);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
import java.util.List;
import java.util.concurrent.ExecutionException;

import static org.opencb.cellbase.lib.EtlCommons.*;

/**
* Created by imedina on 03/02/15.
*/
Expand Down Expand Up @@ -372,30 +374,57 @@ private void checkParameters() throws CellBaseException {
private void loadVariationData() throws NoSuchMethodException, InterruptedException, ExecutionException,
InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException,
IOException, LoaderException, CellBaseException {
Path variationPath = input.resolve(VARIATION_DATA);
// First load data
// Common loading process from CellBase variation data models
if (field == null) {
DirectoryStream<Path> stream = Files.newDirectoryStream(input,
// Common loading process from CellBase variation data models
DirectoryStream<Path> stream = Files.newDirectoryStream(variationPath,
entry -> entry.getFileName().toString().startsWith("variation_chr"));

int numLoadings = 0;
for (Path entry : stream) {
logger.info("Loading file '{}'", entry);
loadRunner.load(input.resolve(entry.getFileName()), "variation", dataRelease);
loadRunner.load(variationPath.resolve(entry.getFileName()), "variation", dataRelease);
numLoadings++;
}

// Create index
createIndex("variation");

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
input.resolve("ensemblVariationVersion.json")
));
dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
if (numLoadings > 0) {
// Create index
createIndex("variation");

// Update release (collection and sources)
List<Path> sources = new ArrayList<>(Arrays.asList(
variationPath.resolve("ensemblVariationVersion.json")
));
dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
} else {
logger.info("Any variation file 'variation_chr...' found within folder '{}'", variationPath);
}
} else {
// Custom update required e.g. population freqs loading
logger.info("Loading file '{}'", variationPath);
loadRunner.load(variationPath, "variation", dataRelease, field, innerFields);
}

// Load dbSNP
Path dbSnpFilePath = variationPath.resolve(DBSNP_NAME + ".json.gz");
if (dbSnpFilePath.toFile().exists()) {
if (variationPath.resolve(DBSNP_VERSION_FILENAME).toFile().exists()) {
logger.info("Loading dbSNP file '{}'", dbSnpFilePath);
loadRunner.load(dbSnpFilePath, SNP_COLLECTION_NAME, dataRelease);

// Create index
createIndex(SNP_COLLECTION_NAME);

// Update release (collection and sources)
List<Path> sources = Collections.singletonList(variationPath.resolve(DBSNP_VERSION_FILENAME));
dataReleaseManager.update(dataRelease, SNP_COLLECTION_NAME, EtlCommons.VARIATION_DATA, sources);
} else {
logger.warn("In order to load the dbSNP file you need the version file {} within the folder '{}'", DBSNP_VERSION_FILENAME,
variationPath);
}
} else {
logger.info("Loading file '{}'", input);
loadRunner.load(input, "variation", dataRelease, field, innerFields);
logger.warn("Any dbSNP file found within the folder '{}'", variationPath);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public class EtlCommons {
// dbSNP data file name; NOTE(review): presumably the name of the downloaded dbSNP archive - confirm against the downloader.
public static final String DBSNP_FILE = "GCF_000001405.40.gz";
// Base name for dbSNP artifacts; used as a prefix when composing dbSNP file names.
public static final String DBSNP_NAME = "dbSNP";
// Name of the dbSNP version file, i.e. "dbSNPVersion.json".
public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json";
// Name of the collection that holds the dbSNP data.
public static final String SNP_COLLECTION_NAME = "snp";

public static final String STRUCTURAL_VARIANTS_DATA = "svs";
public static final String REPEATS_DATA = "repeats";
Expand Down
3 changes: 3 additions & 0 deletions cellbase-lib/src/main/resources/mongodb-indexes.json
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,6 @@
{"collection": "pharmacogenomics", "fields": {"variants.phenotypeType": 1}, "options": {"background": true}}
{"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}}
{"collection": "pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": {"background": true}}

{"collection": "snp", "fields": {"id": 1}, "options": {"background": true}}
{"collection": "snp", "fields": {"chromosome": 1, "position": 1, "reference": 1}, "options": {"background": true}}

0 comments on commit 4e68dab

Please sign in to comment.