diff --git a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/BioNetDBMain.java b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/BioNetDBMain.java index fecf79e..0ff72ed 100644 --- a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/BioNetDBMain.java +++ b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/BioNetDBMain.java @@ -51,6 +51,9 @@ public static void main(String[] args) { case "load": commandExecutor = new LoadCommandExecutor(cliOptionsParser.getLoadCommandOptions()); break; + case "create-csv": + commandExecutor = new ImportCommandExecutor(cliOptionsParser.getCreateCsvCommandOptions()); + break; case "import": commandExecutor = new ImportCommandExecutor(cliOptionsParser.getImportCommandOptions()); break; diff --git a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/CliOptionsParser.java b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/CliOptionsParser.java index 4484118..15b4095 100644 --- a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/CliOptionsParser.java +++ b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/CliOptionsParser.java @@ -18,6 +18,7 @@ public class CliOptionsParser { private BuildCommandOptions buildCommandOptions; private LoadCommandOptions loadCommandOptions; + private CreateCsvCommandOptions createCsvCommandOptions; private ImportCommandOptions importCommandOptions; private QueryCommandOptions queryCommandOptions; private VariantAnnotationCommandOptions variantAnnotationCommandOptions; @@ -34,6 +35,7 @@ public CliOptionsParser() { buildCommandOptions = new BuildCommandOptions(); loadCommandOptions = new LoadCommandOptions(); + createCsvCommandOptions = new CreateCsvCommandOptions(); importCommandOptions = new ImportCommandOptions(); queryCommandOptions = new QueryCommandOptions(); variantAnnotationCommandOptions = new VariantAnnotationCommandOptions(); @@ -41,6 +43,7 @@ public CliOptionsParser() { jcommander.addCommand("build", buildCommandOptions); jcommander.addCommand("load", loadCommandOptions); 
+ jcommander.addCommand("create-csv", createCsvCommandOptions); jcommander.addCommand("import", importCommandOptions); jcommander.addCommand("query", queryCommandOptions); jcommander.addCommand("annotation", variantAnnotationCommandOptions); @@ -120,8 +123,34 @@ public class LoadCommandOptions { @Parameter(names = {"-i", "--input"}, description = "Input directory", required = true, arity = 1) public String input; + @Parameter(names = {"-d", "--data-type"}, description = "Data type. Valid values: clinical-analysis", required = true, arity = 1) + public String dataType; + @Parameter(names = {"--database"}, description = "Data model type to be loaded, i.e. genome, gene, ...", arity = 1) public String database; +// +// @Parameter(names = {"--exclude"}, description = "Exclude information separated by comma, e.g.:'XREF_DBNAME:Reactome Database ID Release 63'", arity = 1) +// public List exclude; + + @DynamicParameter(names = "-D", description = "Dynamic parameters go here", hidden = true) + public Map loaderParams = new HashMap<>(); + + } + + @Parameters(commandNames = {"create-csv"}, commandDescription = "Create CSV files from the input biological files, ready to be imported into the database") + public class CreateCsvCommandOptions { + + @ParametersDelegate + public CommonCommandOptions commonOptions = commonCommandOptions; + + @Parameter(names = {"-i", "--input"}, description = "Input directory that contains the biological files to convert to CSV files", required = true, arity = 1) + public String input; + + @Parameter(names = {"-o", "--output"}, description = "Output directory where to save the CSV files to import", required = true, arity = 1) + public String output; // required: the create-csv executor resolves this path unconditionally + + @Parameter(names = {"--clinical-analysis"}, description = "Input JSON files contain clinical analysis (otherwise, variants)", arity = 0) + public boolean clinicalAnalysis = false; @Parameter(names = {"--exclude"}, description = "Exclude information separated by comma, e.g.:'XREF_DBNAME:Reactome Database ID Release 63'", 
arity = 1) public List exclude; @@ -137,14 +166,9 @@ public class ImportCommandOptions { @ParametersDelegate public CommonCommandOptions commonOptions = commonCommandOptions; - @Parameter(names = {"-i", "--input"}, description = "Input directory where the CSV files are located (when used with --create-csv-files parameter, it contains the biological files to convert to CSV files)", required = true, arity = 1) + @Parameter(names = {"-i", "--input"}, description = "Input directory where the CSV files are located", required = true, arity = 1) public String input; - @Parameter(names = {"-o", "--output"}, description = "Output directory where to save the CSV files to import (used with the --create-csv-files parameter)", arity = 1) - public String output; - - @Parameter(names = {"--create-csv-files"}, description = "Create the CSV files from the input biological files", arity = 0) - public boolean createCsvFiles = false; @Parameter(names = {"--database"}, description = "Data model type to be loaded, i.e. 
genome, gene, ...", arity = 1) public String database; @@ -341,9 +365,9 @@ public LoadCommandOptions getLoadCommandOptions() { return loadCommandOptions; } - public ImportCommandOptions getImportCommandOptions() { - return importCommandOptions; - } + public CreateCsvCommandOptions getCreateCsvCommandOptions() { return createCsvCommandOptions; } + + public ImportCommandOptions getImportCommandOptions() { return importCommandOptions; } public QueryCommandOptions getQueryCommandOptions() { return queryCommandOptions; diff --git a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/ImportCommandExecutor.java b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/ImportCommandExecutor.java index 66506db..11bd667 100644 --- a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/ImportCommandExecutor.java +++ b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/ImportCommandExecutor.java @@ -2,6 +2,7 @@ import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; +import org.opencb.bionetdb.core.exceptions.BioNetDBException; import org.opencb.bionetdb.core.models.network.Node; import org.opencb.bionetdb.core.models.network.Relation; import org.opencb.bionetdb.core.utils.CsvInfo; @@ -27,20 +28,31 @@ */ public class ImportCommandExecutor extends CommandExecutor { + private CliOptionsParser.CreateCsvCommandOptions createCsvCommandOptions; private CliOptionsParser.ImportCommandOptions importCommandOptions; + public ImportCommandExecutor(CliOptionsParser.CreateCsvCommandOptions createCsvCommandOptions) { + super(createCsvCommandOptions.commonOptions.logLevel, createCsvCommandOptions.commonOptions.conf); + + this.createCsvCommandOptions = createCsvCommandOptions; + this.importCommandOptions = null; + } + public ImportCommandExecutor(CliOptionsParser.ImportCommandOptions importCommandOptions) { super(importCommandOptions.commonOptions.logLevel, importCommandOptions.commonOptions.conf); + this.createCsvCommandOptions = null; 
this.importCommandOptions = importCommandOptions; } @Override - public void execute() { - if (importCommandOptions.createCsvFiles) { + public void execute() throws BioNetDBException { + if (createCsvCommandOptions != null) { createCsvFiles(); - } else { + } else if (importCommandOptions != null) { importCsvFiles(); + } else { + throw new BioNetDBException("Import commandline error"); } } @@ -49,10 +61,10 @@ private void createCsvFiles() { long start; // Check input and output directories - Path inputPath = Paths.get(importCommandOptions.input); + Path inputPath = Paths.get(createCsvCommandOptions.input); FileUtils.checkDirectory(inputPath); - Path outputPath = Paths.get(importCommandOptions.output); + Path outputPath = Paths.get(createCsvCommandOptions.output); FileUtils.checkDirectory(outputPath); // Prepare CSV object @@ -107,7 +119,7 @@ private void createCsvFiles() { FileUtils.checkFile(geneFile.toPath()); } start = System.currentTimeMillis(); - importer.indexingGenes(geneFile.toPath(), outputPath); + importer.indexingGenes(geneFile.toPath()); geneIndexingTime = (System.currentTimeMillis() - start) / 1000; logger.info("Gene indexing done in {} s", geneIndexingTime); @@ -119,7 +131,7 @@ private void createCsvFiles() { FileUtils.checkFile(proteinFile.toPath()); } start = System.currentTimeMillis(); - importer.indexingProteins(proteinFile.toPath(), outputPath); + importer.indexingProteins(proteinFile.toPath()); proteinIndexingTime = (System.currentTimeMillis() - start) / 1000; logger.info("Protein indexing done in {} s", proteinIndexingTime); @@ -144,7 +156,7 @@ private void createCsvFiles() { } // Parse BioPAX files - Map> filters = parseFilters(importCommandOptions.exclude); + Map> filters = parseFilters(createCsvCommandOptions.exclude); BPAXProcessing bpaxProcessing = new BPAXProcessing(importer); Neo4jBioPaxImporter bioPAXImporter = new Neo4jBioPaxImporter(csv, filters, bpaxProcessing); start = System.currentTimeMillis(); @@ -153,10 +165,15 @@ private void 
createCsvFiles() { bioPaxTime = (System.currentTimeMillis() - start) / 1000; - // Parse JSON variant files - start = System.currentTimeMillis(); - importer.addVariantFiles(jsonFiles); - long variantTime = (System.currentTimeMillis() - start) / 1000; + start = System.currentTimeMillis(); + if (createCsvCommandOptions.clinicalAnalysis) { + // Parse JSON clinical analysis files + importer.addClinicalAnalysisFiles(jsonFiles); + } else { + // Parse JSON variant files + importer.addVariantFiles(jsonFiles); + } + long jsonTime = (System.currentTimeMillis() - start) / 1000; // Close CSV files csv.close(); @@ -166,7 +183,7 @@ private void createCsvFiles() { logger.info("Gene panels processing in {} s", genePanelsTime); logger.info("miRNA indexing in {} s", miRnaIndexingTime); logger.info("BioPAX processing in {} s", bioPaxTime); - logger.info("Variant processing in {} s", variantTime); + logger.info((createCsvCommandOptions.clinicalAnalysis ? "Clinical analysis" : "Variant") + " processing in {} s", jsonTime); } catch (IOException e) { logger.error("Error generation CSV files: {}", e.getMessage()); e.printStackTrace(); @@ -200,7 +217,7 @@ private void importCsvFiles() { sb.setLength(0); sb.append(neo4jHome); sb.append("/bin/neo4j-admin import --id-type INTEGER --delimiter=\"" + StringEscapeUtils.escapeJava(CsvInfo.SEPARATOR) + "\" " - + "--ignore-duplicate-nodes --ignore-missing-nodes"); + + "--ignore-duplicate-nodes --ignore-missing-nodes"); // Retrieving files from the input directory List relationFiles = new ArrayList<>(); @@ -307,9 +324,9 @@ private String removeCsvExt(String filename) { return name; } - //------------------------------------------------------------------------- - // BioPAX importer callback object - //------------------------------------------------------------------------- +//------------------------------------------------------------------------- +// BioPAX importer callback object +//------------------------------------------------------------------------- 
public class BPAXProcessing implements Neo4jBioPaxImporter.BioPAXProcessing { private Neo4jCsvImporter importer; diff --git a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/LoadCommandExecutor.java b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/LoadCommandExecutor.java index defe656..aa2bdad 100644 --- a/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/LoadCommandExecutor.java +++ b/bionetdb-app/src/main/java/org/opencb/bionetdb/app/cli/LoadCommandExecutor.java @@ -1,21 +1,28 @@ package org.opencb.bionetdb.app.cli; +import org.apache.commons.lang.StringUtils; import org.opencb.bionetdb.core.BioNetDbManager; +import org.opencb.bionetdb.core.exceptions.BioNetDBException; +import org.opencb.bionetdb.core.neo4j.Neo4JLoader; import org.opencb.commons.utils.FileUtils; import org.opencb.commons.utils.ListUtils; +import java.io.BufferedReader; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.util.*; + +import static org.neo4j.driver.v1.Values.parameters; /** * Created by imedina on 12/08/15. 
*/ public class LoadCommandExecutor extends CommandExecutor { + private final String CLINICAL_ANALYSIS = "clinical-analysis"; + private final Set dataTypes = new HashSet<>(Arrays.asList(CLINICAL_ANALYSIS)); + private CliOptionsParser.LoadCommandOptions loadCommandOptions; public LoadCommandExecutor(CliOptionsParser.LoadCommandOptions loadCommandOptions) { @@ -38,24 +45,31 @@ public void execute() { // BioNetDbManager checks if database parameter is empty BioNetDbManager bioNetDbManager = new BioNetDbManager(loadCommandOptions.database, configuration); - Map> filter = null; - if (ListUtils.isNotEmpty(loadCommandOptions.exclude)) { - filter = new HashMap<>(); - for (String exclude: loadCommandOptions.exclude) { - String split[] = exclude.split(":"); - if (split.length == 2) { - if (!filter.containsKey(split[0])) { - filter.put(split[0], new HashSet<>()); - } - filter.get(split[0]).add(split[1]); - } + if (dataTypes.contains(loadCommandOptions.dataType)) { + if (CLINICAL_ANALYSIS.equals(loadCommandOptions.dataType)) { + bioNetDbManager.loadClinicalAnalysis(inputPath); } + } else { + throw new BioNetDBException("Unknown data type to load: " + loadCommandOptions.dataType + + ". 
Valid data types values are: " + StringUtils.join(dataTypes, ",")); } - bioNetDbManager.loadBioPax(inputPath, filter); + +// Map> filter = null; +// if (ListUtils.isNotEmpty(loadCommandOptions.exclude)) { +// filter = new HashMap<>(); +// for (String exclude: loadCommandOptions.exclude) { +// String split[] = exclude.split(":"); +// if (split.length == 2) { +// if (!filter.containsKey(split[0])) { +// filter.put(split[0], new HashSet<>()); +// } +// filter.get(split[0]).add(split[1]); +// } +// } +// } +// bioNetDbManager.loadBioPax(inputPath, filter); } catch (Exception e) { e.printStackTrace(); } - - } } diff --git a/bionetdb-app/src/test/java/org/opencb/bionetdb/app/BioNetDBMainTest.java b/bionetdb-app/src/test/java/org/opencb/bionetdb/app/BioNetDBMainTest.java new file mode 100644 index 0000000..f753b36 --- /dev/null +++ b/bionetdb-app/src/test/java/org/opencb/bionetdb/app/BioNetDBMainTest.java @@ -0,0 +1,15 @@ +package org.opencb.bionetdb.app; + +import org.junit.Test; + +import static org.junit.Assert.*; + +public class BioNetDBMainTest { + + @Test + public void createCsvClinicalAnalysis() { + String caPath = "/home/jtarraga/data150/clinicalAnalysis"; + String cmdLine = "~/appl/bionetdb/build/bin/bionetdb.sh create-csv -i " + caPath + "/input/ -o csv/ --clinical-analysis"; + } + +} \ No newline at end of file diff --git a/bionetdb-core/pom.xml b/bionetdb-core/pom.xml index 3a522d9..1de8e2c 100644 --- a/bionetdb-core/pom.xml +++ b/bionetdb-core/pom.xml @@ -79,7 +79,22 @@ org.opencb.opencga opencga-storage-core - 1.4.1-dev + ${opencga.version} + + + org.neo4j + neo4j + 3.2.8-SNAPSHOT + + + org.neo4j + neo4j-kernel + 3.2.8-SNAPSHOT + + + org.neo4j + neo4j-kernel + 3.2.8-SNAPSHOT diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/BioNetDbManager.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/BioNetDbManager.java index a0445c2..84fe0a8 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/BioNetDbManager.java +++ 
b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/BioNetDbManager.java @@ -2,6 +2,7 @@ import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.StringUtils; +import org.neo4j.driver.v1.Session; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.variant.converters.avro.VariantContextToVariantConverter; import org.opencb.bionetdb.core.analysis.InterpretationAnalysis; @@ -33,13 +34,17 @@ import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryResult; +import org.opencb.commons.utils.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; import java.io.IOException; import java.nio.file.Path; import java.util.*; +import static org.neo4j.driver.v1.Values.parameters; + /** * Created by joaquin on 1/29/18. */ @@ -135,6 +140,37 @@ public void loadVcf(java.nio.file.Path path) throws BioNetDBException { neo4JVariantLoader.loadVCFFile(path); } + /** + * Loads clinical analyses into the database: every line of the input file is passed, as a JSON string, + * to the user defined procedure loadClinicalAnalysis. + * + * @param inputPath File to read, one JSON object (a clinical analysis) per line + * @throws IOException If the input file cannot be opened or read + */ + public void loadClinicalAnalysis(Path inputPath) throws IOException { + long counter = 0; + + // try-with-resources: both the session and the reader are closed even when a read or a procedure call fails + try (Session session = ((Neo4JNetworkDBAdaptor) this.networkDBAdaptor).getDriver().session(); + BufferedReader reader = FileUtils.newBufferedReader(inputPath)) { + // Reading file line by line, each line a JSON object (corresponding to a clinical analysis) + String line = reader.readLine(); + while (line != null) { + counter++; + System.out.println("Loading clinical analysis #" + counter + ". 
Size: " + line.length() + " bytes"); + + // Call user defined procedure: loadClinicalAnalysis + session.run("CALL org.opencb.bionetdb.core.neo4j.loadClinicalAnalysis($caJson)", + parameters("caJson", line)); + + // Read next line + line = reader.readLine(); + } + } + + System.out.println("Loaded " + counter + " clinical analysis"); + } + public void importFiles(Path inputPath, Path outputPath, Path neo4jHome) throws BioNetDBException, IOException, InterruptedException { // Import Neo4JVariantLoader neo4JVariantLoader = new Neo4JVariantLoader((Neo4JNetworkDBAdaptor) networkDBAdaptor); diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/VariantAnalysis.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/VariantAnalysis.java index c459be8..2330579 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/VariantAnalysis.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/VariantAnalysis.java @@ -18,6 +18,8 @@ import java.io.IOException; import java.util.*; +import static org.opencb.biodata.models.clinical.interpretation.ClinicalProperty.Penetrance.COMPLETE; + public class VariantAnalysis extends BioNetDBAnalysis { public VariantAnalysis(NetworkDBAdaptor networkDBAdaptor) { @@ -26,7 +28,7 @@ public VariantAnalysis(NetworkDBAdaptor networkDBAdaptor) { public QueryResult getDominantVariants(Pedigree pedigree, Disorder disorder, Query query) throws BioNetDBException, IOException { - Map> genotypes = ModeOfInheritance.dominant(pedigree, disorder, false); + Map> genotypes = ModeOfInheritance.dominant(pedigree, disorder, COMPLETE); putGenotypes(query, genotypes); return networkDBAdaptor.variantQuery(query, QueryOptions.empty()); @@ -34,7 +36,7 @@ public QueryResult getDominantVariants(Pedigree pedigree, Disorder diso public QueryResult getRecessiveVariants(Pedigree pedigree, Disorder disorder, Query query) throws BioNetDBException { - Map> genotypes = 
ModeOfInheritance.recessive(pedigree, disorder, false); + Map> genotypes = ModeOfInheritance.recessive(pedigree, disorder, COMPLETE); putGenotypes(query, genotypes); return networkDBAdaptor.variantQuery(query, QueryOptions.empty()); @@ -42,7 +44,7 @@ public QueryResult getRecessiveVariants(Pedigree pedigree, Disorder dis public QueryResult getXLinkedDominantVariants(Pedigree pedigree, Disorder disorder, Query query) throws BioNetDBException { - Map> genotypes = ModeOfInheritance.xLinked(pedigree, disorder, true); + Map> genotypes = ModeOfInheritance.xLinked(pedigree, disorder, true, COMPLETE); query.put(VariantQueryParam.CHROMOSOME.key(), "X"); putGenotypes(query, genotypes); @@ -51,7 +53,7 @@ public QueryResult getXLinkedDominantVariants(Pedigree pedigree, Disord public QueryResult getXLinkedRecessiveVariants(Pedigree pedigree, Disorder disorder, Query query) throws BioNetDBException { - Map> genotypes = ModeOfInheritance.xLinked(pedigree, disorder, false); + Map> genotypes = ModeOfInheritance.xLinked(pedigree, disorder, false, COMPLETE); query.put(VariantQueryParam.CHROMOSOME.key(), "X"); putGenotypes(query, genotypes); @@ -59,7 +61,7 @@ public QueryResult getXLinkedRecessiveVariants(Pedigree pedigree, Disor } public QueryResult getYLinkedVariants(Pedigree pedigree, Disorder disorder, Query query) throws BioNetDBException { - Map> genotypes = ModeOfInheritance.yLinked(pedigree, disorder); + Map> genotypes = ModeOfInheritance.yLinked(pedigree, disorder, COMPLETE); query.put(VariantQueryParam.CHROMOSOME.key(), "Y"); putGenotypes(query, genotypes); diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/interpretation/ProteinNetworkInterpretationAnalysis.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/interpretation/ProteinNetworkInterpretationAnalysis.java index 10a1486..60a1d14 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/interpretation/ProteinNetworkInterpretationAnalysis.java +++ 
b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/analysis/interpretation/ProteinNetworkInterpretationAnalysis.java @@ -14,6 +14,8 @@ import java.util.*; +import static org.opencb.biodata.models.clinical.interpretation.ClinicalProperty.Penetrance.COMPLETE; + public class ProteinNetworkInterpretationAnalysis { private NetworkDBAdaptor networkDBAdaptor; @@ -28,19 +30,19 @@ public QueryResult execute(Pedigree pedigree, Disorder disorder, Clinic Map> genotypes; switch (moi) { case MONOALLELIC: - genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.dominant(pedigree, disorder, false); + genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.dominant(pedigree, disorder, COMPLETE); break; case BIALLELIC: - genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.recessive(pedigree, disorder, false); + genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.recessive(pedigree, disorder, COMPLETE); break; case XLINKED_MONOALLELIC: - genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.xLinked(pedigree, disorder, true); + genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.xLinked(pedigree, disorder, true, COMPLETE); break; case XLINKED_BIALLELIC: - genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.xLinked(pedigree, disorder, false); + genotypes = org.opencb.biodata.tools.pedigree.ModeOfInheritance.xLinked(pedigree, disorder, false, COMPLETE); break; case YLINKED: - genotypes = ModeOfInheritance.yLinked(pedigree, disorder); + genotypes = ModeOfInheritance.yLinked(pedigree, disorder, COMPLETE); break; default: genotypes = new HashMap<>(); diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Node.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Node.java index d91a75e..8ce455e 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Node.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Node.java 
@@ -46,6 +46,11 @@ public enum Type { XREF ("XREF"), + PANEL_GENE ("PANEL_GENE"), + PANEL_VARIANT ("PANEL_VARIANT"), + PANEL_STR ("PANEL_STR"), + PANEL_REGION ("PANEL_REGION"), + PROTEIN_ANNOTATION ("PROTEIN_ANNOTATION"), PROTEIN_FEATURE ("PROTEIN_FEATURE"), @@ -78,18 +83,39 @@ public enum Type { FILE("FILE"), SAMPLE("SAMPLE"), + INDIVIDUAL("INDIVIDUAL"), + FAMILY("FAMILY"), VARIANT_CALL("VARIANT_CALL"), VARIANT_FILE_INFO("VARIANT_FILE_INFO"), + EXPERIMENT("EXPERIMENT"), TRANSCRIPT_ANNOTATION_FLAG("TRANSCRIPT_ANNOTATION_FLAG"), EXON_OVERLAP("EXON_OVERLAP"), PROTEIN_KEYWORD("PROTEIN_KEYWORD"), PANEL("PANEL"), + GENOMIC_FEATURE("GENOMIC_FEATURE"), + + DISORDER("DISORDER"), + PHENOTYPE("PHENOTYPE"), + ONTOLOGY_TERM("ONTOLOGY_TERM"), VARIANT_OBJECT("VARIANT_OBJECT"), GENE_OBJECT("GENE_OBJECT"), - PROTEIN_OBJECT("PROTEIN_OBJECT"); + PROTEIN_OBJECT("PROTEIN_OBJECT"), + + CLINICAL_ANALYSIS("CLINICAL_ANALYSIS"), + INTERPRETATION("INTERPRETATION"), + REPORTED_VARIANT("REPORTED_VARIANT"), + REPORTED_EVENT("REPORTED_EVENT"), + VARIANT_CLASSIFICATION("VARIANT_CLASSIFICATION"), + ANALYST("ANALYST"), + CLINICAL_ANALYST("CLINICAL_ANALYST"), + SOFTWARE("SOFTWARE"), + LOW_COVERAGE_REGION("LOW_COVERAGE_REGION"), + COMMENT("COMMENT"), + + ALERT("ALERT"); private final String type; private final String parentType; diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Relation.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Relation.java index daf7d2f..72c9c3d 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Relation.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/models/network/Relation.java @@ -130,18 +130,64 @@ public enum Type { VARIANT_FILE_INFO__FILE("VARIANT_FILE_INFO__FILE"), VARIANT_CALL__VARIANT_FILE_INFO("VARIANT_CALL__VARIANT_FILE_INFO"), + FILE__SOFTWARE("FILE__SOFTWARE"), + FILE__EXPERIMENT("FILE__EXPERIMENT"), + FILE__SAMPLE("FILE__SAMPLE"), + // DNA__GENE("DNA__GENE"); 
+ @Deprecated PANEL__GENE("PANEL__GENE"), + PANEL__PANEL_GENE("PANEL__PANEL_GENE"), + PANEL__PANEL_VARIANT("PANEL__PANEL_VARIANT"), + PANEL__PANEL_STR("PANEL__PANEL_STR"), + PANEL__PANEL_REGION("PANEL__PANEL_REGION"), + PANEL__PHENOTYPE("PANEL__PHENOTYPE"), + PHENOTYPE__ONTOLOGY_TERM("PHENOTYPE__ONTOLOGY_TERM"), + PANEL_GENE__GENE("PANEL_GENE__GENE"), + PANEL_GENE__ONTOLOGY_TERM("PANEL_GENE__ONTOLOGY_TERM"), + PANEL_VARIANT__VARIANT("PANEL_VARIANT__VARIANT"), + PANEL_VARIANT__ONTOLOGY_TERM("PANEL_VARIANT__ONTOLOGY_TERM"), + PANEL_STR__ONTOLOGY_TERM("PANEL_STR__ONTOLOGY_TERM"), + PANEL_REGION__ONTOLOGY_TERM("PANEL_REGION__ONTOLOGY_TERM"), VARIANT__VARIANT_OBJECT("VARIANT__VARIANT_OBJECT"), GENE__GENE_OBJECT("GENE__GENE_OBJECT"), - PROTEIN__PROTEIN_OBJECT("PROTEIN__PROTEIN_OBJECT"); - -// DISEASE_GROUP__PANEL("DISEASE_GROUP__PANEL"), -// DISEASE_SUBGROUP__PANEL("DISEASE_SUBGROUP__PANEL"), -// DISEASE_SUBGROUP__DISEASE_GROUP("DISEASE_SUBGROUP__DISEASE_GROUP"), -// PANEL__ONTOLOGY("PANEL__ONTOLOGY"); + PROTEIN__PROTEIN_OBJECT("PROTEIN__PROTEIN_OBJECT"), + + FAMILY__PHENOTYPE("FAMILY__PHENOTYPE"), + FAMILY__DISORDER("FAMILY__DISORDER"), + FAMILY__INDIVIDUAL("FAMILY__INDIVIDUAL"), + FATHER_OF___INDIVIDUAL___INDIVIDUAL("FATHER_OF___INDIVIDUAL___INDIVIDUAL"), + MOTHER_OF___INDIVIDUAL___INDIVIDUAL("MOTHER_OF___INDIVIDUAL___INDIVIDUAL"), + INDIVIDUAL__PHENOTYPE("INDIVIDUAL__PHENOTYPE"), + INDIVIDUAL__DISORDER("INDIVIDUAL__DISORDER"), + INDIVIDUAL__SAMPLE("INDIVIDUAL__SAMPLE"), + DISORDER__PHENOTYPE("DISORDER__PHENOTYPE"), + SAMPLE__PHENOTYPE("SAMPLE__PHENOTYPE"), + + CLINICAL_ANALYSIS__DISORDER("CLINICAL_ANALYSIS__DISORDER"), + CLINICAL_ANALYSIS__FAMILY("CLINICAL_ANALYSIS__FAMILY"), + CLINICAL_ANALYSIS__FILE("CLINICAL_ANALYSIS__FILE"), + PROBAND___CLINICAL_ANALYSIS___INDIVIDUAL("PROBAND___CLINICAL_ANALYSIS___INDIVIDUAL"), + CLINICAL_ANALYSIS__CLINICAL_ANALYST("CLINICAL_ANALYSIS__CLINICAL_ANALYST"), + CLINICAL_ANALYSIS__COMMENT("CLINICAL_ANALYSIS__COMMENT"), + 
CLINICAL_ANALYSIS__ALERT("CLINICAL_ANALYSIS__ALERT"), + CLINICAL_ANALYSIS__INTERPRETATION("CLINICAL_ANALYSIS__INTERPRETATION"), + + INTERPRETATION__PANEL("INTERPRETATION__PANEL"), + PRIMARY_FINDING___INTERPRETATION___REPORTED_VARIANT("PRIMARY_FINDING___INTERPRETATION___REPORTED_VARIANT"), + SECONDARY_FINDING___INTERPRETATION___REPORTED_VARIANT("SECONDARY_FINDING___INTERPRETATION___REPORTED_VARIANT"), + INTERPRETATION__COMMENT("INTERPRETATION__COMMENT"), + INTERPRETATION__SOFTWARE("INTERPRETATION__SOFTWARE"), + INTERPRETATION__LOW_COVERAGE_REGION("INTERPRETATION__LOW_COVERAGE_REGION"), + REPORTED_VARIANT__VARIANT("REPORTED_VARIANT__VARIANT"), + REPORTED_VARIANT__REPORTED_EVENT("REPORTED_VARIANT__REPORTED_EVENT"), + REPORTED_VARIANT__COMMENT("REPORTED_VARIANT__COMMENT"), + REPORTED_EVENT__PHENOTYPE("REPORTED_EVENT__PHENOTYPE"), + REPORTED_EVENT__SO("REPORTED_EVENT__SO"), + REPORTED_EVENT__GENOMIC_FEATURE("REPORTED_EVENT__GENOMIC_FEATURE"), + REPORTED_EVENT__PANEL("REPORTED_EVENT__PANEL"); private final String type; diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/Neo4JLoader.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/Neo4JLoader.java new file mode 100644 index 0000000..e4e3b83 --- /dev/null +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/Neo4JLoader.java @@ -0,0 +1,676 @@ +package org.opencb.bionetdb.core.neo4j; + +import org.apache.commons.collections.MapUtils; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Label; +import org.neo4j.graphdb.Node; +import org.neo4j.graphdb.RelationshipType; +import org.neo4j.logging.Log; +import org.opencb.biodata.models.clinical.interpretation.*; +import org.opencb.biodata.models.clinical.interpretation.GenomicFeature; +import org.opencb.biodata.models.commons.Disorder; +import org.opencb.biodata.models.commons.OntologyTerm; +import org.opencb.biodata.models.commons.Phenotype; +import org.opencb.biodata.models.commons.Software; +import 
org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.*; +import org.opencb.bionetdb.core.utils.NodeBuilder; +import org.opencb.commons.utils.CollectionUtils; +import org.opencb.commons.utils.ListUtils; +import org.opencb.opencga.core.models.*; +import org.opencb.opencga.core.models.Interpretation; +import org.parboiled.common.StringUtils; + +import java.util.List; +import java.util.Map; + +import static org.neo4j.graphdb.RelationshipType.withName; +import static org.opencb.bionetdb.core.models.network.Node.Type.*; +import static org.opencb.bionetdb.core.models.network.Node.Type.CONSERVATION; +import static org.opencb.bionetdb.core.models.network.Node.Type.FUNCTIONAL_SCORE; +import static org.opencb.bionetdb.core.models.network.Node.Type.GENE; +import static org.opencb.bionetdb.core.models.network.Node.Type.TRAIT_ASSOCIATION; +import static org.opencb.bionetdb.core.models.network.Relation.Type.*; +import static org.opencb.bionetdb.core.models.network.Relation.Type.PROTEIN; +import static org.opencb.bionetdb.core.models.network.Relation.Type.SO; +import static org.opencb.bionetdb.core.models.network.Relation.Type.TRANSCRIPT; + +public class Neo4JLoader { + + private GraphDatabaseService graphDb; + private Log log; + + public Neo4JLoader(GraphDatabaseService graphDb, Log log) { + this.graphDb = graphDb; + this.log = log; + } + + public Node loadClinicalAnalysis(ClinicalAnalysis clinicalAnalysis) { + Node caNode = graphDb.findNode(Label.label("CLINICAL_ANALYSIS"), "id", clinicalAnalysis.getId()); + if (caNode != null) { + //log.info("Clinical analysis ID " + clinicalAnalysis.getId() + " already loaded. 
Skip."); + return caNode; + } + + caNode = createNeo4JNode(NodeBuilder.newNode(0, clinicalAnalysis)); + + // Disorder + if (clinicalAnalysis.getDisorder() != null) { + // Disorder node and relation: clinical analysis - disorder + Node disorderNode = loadDisorder(clinicalAnalysis.getDisorder()); + caNode.createRelationshipTo(disorderNode, withName(CLINICAL_ANALYSIS__DISORDER.toString())); + } + + // Files + if (MapUtils.isNotEmpty(clinicalAnalysis.getFiles())) { + for (String key : clinicalAnalysis.getFiles().keySet()) { + List files = clinicalAnalysis.getFiles().get(key); + if (CollectionUtils.isNotEmpty(files)) { + for (File file : files) { + Node fileNode = loadFile(file); + caNode.createRelationshipTo(fileNode, withName(CLINICAL_ANALYSIS__FILE.toString())); + } + } + } + } + + // Family + if (clinicalAnalysis.getFamily() != null) { + // Family node and relation clinical analysis - family + Node familyNode = loadFamily(clinicalAnalysis.getFamily()); + caNode.createRelationshipTo(familyNode, withName(CLINICAL_ANALYSIS__FAMILY.toString())); + } + + // Proband + if (clinicalAnalysis.getProband() != null) { + // Proband node and relation clinical analysis - individual (proband) + Node probandNode = loadIndividual(clinicalAnalysis.getProband()); + caNode.createRelationshipTo(probandNode, withName(PROBAND___CLINICAL_ANALYSIS___INDIVIDUAL.toString())); + } + + + // Clinical Analyst + if (clinicalAnalysis.getAnalyst() != null) { + // Analyst node and relation: clinical analysis - analyst + Node analystNode = createNeo4JNode(NodeBuilder.newNode(0, clinicalAnalysis.getAnalyst())); + caNode.createRelationshipTo(analystNode, withName(CLINICAL_ANALYSIS__CLINICAL_ANALYST.toString())); + } + + // Comments + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getComments())) { + for (Comment comment : clinicalAnalysis.getComments()) { + // Comment node and relation: clinical analysis - comment + Node commentNode = createNeo4JNode(NodeBuilder.newNode(0, comment)); + 
caNode.createRelationshipTo(commentNode, withName(CLINICAL_ANALYSIS__COMMENT.toString())); + } + } + + // Alerts + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getAlerts())) { + for (Alert alert : clinicalAnalysis.getAlerts()) { + // Alert node and relation: clinical analysis - alert + Node alertNode = createNeo4JNode(NodeBuilder.newNode(0, alert)); + caNode.createRelationshipTo(alertNode, withName(CLINICAL_ANALYSIS__ALERT.toString())); + } + } + + // Interpretations + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getInterpretations())) { + for (Interpretation interpretation : clinicalAnalysis.getInterpretations()) { + // Interpretation node and relation: clinical analysis - interpretation + Node interpretationNode = loadInterpretation(interpretation); + caNode.createRelationshipTo(interpretationNode, withName(CLINICAL_ANALYSIS__INTERPRETATION.toString())); + } + } + return caNode; + } + + public Node loadInterpretation(Interpretation interpretation) { + Node interpretationNode = graphDb.findNode(Label.label(INTERPRETATION.name()), "id", interpretation.getId()); + if (interpretationNode != null) { + //log.info("Interpretation ID " + interpretationNode.getId() + " already loaded. 
Skip."); + return interpretationNode; + } + + interpretationNode = createNeo4JNode(NodeBuilder.newNode(0, interpretation)); + + // Software (dependencies) + if (interpretation.getSoftware() != null) { + // Software node and relation: interpretation - software + Node softwareNode = loadSoftware(interpretation.getSoftware()); + interpretationNode.createRelationshipTo(softwareNode, withName(INTERPRETATION__SOFTWARE.toString())); + } + + // Panels + if (CollectionUtils.isNotEmpty(interpretation.getPanels())) { + for (DiseasePanel panel : interpretation.getPanels()) { + // Panel node and relation: interpretation - panel + Node panelNode = loadPanel(panel); + interpretationNode.createRelationshipTo(panelNode, withName(INTERPRETATION__PANEL.toString())); + } + } + + // Primary findings + if (CollectionUtils.isNotEmpty(interpretation.getPrimaryFindings())) { + for (ReportedVariant primaryFinding : interpretation.getPrimaryFindings()) { + // Primary finding node and relation: interpretation - primary finding + Node findingNode = loadReportedVariant(primaryFinding); + interpretationNode.createRelationshipTo(findingNode, + withName(PRIMARY_FINDING___INTERPRETATION___REPORTED_VARIANT.toString())); + } + } + + // Secondary findings + if (CollectionUtils.isNotEmpty(interpretation.getSecondaryFindings())) { + for (ReportedVariant secondaryFinding : interpretation.getSecondaryFindings()) { + // Secondary node and relation: interpretation - secondary finding + Node findingNode = loadReportedVariant(secondaryFinding); + interpretationNode.createRelationshipTo(findingNode, + withName(SECONDARY_FINDING___INTERPRETATION___REPORTED_VARIANT.toString())); + } + } + + // Low coverage regions + if (CollectionUtils.isNotEmpty(interpretation.getLowCoverageRegions())) { + for (ReportedLowCoverage lowCoverageRegion : interpretation.getLowCoverageRegions()) { + // Low coverage region node and relation: interpretation - low coverage region + Node lowCoverageRegionNode = 
createNeo4JNode(NodeBuilder.newNode(0, lowCoverageRegion)); + interpretationNode.createRelationshipTo(lowCoverageRegionNode, withName(INTERPRETATION__LOW_COVERAGE_REGION.toString())); + } + } + + // Comments + if (CollectionUtils.isNotEmpty(interpretation.getComments())) { + for (Comment comment : interpretation.getComments()) { + // Comment node and delation: interpretation - comment + Node commentNode = createNeo4JNode(NodeBuilder.newNode(0, comment)); + interpretationNode.createRelationshipTo(commentNode, withName(INTERPRETATION__COMMENT.toString())); + } + } + + return interpretationNode; + } + + private Node loadSoftware(Software software) { + Node softwareNode = graphDb.findNode(Label.label(SOFTWARE.name()), "id", NodeBuilder.getSoftwareId(software)); + if (softwareNode != null) { + return softwareNode; + } + + return createNeo4JNode(NodeBuilder.newNode(0, software)); + } + + public Node loadReportedVariant(ReportedVariant reportedVariant) { + Node reportedVariantNode = createNeo4JNode(NodeBuilder.newNode(0, reportedVariant)); + + // Process variant and relation it to the reported variant + Node variantNode = loadVariant(reportedVariant); + reportedVariantNode.createRelationshipTo(variantNode, withName(REPORTED_VARIANT__VARIANT.toString())); + + //log.info("================> reported events for " + reportedVariant.toStringSimple() + " ? 
" + // + CollectionUtils.isNotEmpty(reportedVariant.getEvidences())); + if (CollectionUtils.isNotEmpty(reportedVariant.getEvidences())) { + //log.info(reportedVariant.getEvidences().size() + " reported event for reported variant " + reportedVariant.toStringSimple()); + for (ReportedEvent evidence : reportedVariant.getEvidences()) { + // Comment node and delation: interpretation - comment + Node reportedEventNode = loadReportedEvent(evidence); + reportedVariantNode.createRelationshipTo(reportedEventNode, withName(REPORTED_VARIANT__REPORTED_EVENT.toString())); + } + } + + // Comments + if (CollectionUtils.isNotEmpty(reportedVariant.getComments())) { + for (Comment comment : reportedVariant.getComments()) { + // Comment node and delation: interpretation - comment + Node commentNode = createNeo4JNode(NodeBuilder.newNode(0, comment)); + reportedVariantNode.createRelationshipTo(commentNode, withName(REPORTED_VARIANT__COMMENT.toString())); + } + } + + return reportedVariantNode; + } + + public Node loadReportedEvent(ReportedEvent reportedEvent) { + Node reportedEventNode = createNeo4JNode(NodeBuilder.newNode(0, reportedEvent)); + + // Phenotypes + if (CollectionUtils.isNotEmpty(reportedEvent.getPhenotypes())) { + for (Phenotype phenotype : reportedEvent.getPhenotypes()) { + // Phenotype node and relation reported event - phenotype + Node phenotypeNode = loadPhenotype(phenotype); + reportedEventNode.createRelationshipTo(phenotypeNode, withName(REPORTED_EVENT__PHENOTYPE.toString())); + } + } + + // Sequence ontology terms (SO) + if (CollectionUtils.isNotEmpty(reportedEvent.getConsequenceTypes())) { + for (SequenceOntologyTerm so : reportedEvent.getConsequenceTypes()) { + Node soNode = graphDb.findNode(Label.label(SO.name()), "id", so.getAccession()); + if (soNode == null) { + //log.info("SO " + so.getAccession() + ", " + so.getName() + " not found for reported event!"); + soNode = createNeo4JNode(new org.opencb.bionetdb.core.models.network.Node(0, so.getAccession(), + 
so.getName(), org.opencb.bionetdb.core.models.network.Node.Type.SO)); + } + reportedEventNode.createRelationshipTo(soNode, withName(REPORTED_EVENT__SO.toString())); + } + } + + // Genomic feature + if (reportedEvent.getGenomicFeature() != null) { + GenomicFeature genomicFeature = reportedEvent.getGenomicFeature(); + Node genomicFeatureNode = graphDb.findNode(Label.label(GENOMIC_FEATURE.name()), "id", genomicFeature.getId()); + if (genomicFeatureNode == null) { + //log.info("Genomic feature " + genomicFeature.getId() + " not found for reported event!"); + genomicFeatureNode = createNeo4JNode(NodeBuilder.newNode(0, genomicFeature)); + } + reportedEventNode.createRelationshipTo(genomicFeatureNode, withName(REPORTED_EVENT__GENOMIC_FEATURE.toString())); + } + + // Panel + if (StringUtils.isNotEmpty(reportedEvent.getPanelId())) { + Node panelNode = graphDb.findNode(Label.label(PANEL.name()), "id", reportedEvent.getPanelId()); + if (panelNode == null) { + //log.info("Panel " + reportedEvent.getPanelId() + " not found for reported event!"); + panelNode = createNeo4JNode(new org.opencb.bionetdb.core.models.network.Node(0, reportedEvent.getPanelId(), + "", PANEL)); + } + reportedEventNode.createRelationshipTo(panelNode, withName(REPORTED_EVENT__PANEL.toString())); + } + + return reportedEventNode; + } + + public Node loadVariant(Variant variant) { + Node variantNode = graphDb.findNode(Label.label("VARIANT"), "id", variant.toString()); + if (variantNode != null) { + //log.info("Variant ID " + variant.toString() + " already loaded. 
Skip."); + return variantNode; + } + + variantNode = createNeo4JNode(NodeBuilder.newNode(0, variant)); + + // Annotation management + if (variant.getAnnotation() != null) { + // Consequence types + if (ListUtils.isNotEmpty(variant.getAnnotation().getConsequenceTypes())) { + // Consequence type nodes + for (ConsequenceType ct : variant.getAnnotation().getConsequenceTypes()) { + // Consequence type node and relation variant - consequence type + Node ctNode = createNeo4JNode(NodeBuilder.newNode(0, ct)); + variantNode.createRelationshipTo(ctNode, withName(VARIANT__CONSEQUENCE_TYPE.toString())); + + // Transcript node and relation consequence type - transcript + if (ct.getEnsemblTranscriptId() != null) { + Node transcriptNode = graphDb.findNode(Label.label(TRANSCRIPT.name()), "id", ct.getEnsemblTranscriptId()); + if (transcriptNode != null) { + ctNode.createRelationshipTo(transcriptNode, withName(CONSEQUENCE_TYPE__TRANSCRIPT.toString())); + } else { + log.warn("Transcript " + ct.getEnsemblTranscriptId() + " not found for gene " + ct.getEnsemblGeneId() + ", " + + ct.getGeneName()); + } + } else { + log.warn("Transcript null for gene " + ct.getEnsemblGeneId() + ", " + ct.getGeneName()); + } + + // SO + if (ListUtils.isNotEmpty(ct.getSequenceOntologyTerms())) { + for (SequenceOntologyTerm so : ct.getSequenceOntologyTerms()) { + // SO node and relation consequence type - so + Node soNode = graphDb.findNode(Label.label(SO.toString()), "id", so.getAccession()); + if (soNode == null) { +// log.info("SO term accession " + so.getAccession() + " not found."); + soNode = createNeo4JNode(new org.opencb.bionetdb.core.models.network.Node(0, so.getAccession(), + so.getName(), org.opencb.bionetdb.core.models.network.Node.Type.SO)); + } + ctNode.createRelationshipTo(soNode, withName(CONSEQUENCE_TYPE__SO.toString())); + } + } + + // Protein variant annotation: substitution scores, keywords and features + if (ct.getProteinVariantAnnotation() != null) { + ProteinVariantAnnotation pVA = 
ct.getProteinVariantAnnotation(); + + // Protein variant annotation node and relation consequence type - protein variant annotation + Node pVANode = createNeo4JNode(NodeBuilder.newNode(0, pVA)); + ctNode.createRelationshipTo(pVANode, withName(CONSEQUENCE_TYPE__PROTEIN_VARIANT_ANNOTATION.toString())); + + // Protein relationship management + if (pVA.getUniprotAccession() != null) { + Node proteinNode = graphDb.findNode(Label.label(PROTEIN.name()), "id", pVA.getUniprotAccession()); + if (proteinNode != null) { + pVANode.createRelationshipTo(proteinNode, withName(PROTEIN_VARIANT_ANNOTATION__PROTEIN.toString())); + } else { + log.warn("Protein " + pVA.getUniprotAccession() + " node not found for protein variant annotation (" + + ct.getEnsemblGeneId() + ", " + ct.getGeneName() + ", " + ct.getEnsemblTranscriptId() + ")"); + } + } else { + log.warn("Protein Uniprot accession null for protein variant annotation (" + ct.getEnsemblGeneId() + ", " + + ct.getGeneName() + ", " + ct.getEnsemblTranscriptId() + ")"); + } + + // Protein substitution scores + if (ListUtils.isNotEmpty(ct.getProteinVariantAnnotation().getSubstitutionScores())) { + for (Score score: ct.getProteinVariantAnnotation().getSubstitutionScores()) { + Node scoreNode = createNeo4JNode(NodeBuilder.newNode(0, score, + org.opencb.bionetdb.core.models.network.Node.Type.SUBSTITUTION_SCORE)); + pVANode.createRelationshipTo(scoreNode, + withName(PROTEIN_VARIANT_ANNOTATION__SUBSTITUTION_SCORE.toString())); + } + } + } + } + } + + // Population frequencies + if (ListUtils.isNotEmpty(variant.getAnnotation().getPopulationFrequencies())) { + for (PopulationFrequency popFreq : variant.getAnnotation().getPopulationFrequencies()) { + // Population frequency node and relation: variant - population frequency + Node popFreqNode = createNeo4JNode(NodeBuilder.newNode(0, popFreq)); + variantNode.createRelationshipTo(popFreqNode, withName(VARIANT__POPULATION_FREQUENCY.toString())); + } + } + + // Conservation values + if 
(ListUtils.isNotEmpty(variant.getAnnotation().getConservation())) { + for (Score score: variant.getAnnotation().getConservation()) { + // Conservation node and relation: variant - conservation + Node conservatioNode = createNeo4JNode(NodeBuilder.newNode(0, score, CONSERVATION)); + variantNode.createRelationshipTo(conservatioNode, withName(VARIANT__CONSERVATION.toString())); + } + } + + // Trait associations + if (ListUtils.isNotEmpty(variant.getAnnotation().getTraitAssociation())) { + for (EvidenceEntry evidence: variant.getAnnotation().getTraitAssociation()) { + // Trait association node and relation: variant - trait association + Node traitNode = createNeo4JNode(NodeBuilder.newNode(0, evidence, TRAIT_ASSOCIATION)); + variantNode.createRelationshipTo(traitNode, withName(VARIANT__TRAIT_ASSOCIATION.toString())); + } + } + + // Functional scores + if (ListUtils.isNotEmpty(variant.getAnnotation().getFunctionalScore())) { + for (Score score: variant.getAnnotation().getFunctionalScore()) { + // Functional score node and relation: variant - functional score + Node functNode = createNeo4JNode(NodeBuilder.newNode(0, score, FUNCTIONAL_SCORE)); + variantNode.createRelationshipTo(functNode, withName(VARIANT__FUNCTIONAL_SCORE.toString())); + } + } + } + + return variantNode; + } + + public Node loadFile(File file) { + Node fileNode = graphDb.findNode(Label.label(FILE.name()), "id", file.getId()); + if (fileNode != null) { + return fileNode; + } + + fileNode = createNeo4JNode(NodeBuilder.newNode(0, file)); + + // Software + if (file.getSoftware() != null) { + Node softwareNode = loadSoftware(file.getSoftware()); + fileNode.createRelationshipTo(softwareNode, withName(FILE__SOFTWARE.toString())); + } + + // Experiment + if (file.getExperiment() != null) { + Node experimentNode = createNeo4JNode(NodeBuilder.newNode(0, file.getExperiment())); + fileNode.createRelationshipTo(experimentNode, withName(FILE__EXPERIMENT.toString())); + } + + // Samples + if 
(CollectionUtils.isNotEmpty(file.getSamples())) { + for (Sample sample : file.getSamples()) { + Node sampleNode = loadSample(sample); + fileNode.createRelationshipTo(sampleNode, withName(FILE__SAMPLE.toString())); + } + } + + return fileNode; + } + + public Node loadFamily(Family family) { + Node familyNode = graphDb.findNode(Label.label(FAMILY.name()), "id", family.getId()); + if (familyNode != null) { + return familyNode; + } + + //log.info("Family " + family.getId() + ", " + family.getName() + " not found. Create it!"); + familyNode = createNeo4JNode(NodeBuilder.newNode(0, family)); + + // Phenotypes + //log.info("Family: loading phenotypes..."); + if (CollectionUtils.isNotEmpty(family.getPhenotypes())) { + for (Phenotype phenotype : family.getPhenotypes()) { + Node phenotypeNode = loadPhenotype(phenotype); + familyNode.createRelationshipTo(phenotypeNode, withName(FAMILY__PHENOTYPE.toString())); + } + } + + // Disorders + //log.info("Family: loading disorders..."); + if (CollectionUtils.isNotEmpty(family.getDisorders())) { + for (Disorder disorder : family.getDisorders()) { + Node disorderNode = loadDisorder(disorder); + familyNode.createRelationshipTo(disorderNode, withName(FAMILY__DISORDER.toString())); + } + } + + // Members + //log.info("Family: loading members..."); + if (CollectionUtils.isNotEmpty(family.getMembers())) { + for (Individual member : family.getMembers()) { + Node memberNode = loadIndividual(member); + familyNode.createRelationshipTo(memberNode, withName(FAMILY__INDIVIDUAL.toString())); + } + } + + return familyNode; + } + + public Node loadIndividual(Individual individual) { + Node individualNode = graphDb.findNode(Label.label(INDIVIDUAL.name()), "id", individual.getId()); + + if (individualNode != null) { + return individualNode; + } + + //log.info("Individual " + individual.getId() + ", " + individual.getName() + " not found. 
Create it!"); + individualNode = createNeo4JNode(NodeBuilder.newNode(0, individual)); + + // Father + if (individual.getFather() != null) { + Node fatherNode = loadIndividual(individual.getFather()); + fatherNode.createRelationshipTo(individualNode, withName(FATHER_OF___INDIVIDUAL___INDIVIDUAL.toString())); + } + + // Mother + if (individual.getMother() != null) { + Node motherNode = loadIndividual(individual.getMother()); + motherNode.createRelationshipTo(individualNode, withName(MOTHER_OF___INDIVIDUAL___INDIVIDUAL.toString())); + } + + // Phenotypes + if (CollectionUtils.isNotEmpty(individual.getPhenotypes())) { + for (Phenotype phenotype : individual.getPhenotypes()) { + Node phenotypeNode = loadPhenotype(phenotype); + individualNode.createRelationshipTo(phenotypeNode, withName(INDIVIDUAL__PHENOTYPE.toString())); + } + } + + // Disorders + if (CollectionUtils.isNotEmpty(individual.getDisorders())) { + for (Disorder disorder : individual.getDisorders()) { + Node disorderNode = loadDisorder(disorder); + individualNode.createRelationshipTo(disorderNode, withName(INDIVIDUAL__DISORDER.toString())); + } + } + + // Samples + if (CollectionUtils.isNotEmpty(individual.getSamples())) { + for (Sample sample : individual.getSamples()) { + Node sampleNode = loadSample(sample); + individualNode.createRelationshipTo(sampleNode, withName(INDIVIDUAL__SAMPLE.toString())); + } + } + return individualNode; + } + + public Node loadDisorder(Disorder disorder) { + // Disorder node + Node disorderNode = createNeo4JNode(NodeBuilder.newNode(0, disorder)); + if (CollectionUtils.isNotEmpty(disorder.getEvidences())) { + for (Phenotype phenotype : disorder.getEvidences()) { + // Phenotype node and relation: disorder - phenotype + Node phenotypeNode = loadPhenotype(phenotype); + disorderNode.createRelationshipTo(phenotypeNode, withName(DISORDER__PHENOTYPE.toString())); + } + } + return disorderNode; + } + + public Node loadPhenotype(Phenotype phenotype) { + //log.info("Loading Phenotype, id = 
" + phenotype.getId() + ", name = " + phenotype.getName()); + Node phenotypeNode = createNeo4JNode(NodeBuilder.newNode(0, phenotype)); + + Node ontologyTermNode = loadOntologyTerm(phenotype); + phenotypeNode.createRelationshipTo(ontologyTermNode, withName(PHENOTYPE__ONTOLOGY_TERM.name())); + //log.info("Done. Phenotype, id = " + phenotype.getId() + ", name = " + phenotype.getName()); + return phenotypeNode; + } + + public Node loadOntologyTerm(OntologyTerm ontologyTerm) { + //log.info("Loading OntologyTerm, id = " + ontologyTerm.getId() + ", name = " + ontologyTerm.getName()); + Node ontologyTermNode = graphDb.findNode(Label.label(ONTOLOGY_TERM.name()), "id", ontologyTerm.getId()); + if (ontologyTermNode == null) { + ontologyTermNode = createNeo4JNode(NodeBuilder.newNode(0, ontologyTerm)); + } + //log.info("Done. OntologyTerm, id = " + ontologyTerm.getId() + ", name = " + ontologyTerm.getName()); + return ontologyTermNode; + } + + public Node loadSample(Sample sample) { + Node sampleNode = graphDb.findNode(Label.label(SAMPLE.name()), "id", sample.getId()); + if (sampleNode != null) { + return sampleNode; + } + + //log.info("Sample " + sample.getId() + ", " + sample.getName() + " not found. Create it!"); + sampleNode = createNeo4JNode(NodeBuilder.newNode(0, sample)); + + if (CollectionUtils.isNotEmpty(sample.getPhenotypes())) { + for (Phenotype phenotype : sample.getPhenotypes()) { + // Phenotype node and relation sample - phenotype + Node phenotypeNode = loadPhenotype(phenotype); + sampleNode.createRelationshipTo(phenotypeNode, withName(SAMPLE__PHENOTYPE.name())); + } + } + return sampleNode; + } + + public Node loadPanel(DiseasePanel panel) { + Node panelNode = graphDb.findNode(Label.label(PANEL.name()), "id", panel.getId()); + if (panelNode != null) { + return panelNode; + } + + //log.info("Panel " + panel.getId() + ", " + panel.getName() + " not found. 
Create it!"); + panelNode = createNeo4JNode(NodeBuilder.newNode(0, panel)); + // Phenotypes + if (CollectionUtils.isNotEmpty(panel.getPhenotypes())) { + for (Phenotype phenotype : panel.getPhenotypes()) { + // Phenotype node and relation panel - phenotype + Node phenotypeNode = loadPhenotype(phenotype); + panelNode.createRelationshipTo(phenotypeNode, withName(PANEL__PHENOTYPE.name())); + } + } + // Panel variants (DiseasePanel.VariantPanel) + if (CollectionUtils.isNotEmpty(panel.getVariants())) { + for (DiseasePanel.VariantPanel panelVariant : panel.getVariants()) { + Node panelVariantNode = createNeo4JNode(NodeBuilder.newNode(0, panelVariant)); + panelNode.createRelationshipTo(panelVariantNode, withName(PANEL__PANEL_VARIANT.name())); + addOntologyTerms(panelVariant.getPhenotypes(), panelVariantNode, withName(PANEL_VARIANT__ONTOLOGY_TERM.name())); + + Node variantNode = graphDb.findNode(Label.label(VARIANT.name()), "id", panelVariant.getId()); + if (variantNode != null) { + panelVariantNode.createRelationshipTo(variantNode, withName(PANEL_VARIANT__VARIANT.name())); + } else { + log.warn("Variant not found for panel variant " + panelVariant.getId()); + } + } + } + // Panel genes (DiseasePanel.GenePanel) + if (CollectionUtils.isNotEmpty(panel.getGenes())) { + for (DiseasePanel.GenePanel panelGene : panel.getGenes()) { + Node panelGeneNode = createNeo4JNode(NodeBuilder.newNode(0, panelGene)); + panelNode.createRelationshipTo(panelGeneNode, withName(PANEL__PANEL_GENE.name())); + addOntologyTerms(panelGene.getPhenotypes(), panelGeneNode, withName(PANEL_GENE__ONTOLOGY_TERM.name())); + + Node geneNode = graphDb.findNode(Label.label(GENE.name()), "id", panelGene.getId()); + if (geneNode != null) { + panelGeneNode.createRelationshipTo(geneNode, withName(PANEL_GENE__GENE.name())); + } else { + log.warn("Gene not found for panel gene " + panelGene.getId() + ", " + panelGene.getName()); + } + } + } + // STRs (DiseasePanel.STR) + if (CollectionUtils.isNotEmpty(panel.getStrs())) 
{ + for (DiseasePanel.STR panelStr : panel.getStrs()) { + Node panelStrNode = createNeo4JNode(NodeBuilder.newNode(0, panelStr)); + panelNode.createRelationshipTo(panelStrNode, withName(PANEL__PANEL_STR.name())); + addOntologyTerms(panelStr.getPhenotypes(), panelStrNode, withName(PANEL_STR__ONTOLOGY_TERM.name())); + } + } + // Panel regions (DiseasePanel.RegionPanel) + if (CollectionUtils.isNotEmpty(panel.getRegions())) { + for (DiseasePanel.RegionPanel panelRegion : panel.getRegions()) { + Node panelRegionNode = createNeo4JNode(NodeBuilder.newNode(0, panelRegion)); + panelNode.createRelationshipTo(panelRegionNode, withName(PANEL__PANEL_REGION.name())); + addOntologyTerms(panelRegion.getPhenotypes(), panelRegionNode, withName(PANEL_STR__ONTOLOGY_TERM.name())); + } + } + return panelNode; + } + + //------------------------------------------------------------------------- + // P R I V A T E M E T H O D S + //------------------------------------------------------------------------- + + private void addOntologyTerms(List ontologyTerms, Node node, RelationshipType relation) { + if (CollectionUtils.isNotEmpty(ontologyTerms)) { + for (OntologyTerm ontologyTerm : ontologyTerms) { + Node ontologyTermNode = loadOntologyTerm(ontologyTerm); + node.createRelationshipTo(ontologyTermNode, relation); + } + } + } + + private Node createNeo4JNode(org.opencb.bionetdb.core.models.network.Node node) { + Node neo4jNode = graphDb.createNode(Label.label(node.getType().toString())); + + neo4jNode.setProperty("uid", neo4jNode.getId()); + + if (StringUtils.isNotEmpty(node.getId())) { + neo4jNode.setProperty("id", node.getId()); + } + + if (StringUtils.isNotEmpty(node.getName())) { + neo4jNode.setProperty("name", node.getName()); + } + + if (StringUtils.isNotEmpty(node.getSource())) { + neo4jNode.setProperty("source", node.getSource()); + } + + if (MapUtils.isNotEmpty(node.getAttributes())) { + for (Map.Entry entry : node.getAttributes().entrySet()) { + neo4jNode.setProperty(entry.getKey(), 
entry.getValue()); + } + } + return neo4jNode; + } +} diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/UserDefinedProcedure.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/UserDefinedProcedure.java new file mode 100644 index 0000000..65fef10 --- /dev/null +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/neo4j/UserDefinedProcedure.java @@ -0,0 +1,64 @@ +package org.opencb.bionetdb.core.neo4j; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.graphdb.Transaction; +import org.neo4j.logging.Log; +import org.neo4j.procedure.Context; +import org.neo4j.procedure.Name; +import org.neo4j.procedure.Procedure; +import org.opencb.opencga.core.common.JacksonUtils; +import org.opencb.opencga.core.models.ClinicalAnalysis; + +import java.io.IOException; + +import static org.neo4j.procedure.Mode.SCHEMA; + +public class UserDefinedProcedure { + + // Procedure classes. This static field is the configuration we use + // to create full-text indexes. + + // This field declares that we need a GraphDatabaseService + // as context when any procedure in this class is invoked + @Context + public GraphDatabaseService db; + + // This gives us a log instance that outputs messages to the + // standard log, `neo4j.log` + @Context + public Log log; + + /** + * This is the second procedure defined in this class, it is used to update the + * index with nodes that should be queryable. You can send the same node multiple + * times, if it already exists in the index the index will be updated to match + * the current state of the node. + * + * + * Two, it returns {@code void} rather than a stream. This is a short-hand + * for saying our procedure always returns an empty stream of empty records. + * + * Three, it uses a default value for the property list, in this way you can call + * the procedure by invoking {@code CALL index(nodeId)}. 
Default values are + * are provided as the Cypher string representation of the given type, e.g. + * {@code {default: true}}, {@code null}, or {@code -1}. + * + * @param caJson JSON string for the clinical analysis to load + */ + @Procedure(name = "org.opencb.bionetdb.core.neo4j.loadClinicalAnalysis", mode = SCHEMA) + public void loadClinicalAnalysis(@Name("caJson") String caJson) { + Neo4JLoader neo4JLoader = new Neo4JLoader(db, log); + + try (Transaction tx = db.beginTx()) { + ObjectMapper defaultObjectMapper = JacksonUtils.getDefaultObjectMapper(); + ClinicalAnalysis ca = defaultObjectMapper.readValue(caJson, ClinicalAnalysis.class); + neo4JLoader.loadClinicalAnalysis(ca); + + tx.success(); + } catch (IOException e) { + e.printStackTrace(); + } + log.info("Done!"); + } +} diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/CsvInfo.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/CsvInfo.java index 3eeef33..b7f5db9 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/CsvInfo.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/CsvInfo.java @@ -14,11 +14,12 @@ import org.opencb.biodata.models.variant.metadata.VariantFileMetadata; import org.opencb.biodata.models.variant.metadata.VariantMetadata; import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; -import org.opencb.bionetdb.core.neo4j.Neo4JNetworkDBAdaptor; import org.opencb.bionetdb.core.models.network.Node; import org.opencb.bionetdb.core.models.network.Relation; +import org.opencb.bionetdb.core.neo4j.Neo4JNetworkDBAdaptor; import org.opencb.bionetdb.core.utils.cache.GeneCache; import org.opencb.bionetdb.core.utils.cache.ProteinCache; +import org.opencb.commons.utils.CollectionUtils; import org.opencb.commons.utils.FileUtils; import org.opencb.commons.utils.ListUtils; import org.rocksdb.RocksDB; @@ -160,7 +161,6 @@ public CsvInfo(Path inputPath, Path outputPath) { infoFields = new HashSet<>(); formatFields = new 
HashSet<>(); - csvWriters = new HashMap<>(); csvAnnotatedWriters = new HashMap<>(); nodeAttributes = createNodeAttributes(); @@ -169,8 +169,8 @@ public CsvInfo(Path inputPath, Path outputPath) { rocksDbManager = new RocksDbManager(); uidRocksDb = this.rocksDbManager.getDBConnection(outputPath.toString() + "/uidRocksDB", true); - geneCache = new GeneCache(); - proteinCache = new ProteinCache(); + geneCache = new GeneCache(outputPath); + proteinCache = new ProteinCache(outputPath); mapper = new ObjectMapper(); mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); @@ -288,6 +288,7 @@ public void openMetadataFile(File metafile) throws IOException { } } + // Variant call String strType; List attrs = new ArrayList<>(); @@ -412,7 +413,7 @@ public void putString(String key, String value) { rocksDbManager.putString(key, value, uidRocksDb); } - private String getNodeHeaderLine(List attrs) { + protected String getNodeHeaderLine(List attrs) { StringBuilder sb = new StringBuilder(); sb.append("uid:ID(").append(attrs.get(0)).append(")"); for (int i = 1; i < attrs.size(); i++) { @@ -425,11 +426,32 @@ private String getNodeHeaderLine(List attrs) { return sb.toString(); } - private String getRelationHeaderLine(String type) { + protected String getRelationHeaderLine(String type) { StringBuilder sb = new StringBuilder(); - String[] split = type.split("__"); - sb.append(":START_ID(").append(nodeAttributes.get(split[0]).get(0)).append(")").append(SEPARATOR).append(":END_ID(") - .append(nodeAttributes.get(split[1]).get(0)).append(")"); + + String source; + String dest; + if (type.contains("___")) { + String[] split = type.split("___"); + source = split[1]; + dest = split[2]; + } else { + String[] split = type.split("__"); + source = split[0]; + dest = split[1]; + } + + if (CollectionUtils.isNotEmpty(nodeAttributes.get(source)) && CollectionUtils.isNotEmpty(nodeAttributes.get(dest))) { + 
sb.append(":START_ID(").append(nodeAttributes.get(source).get(0)).append(")").append(SEPARATOR).append(":END_ID(") + .append(nodeAttributes.get(dest).get(0)).append(")"); + } else { + if (CollectionUtils.isEmpty(nodeAttributes.get(source))) { + logger.info("Attributes empty for " + source + ", from getRelationHeaderLine: " + type); + } + if (CollectionUtils.isEmpty(nodeAttributes.get(dest))) { + logger.info("Attributes empty for " + dest + ", from getRelationHeaderLine: " + type); + } + } return sb.toString(); } @@ -483,13 +505,74 @@ private String cleanString(String input) { } } - public void indexingGenes(Path inputPath, Path indexPath) throws IOException { - geneCache.index(inputPath, indexPath); - } - - public void indexingProteins(Path inputPath, Path indexPath) throws IOException { - proteinCache.index(inputPath, indexPath); - } +// public void indexingGenes(Path inputPath) throws IOException { +//// geneCache.index(inputPath, indexPath); +// +//// String objFilename = output.toString() + "/genes.rocksdb"; +//// String xrefObjFilename = output.toString() + "/xref.genes.rocksdb"; +// +//// if (Paths.get(objFilename).toFile().exists() +//// && Paths.get(xrefObjFilename).toFile().exists()) { +//// objRocksDb = rocksDbManager.getDBConnection(objFilename, true); +//// xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); +//// logger.info("\tGene index already created!"); +//// return; +//// } +// +//// // Delete protein RocksDB files +//// Paths.get(objFilename).toFile().delete(); +//// Paths.get(xrefObjFilename).toFile().delete(); +//// +//// // Create gene RocksDB files (protein and xrefs) +//// RocksDB objRocksDb = rocksDbManager.getDBConnection(objFilename, true); +//// RocksDB xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); +// +// BufferedReader reader = org.opencb.commons.utils.FileUtils.newBufferedReader(inputPath); +// String jsonGene = reader.readLine(); +// long geneCounter = 0; +// while (jsonGene != null) { 
+// Gene gene = geneCache.getObjReader().readValue(jsonGene); +// String geneId = gene.getId(); +// if (org.apache.commons.lang3.StringUtils.isNotEmpty(geneId)) { +// geneCounter++; +// if (geneCounter % 5000 == 0) { +// logger.info("Indexing {} genes...", geneCounter); +// } +// // Save gene +// rocksDbManager.putString(geneId, jsonGene, geneCache.getObjRocksDb()); +// +// // Save xrefs for that gene +// rocksDbManager.putString(geneId, geneId, geneCache.getXrefObjRocksDb()); +// if (org.apache.commons.lang3.StringUtils.isNotEmpty(gene.getName())) { +// rocksDbManager.putString(gene.getName(), geneId, geneCache.getXrefObjRocksDb()); +// } +// +// if (ListUtils.isNotEmpty(gene.getTranscripts())) { +// for (Transcript transcr : gene.getTranscripts()) { +// if (ListUtils.isNotEmpty(transcr.getXrefs())) { +// for (Xref xref: transcr.getXrefs()) { +// if (org.apache.commons.lang3.StringUtils.isNotEmpty(xref.getId())) { +// rocksDbManager.putString(xref.getId(), geneId, geneCache.getXrefObjRocksDb()); +// } +// } +// } +// } +// } +// } else { +// logger.info("Skipping indexing gene: missing gene ID from JSON file"); +// } +// +// // Next line +// jsonGene = reader.readLine(); +// } +// logger.info("Indexing {} genes. 
Done.", geneCounter); +// +// reader.close(); +// } + +// public void indexingProteins(Path inputPath, Path indexPath) throws IOException { +// proteinCache.index(inputPath, indexPath); +// } public void indexingMiRnas(Path miRnaPath, Path indexPath) throws IOException { RocksDbManager rocksDbManager = new RocksDbManager(); @@ -719,8 +802,53 @@ private Map> createNodeAttributes() { attrs = Arrays.asList("regulationId", "id", "name"); nodeAttributes.put(Node.Type.REGULATION.toString(), new ArrayList<>(attrs)); + // Clinical Analysis + attrs = Arrays.asList("clinicalAnalysisId", "id", "name", "uuid", "description", "type", "priority", "flags", "creationDate", + "modificationDate", "dueDate", "statusName", "statusDate", "statusMessage", "consentPrimaryFindings", + "consentSecondaryFindings", "consentCarrierFindings", "consentResearchFindings", "release"); + nodeAttributes.put(Node.Type.CLINICAL_ANALYSIS.toString(), new ArrayList<>(attrs)); + + // Clinical analyst + attrs = Arrays.asList("clinicalAnalystId", "id", "name", "assignedBy", "assignee", "date"); + nodeAttributes.put(Node.Type.CLINICAL_ANALYST.toString(), new ArrayList<>(attrs)); + + // Comment + attrs = Arrays.asList("commentId", "id", "name", "author", "type", "text", "date"); + nodeAttributes.put(Node.Type.COMMENT.toString(), new ArrayList<>(attrs)); + + // Interpretation + attrs = Arrays.asList("interpretationId", "id", "name", "uuid", "description", "status", "creationDate", "version"); + nodeAttributes.put(Node.Type.INTERPRETATION.toString(), new ArrayList<>(attrs)); + + // Software + attrs = Arrays.asList("softwareId", "id", "name", "version", "repository", "commit", "website", "params"); + nodeAttributes.put(Node.Type.SOFTWARE.toString(), new ArrayList<>(attrs)); + + // Reported variant + attrs = Arrays.asList("reportedVariantId", "id", "name", "deNovoQualityScore", "status", "attributes"); + nodeAttributes.put(Node.Type.REPORTED_VARIANT.toString(), new ArrayList<>(attrs)); + + // Low coverage + attrs 
= Arrays.asList("lowCoverageId", "id", "name", "geneName", "chromosome", "start", "end", "meanCoverage", "type"); + nodeAttributes.put(Node.Type.LOW_COVERAGE_REGION.toString(), new ArrayList<>(attrs)); + + // Analyst + attrs = Arrays.asList("analystId", "id", "name", "company", "email"); + nodeAttributes.put(Node.Type.ANALYST.toString(), new ArrayList<>(attrs)); + + // Reported event + attrs = Arrays.asList("reportedVariantId", "id", "name", "modeOfInheritance", "penetrance", "score", "fullyExplainPhenotypes", + "roleInCancer", "actionable", "justification", "tier"); + nodeAttributes.put(Node.Type.REPORTED_EVENT.toString(), new ArrayList<>(attrs)); + + // Variant classification + attrs = Arrays.asList("variantClassificationId", "id", "name", "acmg", "clinicalSignificance", "drugResponse", "traitAssociation", + "functionalEffect", "tumorigenesis"); + nodeAttributes.put(Node.Type.VARIANT_CLASSIFICATION.toString(), new ArrayList<>(attrs)); + return nodeAttributes; } + private Set createNoAttributes() { Set noAttributes = new HashSet<>(); noAttributes.add("id"); @@ -729,7 +857,6 @@ private Set createNoAttributes() { return noAttributes; } - public long getUid() { return uid; } diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/Neo4jCsvImporter.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/Neo4jCsvImporter.java index 8f7e0f4..00e5c71 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/Neo4jCsvImporter.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/Neo4jCsvImporter.java @@ -4,12 +4,12 @@ import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import org.apache.commons.collections.MapUtils; import org.apache.commons.lang.StringUtils; -import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.DbReferenceType; -import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.Entry; -import 
org.opencb.biodata.formats.protein.uniprot.v201504jaxb.FeatureType; -import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.KeywordType; +import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.*; +import org.opencb.biodata.models.clinical.interpretation.Comment; import org.opencb.biodata.models.clinical.interpretation.DiseasePanel; +import org.opencb.biodata.models.clinical.interpretation.ReportedVariant; import org.opencb.biodata.models.core.Gene; import org.opencb.biodata.models.core.Transcript; import org.opencb.biodata.models.core.TranscriptTfbs; @@ -19,8 +19,14 @@ import org.opencb.biodata.models.variant.avro.*; import org.opencb.bionetdb.core.models.network.Node; import org.opencb.bionetdb.core.models.network.Relation; +import org.opencb.bionetdb.core.utils.cache.GeneCache; +import org.opencb.bionetdb.core.utils.cache.ProteinCache; +import org.opencb.commons.utils.CollectionUtils; import org.opencb.commons.utils.FileUtils; import org.opencb.commons.utils.ListUtils; +import org.opencb.opencga.core.common.JacksonUtils; +import org.opencb.opencga.core.models.ClinicalAnalysis; +import org.opencb.opencga.core.models.Interpretation; import org.rocksdb.RocksIterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,7 +69,7 @@ public void addVariantFiles(List files) throws IOException { if (file.getName().endsWith("json") || file.getName().endsWith("json.gz")) { if (file.getName().contains("clinvar")) { // JSON file - addJSONFile(file); + addVariantJsonFile(file); } } } @@ -77,7 +83,7 @@ public void addVariantFiles(List files) throws IOException { } else if (file.getName().endsWith("json") || file.getName().endsWith("json.gz")) { if (!file.getName().contains("clinvar") && !file.getName().contains("meta.json")) { // JSON file - addJSONFile(file); + addVariantJsonFile(file); } } else { logger.info("Unknown file type, skipping!!"); @@ -85,8 +91,34 @@ public void addVariantFiles(List files) throws IOException { } } + public void 
addClinicalAnalysisFiles(List files) throws IOException { + for (int i = 0; i < files.size(); i++) { + File file = files.get(i); + if (file.getName().endsWith("json") || file.getName().endsWith("json.gz")) { + if (file.getName().contains("clinvar")) { + // JSON file + addVariantJsonFile(file); + } + } + } + + + for (int i = 0; i < files.size(); i++) { + File file = files.get(i); + if (file.getName().endsWith("json") || file.getName().endsWith("json.gz")) { + if (!file.getName().contains("clinvar") && !file.getName().contains("meta.json")) { + // JSON file + addClinicalAnalysisJsonFile(file); + } + } else { + logger.info("Unknown file type, skipping (only JSON format are accepted for Clinical Analysis data!!"); + } + } + } + public Long processGene(String geneId, String geneName) { if (StringUtils.isEmpty(geneId)) { + logger.info("Skip processing gene, (id, name) = (" + geneId + ", " + geneName + ")"); return null; } Long geneUid = csv.getGeneUid(geneId); @@ -157,6 +189,8 @@ public Node createGeneNode(Gene gene) { public Node createGeneNode(Gene gene, Long uid) { PrintWriter pwRel; +// logger.info("----> creating gene node: " + gene.getId() + ", " + gene.getName()); + // Create gene node and save gene UID Node geneNode = NodeBuilder.newNode(uid, gene); @@ -240,10 +274,12 @@ public Node createGeneNode(Gene gene, Long uid) { } // Xrefs + PrintWriter pwXref = csv.getCsvWriters().get(Node.Type.XREF.toString()); + pwRel = csv.getCsvWriters().get(CsvInfo.BioPAXRelation.XREF___GENE___XREF.toString()); + Set xrefSet = new HashSet<>(); + xrefSet.add(new Xref(gene.getId(), "Ensembl", "Ensembl")); + xrefSet.add(new Xref(gene.getName(), "Ensembl", "Ensembl")); if (ListUtils.isNotEmpty(gene.getTranscripts())) { - PrintWriter pwXref = csv.getCsvWriters().get(Node.Type.XREF.toString()); - pwRel = csv.getCsvWriters().get(CsvInfo.BioPAXRelation.XREF___GENE___XREF.toString()); - Set xrefSet = new HashSet<>(); for (Transcript transcript : gene.getTranscripts()) { if 
(ListUtils.isNotEmpty(transcript.getXrefs())) { for (Xref xref : transcript.getXrefs()) { @@ -251,18 +287,18 @@ public Node createGeneNode(Gene gene, Long uid) { } } } - Iterator it = xrefSet.iterator(); - while (it.hasNext()) { - Xref xref = it.next(); - Long xrefUid = csv.getLong(xref.getDbName() + "." + xref.getId()); - if (xrefUid == null) { - n = NodeBuilder.newNode(csv.getAndIncUid(), xref); - pwXref.println(csv.nodeLine(n)); - xrefUid = n.getUid(); - csv.putLong(xref.getDbName() + "." + xref.getId(), xrefUid); - } - pwRel.println(csv.relationLine(uid, xrefUid)); + } + Iterator it = xrefSet.iterator(); + while (it.hasNext()) { + Xref xref = it.next(); + Long xrefUid = csv.getLong(xref.getDbName() + "." + xref.getId()); + if (xrefUid == null) { + n = NodeBuilder.newNode(csv.getAndIncUid(), xref); + pwXref.println(csv.nodeLine(n)); + xrefUid = n.getUid(); + csv.putLong(xref.getDbName() + "." + xref.getId(), xrefUid); } + pwRel.println(csv.relationLine(uid, xrefUid)); } return geneNode; @@ -339,9 +375,9 @@ public Node createProteinNode(Entry protein, Long uid) { } // Model Xrefs - PrintWriter pwXref = csv.getCsvWriters().get(Node.Type.XREF.toString()); - pw = csv.getCsvWriters().get(CsvInfo.BioPAXRelation.XREF___PROTEIN___XREF.toString()); if (ListUtils.isNotEmpty(protein.getDbReference())) { + PrintWriter pwXref = csv.getCsvWriters().get(Node.Type.XREF.toString()); + pw = csv.getCsvWriters().get(CsvInfo.BioPAXRelation.XREF___PROTEIN___XREF.toString()); for (DbReferenceType dbRef: protein.getDbReference()) { Long xrefUid = csv.getLong(dbRef.getType() + "." 
+ dbRef.getId()); if (xrefUid == null) { @@ -411,11 +447,12 @@ public Node createTranscriptNode(Transcript transcript, Long uid) { public Long processVariant(Variant variant) throws IOException { Node variantNode = null; - Long variantUid = csv.getLong(variant.toString()); + + Long variantUid = csv.getLong(variant.toStringSimple()); if (variantUid == null) { variantNode = createVariantNode(variant); variantUid = variantNode.getUid(); - csv.putLong(variant.toString(), variantUid); + csv.putLong(variant.toStringSimple(), variantUid); } // Process sample info @@ -434,9 +471,9 @@ public Node createVariantNode(Variant variant) { } public Node createVariantNode(Variant variant, Long varUid) { - Node n, node = NodeBuilder.newNode(varUid, variant); + Node varNode = NodeBuilder.newNode(varUid, variant); PrintWriter pw = csv.getCsvWriters().get(Node.Type.VARIANT.toString()); - pw.println(csv.nodeLine(node)); + pw.println(csv.nodeLine(varNode)); // Annotation management if (variant.getAnnotation() != null) { @@ -483,9 +520,9 @@ public Node createVariantNode(Variant variant, Long varUid) { if (soId != null) { Long soUid = csv.getLong(soId); if (soUid == null) { - n = new Node(csv.getAndIncUid(), so.getAccession(), so.getName(), Node.Type.SO); - updateCSVFiles(ctNode.getUid(), n, Relation.Type.CONSEQUENCE_TYPE__SO.toString()); - csv.putLong(soId, n.getUid()); + Node soNode = new Node(csv.getAndIncUid(), so.getAccession(), so.getName(), Node.Type.SO); + updateCSVFiles(ctNode.getUid(), soNode, Relation.Type.CONSEQUENCE_TYPE__SO.toString()); + csv.putLong(soId, soNode.getUid()); } else { // Relation: consequence type - so pw = csv.getCsvWriters().get(Relation.Type.CONSEQUENCE_TYPE__SO.toString()); @@ -514,8 +551,8 @@ public Node createVariantNode(Variant variant, Long varUid) { // Protein substitution scores if (ListUtils.isNotEmpty(ct.getProteinVariantAnnotation().getSubstitutionScores())) { for (Score score: ct.getProteinVariantAnnotation().getSubstitutionScores()) { - node = 
NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.SUBSTITUTION_SCORE); - updateCSVFiles(pVANode.getUid(), node, + Node scoreNode = NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.SUBSTITUTION_SCORE); + updateCSVFiles(pVANode.getUid(), scoreNode, Relation.Type.PROTEIN_VARIANT_ANNOTATION__SUBSTITUTION_SCORE.toString()); } } @@ -523,12 +560,14 @@ public Node createVariantNode(Variant variant, Long varUid) { } } + Node node; + // Population frequencies if (ListUtils.isNotEmpty(variant.getAnnotation().getPopulationFrequencies())) { for (PopulationFrequency popFreq : variant.getAnnotation().getPopulationFrequencies()) { // Population frequency node - n = NodeBuilder.newNode(csv.getAndIncUid(), popFreq); - updateCSVFiles(varUid, n, Relation.Type.VARIANT__POPULATION_FREQUENCY.toString()); + node = NodeBuilder.newNode(csv.getAndIncUid(), popFreq); + updateCSVFiles(varUid, node, Relation.Type.VARIANT__POPULATION_FREQUENCY.toString()); } } @@ -536,8 +575,8 @@ public Node createVariantNode(Variant variant, Long varUid) { if (ListUtils.isNotEmpty(variant.getAnnotation().getConservation())) { for (Score score: variant.getAnnotation().getConservation()) { // Conservation node - n = NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.CONSERVATION); - updateCSVFiles(varUid, n, Relation.Type.VARIANT__CONSERVATION.toString()); + node = NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.CONSERVATION); + updateCSVFiles(varUid, node, Relation.Type.VARIANT__CONSERVATION.toString()); } } @@ -545,8 +584,8 @@ public Node createVariantNode(Variant variant, Long varUid) { if (ListUtils.isNotEmpty(variant.getAnnotation().getTraitAssociation())) { for (EvidenceEntry evidence: variant.getAnnotation().getTraitAssociation()) { // Trait association node - n = NodeBuilder.newNode(csv.getAndIncUid(), evidence, Node.Type.TRAIT_ASSOCIATION); - updateCSVFiles(varUid, n, Relation.Type.VARIANT__TRAIT_ASSOCIATION.toString()); + node = NodeBuilder.newNode(csv.getAndIncUid(), 
evidence, Node.Type.TRAIT_ASSOCIATION); + updateCSVFiles(varUid, node, Relation.Type.VARIANT__TRAIT_ASSOCIATION.toString()); } } @@ -554,21 +593,140 @@ public Node createVariantNode(Variant variant, Long varUid) { if (ListUtils.isNotEmpty(variant.getAnnotation().getFunctionalScore())) { for (Score score: variant.getAnnotation().getFunctionalScore()) { // Functional score node - n = NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.FUNCTIONAL_SCORE); - updateCSVFiles(varUid, n, Relation.Type.VARIANT__FUNCTIONAL_SCORE.toString()); + node = NodeBuilder.newNode(csv.getAndIncUid(), score, Node.Type.FUNCTIONAL_SCORE); + updateCSVFiles(varUid, node, Relation.Type.VARIANT__FUNCTIONAL_SCORE.toString()); } } } - return node; + return varNode; + } + + public Long processClinicalAnalysis(ClinicalAnalysis clinicalAnalysis) throws IOException { + Node clinicalAnalysisNode = null; + Long clinicalAnalysisUid = csv.getLong(clinicalAnalysis.getId()); + if (clinicalAnalysisUid == null) { + clinicalAnalysisNode = createClinicalAnalysisNode(clinicalAnalysis); + clinicalAnalysisUid = clinicalAnalysisNode.getUid(); + csv.putLong(clinicalAnalysis.getId(), clinicalAnalysisUid); + } + + return clinicalAnalysisUid; + } + + public Node createClinicalAnalysisNode(ClinicalAnalysis clinicalAnalysis) throws IOException { + return createClinicalAnalysisNode(clinicalAnalysis, csv.getAndIncUid()); + } + + public Node createClinicalAnalysisNode(ClinicalAnalysis clinicalAnalysis, Long caUid) throws IOException { + Node caNode = NodeBuilder.newNode(caUid, clinicalAnalysis); + PrintWriter pw = csv.getCsvWriters().get(Node.Type.CLINICAL_ANALYSIS.toString()); + pw.println(csv.nodeLine(caNode)); + + // Comments + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getComments())) { + for (Comment comment : clinicalAnalysis.getComments()) { + Node commentNode = NodeBuilder.newNode(csv.getAndIncUid(), comment); + pw = csv.getCsvWriters().get(Node.Type.COMMENT.toString()); + 
pw.println(csv.nodeLine(commentNode)); + + // Relation: clinical analysis - comment + pw = csv.getCsvWriters().get(Relation.Type.CLINICAL_ANALYSIS__COMMENT.toString()); + pw.println(csv.relationLine(caUid, commentNode.getUid())); + } + } + + // Clinical Analyst + if (clinicalAnalysis.getAnalyst() != null) { + Node analystNode = NodeBuilder.newNode(csv.getAndIncUid(), clinicalAnalysis.getAnalyst()); + pw = csv.getCsvWriters().get(Node.Type.CLINICAL_ANALYST.toString()); + pw.println(csv.nodeLine(analystNode)); + + // Relation: clinical analysis - clinical analyst + pw = csv.getCsvWriters().get(Relation.Type.CLINICAL_ANALYSIS__CLINICAL_ANALYST.toString()); + pw.println(csv.relationLine(caUid, analystNode.getUid())); + } + + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getInterpretations())) { + for (Interpretation interpretation : clinicalAnalysis.getInterpretations()) { + Node interpretationNode = NodeBuilder.newNode(csv.getAndIncUid(), interpretation); + pw = csv.getCsvWriters().get(Node.Type.INTERPRETATION.toString()); + pw.println(csv.nodeLine(interpretationNode)); + + // Relation: clinical analysis - interpretation + pw = csv.getCsvWriters().get(Relation.Type.CLINICAL_ANALYSIS__INTERPRETATION.toString()); + pw.println(csv.relationLine(caUid, interpretationNode.getUid())); + + // Primary findings + if (CollectionUtils.isNotEmpty(interpretation.getPrimaryFindings())) { + processReportedVariants(interpretation.getPrimaryFindings(), interpretationNode.getUid(), true); + } + + // Secondary findings + if (CollectionUtils.isNotEmpty(interpretation.getSecondaryFindings())) { + processReportedVariants(interpretation.getSecondaryFindings(), interpretationNode.getUid(), false); + } + +// // Low coverage +// if (CollectionUtils.isNotEmpty(interpretation.getLowCoverageRegions())) { +// } +// +// // Software +// if (interpretation.getSoftware() != null) { +// } + + // Comments + if (CollectionUtils.isNotEmpty(interpretation.getComments())) { + for (Comment comment : 
interpretation.getComments()) { + Node commentNode = NodeBuilder.newNode(csv.getAndIncUid(), comment); + pw = csv.getCsvWriters().get(Node.Type.COMMENT.toString()); + pw.println(csv.nodeLine(commentNode)); + + // Relation: interpretation - comment + pw = csv.getCsvWriters().get(Relation.Type.INTERPRETATION__COMMENT.toString()); + pw.println(csv.relationLine(interpretation.getUid(), commentNode.getUid())); + } + } + } + } + + + return caNode; } //------------------------------------------------------------------------- // P R I V A T E M E T H O D S //------------------------------------------------------------------------- + private void processReportedVariants(List findings, Long interpretationUid, boolean arePrimaryFindings) + throws IOException { + PrintWriter pw; + Relation.Type interpretationRelation; + if (arePrimaryFindings) { + interpretationRelation = Relation.Type.PRIMARY_FINDING___INTERPRETATION___REPORTED_VARIANT; + + } else { + interpretationRelation = Relation.Type.SECONDARY_FINDING___INTERPRETATION___REPORTED_VARIANT; + } + + for (ReportedVariant finding : findings) { + Node findingNode = NodeBuilder.newNode(csv.getAndIncUid(), finding); + pw = csv.getCsvWriters().get(Node.Type.REPORTED_VARIANT.toString()); + pw.println(csv.nodeLine(findingNode)); - private void addJSONFile(File file) throws IOException { + // Process variant and relate it to the reported variant + Long variantUid = processVariant(finding); + pw = csv.getCsvWriters().get(Relation.Type.REPORTED_VARIANT__VARIANT.toString()); + pw.println(csv.relationLine(findingNode.getUid(), variantUid)); + + // Relation: interpretation - reported variant (primary or secondary finding) + pw = csv.getCsvWriters().get(interpretationRelation.toString()); + pw.println(csv.relationLine(interpretationUid, findingNode.getUid())); + } + } + + + private void addVariantJsonFile(File file) throws IOException { // Reading file line by line, each line a JSON object BufferedReader reader; ObjectMapper mapper = new ObjectMapper(); @@ -698,6 
+856,116 @@ private String variantFormatLine(Long formatUid, StudyEntry studyEntry, int inde return sb.toString(); } + //------------------------------------------------------------------------- + + + private void addClinicalAnalysisJsonFile(File file) throws IOException { + // Reading file line by line, each line a JSON object + BufferedReader reader; + ObjectMapper mapper = JacksonUtils.getDefaultObjectMapper(); + + // TODO: how to get metadata from clinical analysis (format field, sample names,...) + boolean done = false; + + long counter = 0; + logger.info("Processing JSON file {}", file.getPath()); + reader = FileUtils.newBufferedReader(file.toPath()); + String line = reader.readLine(); + while (line != null) { + ClinicalAnalysis clinicalAnalysis = mapper.readValue(line, ClinicalAnalysis.class); + if (!done) { + done = processMetadataFromClinicalAnalysis(clinicalAnalysis); + } + processClinicalAnalysis(clinicalAnalysis); + + // read next line + line = reader.readLine(); + if (++counter % 5000 == 0) { + logger.info("Parsing {} clinical analsysis...", counter); + } + } + reader.close(); + logger.info("Parsed {} clinical analysis from {}. 
Done!!!", counter, file.toString()); + } + + private boolean processMetadataFromClinicalAnalysis(ClinicalAnalysis clinicalAnalysis) { + boolean done = false; + int counter = 0; + + Set formats = new HashSet<>(); + Set info = new HashSet<>(); + + if (org.apache.commons.collections.CollectionUtils.isNotEmpty(clinicalAnalysis.getInterpretations())) { + for (Interpretation interpretation : clinicalAnalysis.getInterpretations()) { + if (org.apache.commons.collections.CollectionUtils.isNotEmpty(interpretation.getPrimaryFindings())) { + for (ReportedVariant variant : interpretation.getPrimaryFindings()) { + if (org.apache.commons.collections.CollectionUtils.isNotEmpty(variant.getStudies())) { + for (StudyEntry study : variant.getStudies()) { + // Info fields + for (FileEntry file : study.getFiles()) { + if (MapUtils.isNotEmpty(file.getAttributes())) { + info.addAll(file.getAttributes().keySet()); + } + } + + // Format fields + if (org.apache.commons.collections.CollectionUtils.isNotEmpty(study.getFormat())) { + formats.addAll(study.getFormat()); + } + } + } + counter++; + if (counter > 2) { + break; + } + } + } + if (counter > 2) { + break; + } + } + } + + if (org.apache.commons.collections.CollectionUtils.isNotEmpty(info) + && org.apache.commons.collections.CollectionUtils.isNotEmpty(formats)) { + + // Variant call + String strType; + List attrs = new ArrayList<>(); + attrs.add("variantCallId"); + Iterator iterator = formats.iterator(); + while (iterator.hasNext()) { + attrs.add(iterator.next()); + } + strType = Node.Type.VARIANT_CALL.toString(); + Map> nodeAttributes = csv.getNodeAttributes(); + nodeAttributes.put(strType, attrs); + csv.getCsvWriters().get(strType).println(csv.getNodeHeaderLine(attrs)); + strType = Relation.Type.VARIANT__VARIANT_CALL.toString(); + csv.getCsvWriters().get(strType).println(csv.getRelationHeaderLine(strType)); + strType = Relation.Type.SAMPLE__VARIANT_CALL.toString(); + 
csv.getCsvWriters().get(strType).println(csv.getRelationHeaderLine(strType)); + + // Variant file info + attrs = new ArrayList<>(); + attrs.add("variantFileInfoId"); + iterator = info.iterator(); + while (iterator.hasNext()) { + attrs.add(iterator.next()); + } + strType = Node.Type.VARIANT_FILE_INFO.toString(); + nodeAttributes.put(strType, attrs); + csv.getCsvWriters().get(strType).println(csv.getNodeHeaderLine(attrs)); + strType = Relation.Type.VARIANT_CALL__VARIANT_FILE_INFO.toString(); + csv.getCsvWriters().get(strType).println(csv.getRelationHeaderLine(strType)); + strType = Relation.Type.VARIANT_FILE_INFO__FILE.toString(); + csv.getCsvWriters().get(strType).println(csv.getRelationHeaderLine(strType)); + + done = true; + } + return done; + } + private void updateCSVFiles(long startUid, Node node, String relationType) { updateCSVFiles(startUid, node, relationType, false); } @@ -714,12 +982,116 @@ private void updateCSVFiles(long startUid, Node node, String relationType, boole pw.println(csv.relationLine(startUid, node.getUid())); } - public void indexingGenes(Path genePath, Path indexPath) throws IOException { - csv.indexingGenes(genePath, indexPath); + public void indexingGenes(Path genePath) throws IOException { + GeneCache geneCache = csv.getGeneCache(); + BufferedReader reader = org.opencb.commons.utils.FileUtils.newBufferedReader(genePath); + + String jsonGene = reader.readLine(); + long geneCounter = 0; + while (jsonGene != null) { + Gene gene = geneCache.getObjReader().readValue(jsonGene); + String geneId = gene.getId(); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(geneId)) { + geneCounter++; + if (geneCounter % 5000 == 0) { + logger.info("Indexing {} genes...", geneCounter); + } + // Save gene + geneCache.saveObject(geneId, jsonGene); + + // Save xrefs for that gene + geneCache.saveXref(geneId, geneId); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(gene.getName())) { + geneCache.saveXref(gene.getName(), geneId); + } + + if 
(ListUtils.isNotEmpty(gene.getTranscripts())) { + for (Transcript transcr : gene.getTranscripts()) { + if (ListUtils.isNotEmpty(transcr.getXrefs())) { + for (Xref xref: transcr.getXrefs()) { + if (org.apache.commons.lang3.StringUtils.isNotEmpty(xref.getId())) { + geneCache.saveXref(xref.getId(), geneId); + } + } + } + } + } + + // And process gene to save to CSV file + processGene(gene.getId(), gene.getName()); + } else { + logger.info("Skipping indexing gene: missing gene ID from JSON file"); + } + + // Next line + jsonGene = reader.readLine(); + } + logger.info("Indexing {} genes. Done.", geneCounter); + + reader.close(); } - public void indexingProteins(Path proteinPath, Path indexPath) throws IOException { - csv.indexingProteins(proteinPath, indexPath); + public void indexingProteins(Path proteinPath) throws IOException { + ProteinCache proteinCache = csv.getProteinCache(); + BufferedReader reader = org.opencb.commons.utils.FileUtils.newBufferedReader(proteinPath); + + String jsonProtein = reader.readLine(); + long proteinCounter = 0; + while (jsonProtein != null) { + Entry protein = proteinCache.getObjReader().readValue(jsonProtein); + if (ListUtils.isNotEmpty(protein.getAccession())) { + proteinCounter++; + if (proteinCounter % 5000 == 0) { + logger.info("Indexing {} proteins...", proteinCounter); + } + + // Save protein in RocksDB + String proteinAcc = protein.getAccession().get(0); + proteinCache.saveObject(proteinAcc, jsonProtein); + + // Save protein xrefs + proteinCache.saveXref(proteinAcc, proteinAcc); + if (ListUtils.isNotEmpty(protein.getAccession())) { + for (String acc: protein.getAccession()) { + proteinCache.saveXref(acc, proteinAcc); + } + } + + if (protein.getProtein() != null && protein.getProtein().getRecommendedName() != null + && ListUtils.isNotEmpty(protein.getProtein().getRecommendedName().getShortName())) { + for (EvidencedStringType shortName : protein.getProtein().getRecommendedName().getShortName()) { + if 
(org.apache.commons.lang3.StringUtils.isNotEmpty(shortName.getValue())) { + proteinCache.saveXref(shortName.getValue(), proteinAcc); + } + } + } + + String proteinName = null; + if (ListUtils.isNotEmpty(protein.getName())) { + proteinName = protein.getName().get(0); + for (String name: protein.getName()) { + proteinCache.saveXref(name, proteinAcc); + } + } + + if (ListUtils.isNotEmpty(protein.getDbReference())) { + for (DbReferenceType dbRef: protein.getDbReference()) { + proteinCache.saveXref(dbRef.getId(), proteinAcc); + } + } + + // And process protein to save to CSV file + processProtein(proteinAcc, proteinName); + } else { + logger.info("Skipping indexing protein: missing protein accession from JSON file"); + } + + // Next line + jsonProtein = reader.readLine(); + } + logger.info("Indexing {} proteins. Done.", proteinCounter); + + reader.close(); } public void addGenePanels(Path panelPath, Path indexPath) throws IOException { diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/NodeBuilder.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/NodeBuilder.java index 7c38ba5..4d493bb 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/NodeBuilder.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/NodeBuilder.java @@ -1,9 +1,12 @@ package org.opencb.bionetdb.core.utils; +import org.apache.commons.collections.MapUtils; import org.apache.commons.lang.StringUtils; import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.*; -import org.opencb.biodata.models.clinical.interpretation.DiseasePanel; -import org.opencb.biodata.models.commons.Phenotype; +import org.opencb.biodata.models.clinical.interpretation.*; +import org.opencb.biodata.models.clinical.interpretation.GenomicFeature; +import org.opencb.biodata.models.clinical.interpretation.VariantClassification; +import org.opencb.biodata.models.commons.*; import org.opencb.biodata.models.core.Gene; import 
org.opencb.biodata.models.core.Transcript; import org.opencb.biodata.models.core.TranscriptTfbs; @@ -14,13 +17,16 @@ import org.opencb.biodata.models.variant.avro.*; import org.opencb.bionetdb.core.models.network.Node; import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.commons.utils.CollectionUtils; import org.opencb.commons.utils.ListUtils; +import org.opencb.opencga.core.models.*; +import org.opencb.opencga.core.models.Interpretation; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; public class NodeBuilder { - public static final String CHROMOSOME = "chromosome"; public static final String START = "start"; public static final String END = "end"; @@ -46,7 +52,7 @@ public class NodeBuilder { public static final String PANEL_GENE = BIONETDB_PREFIX + "panelGene"; public static Node newNode(long uid, Variant variant) { - Node node = new Node(uid, variant.toString(), variant.getId(), Node.Type.VARIANT); + Node node = new Node(uid, variant.toStringSimple(), variant.getId(), Node.Type.VARIANT); if (ListUtils.isNotEmpty(variant.getNames())) { node.addAttribute("alternativeNames", StringUtils.join(variant.getNames(), ";")); } @@ -74,32 +80,25 @@ public static Node newNode(long uid, Variant variant) { public static Variant newVariant(Node node) { VariantBuilder variantBuilder = Variant.newBuilder(); - ObjectMap attrs = node.getAttributes(); if (attrs.containsKey(NodeBuilder.CHROMOSOME)) { variantBuilder.setChromosome(attrs.getString(NodeBuilder.CHROMOSOME)); } - if (attrs.containsKey(NodeBuilder.START)) { variantBuilder.setStart(attrs.getInt(NodeBuilder.START)); } - if (attrs.containsKey(NodeBuilder.END)) { variantBuilder.setEnd(attrs.getInt(NodeBuilder.END)); } - if (attrs.containsKey(NodeBuilder.REFERENCE)) { variantBuilder.setReference(attrs.getString(NodeBuilder.REFERENCE)); } - if (attrs.containsKey(NodeBuilder.ALTERNATE)) { variantBuilder.setAlternate(attrs.getString(NodeBuilder.ALTERNATE)); } - if 
(attrs.containsKey(NodeBuilder.TYPE)) { variantBuilder.setType(VariantType.valueOf(attrs.getString(NodeBuilder.TYPE))); } - variantBuilder.setStudyId("S"); variantBuilder.setFormat("GT"); @@ -184,12 +183,10 @@ public static Node newNode(long uid, ConsequenceType ct) { node.addAttribute("strand", ct.getStrand()); node.addAttribute("gene", ct.getEnsemblGeneId()); node.addAttribute("transcript", ct.getEnsemblTranscriptId()); - // Transcript annotation flags if (ListUtils.isNotEmpty(ct.getTranscriptAnnotationFlags())) { node.addAttribute("transcriptAnnotationFlags", StringUtils.join(ct.getTranscriptAnnotationFlags(), ",")); } - // Exon overlap if (ListUtils.isNotEmpty(ct.getExonOverlap())) { StringBuilder overlaps = new StringBuilder(); @@ -200,7 +197,6 @@ public static Node newNode(long uid, ConsequenceType ct) { } node.addAttribute("exonOverlap", overlaps.toString()); } - return node; } @@ -301,6 +297,12 @@ public static Node newNode(long uid, Xref xref) { return node; } + public static Node newNode(long uid, DbReferenceType xref) { + Node node = new Node(uid, xref.getId(), null, Node.Type.XREF); + node.addAttribute("dbName", xref.getType()); + return node; + } + public static Node newNode(long uid, Entry protein) { String id = (ListUtils.isNotEmpty(protein.getAccession()) ? protein.getAccession().get(0) : null); String name = (ListUtils.isNotEmpty(protein.getName()) ? protein.getName().get(0) : null); @@ -332,6 +334,7 @@ public static Node newNode(long uid, Entry protein) { // if (ListUtils.isNotEmpty(protein.getGene())) { // protein.getGene().get(0).getName().get(0). 
// } + return node; } @@ -357,38 +360,511 @@ public static Node newNode(long uid, FeatureType feature) { return node; } - public static Node newNode(long uid, DbReferenceType xref) { - Node node = new Node(uid, xref.getId(), null, Node.Type.XREF); - node.addAttribute("dbName", xref.getType()); - return node; - } - public static Node newNode(long uid, DiseasePanel panel) { + // IMPORTANT: phenotypes, variant, genes, STRs, regions must be created by the caller of this function! + Node node = new Node(uid, panel.getId(), panel.getName(), Node.Type.PANEL); - node.addAttribute("description", panel.getDescription()); + if (CollectionUtils.isNotEmpty(panel.getCategories())) { + node.addAttribute("categories", panel.getCategories().stream().map(DiseasePanel.PanelCategory::getName) + .collect(Collectors.joining(","))); + } + if (CollectionUtils.isNotEmpty(panel.getTags())) { + node.addAttribute("tags", StringUtils.join(panel.getTags(), ",")); + } + if (MapUtils.isNotEmpty(panel.getStats())) { + for (String key : panel.getStats().keySet()) { + node.addAttribute("stats_" + key, String.valueOf(panel.getStats().get(key))); + } + } + if (panel.getSource() != null) { + node.addAttribute("source_id", panel.getSource().getId()); + node.addAttribute("source_name", panel.getSource().getId()); + node.addAttribute("source_author", panel.getSource().getAuthor()); + node.addAttribute("source_project", panel.getSource().getProject()); + node.addAttribute("source_version", panel.getSource().getVersion()); + } node.addAttribute("creationDate", panel.getCreationDate()); node.addAttribute("modificationDate", panel.getModificationDate()); - if (ListUtils.isNotEmpty(panel.getPhenotypes())) { - StringBuilder sb = new StringBuilder(); - for (Phenotype phenotype : panel.getPhenotypes()) { - if (StringUtils.isNotEmpty(phenotype.getName())) { - if (sb.length() > 0) { - sb.append("--"); - } - sb.append(phenotype.getName()); - } + node.addAttribute("description", panel.getDescription()); + if 
(MapUtils.isNotEmpty(panel.getAttributes())) { + for (String key : panel.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, panel.getAttributes().get(key).toString()); + } + } + return node; + } + + public static Node newNode(long uid, ClinicalAnalysis clinicalAnalysis) { + Node node = new Node(uid, clinicalAnalysis.getId(), clinicalAnalysis.getName(), Node.Type.CLINICAL_ANALYSIS); + node.addAttribute("uuid", clinicalAnalysis.getUuid()); + node.addAttribute("description", clinicalAnalysis.getDescription()); + if (clinicalAnalysis.getType() != null) { + node.addAttribute("type", clinicalAnalysis.getType().name()); + } + if (clinicalAnalysis.getPriority() != null) { + node.addAttribute("priority", clinicalAnalysis.getPriority().name()); + } + if (CollectionUtils.isNotEmpty(clinicalAnalysis.getFlags())) { + node.addAttribute("flags", StringUtils.join(clinicalAnalysis.getFlags(), ";")); + } + node.addAttribute("creationDate", clinicalAnalysis.getCreationDate()); + node.addAttribute("modificationDate", clinicalAnalysis.getModificationDate()); + node.addAttribute("dueDate", clinicalAnalysis.getModificationDate()); + addStatus(clinicalAnalysis.getStatus(), node); + if (clinicalAnalysis.getConsent() != null) { + node.addAttribute("consent_primaryFindings", clinicalAnalysis.getConsent().getPrimaryFindings().name()); + node.addAttribute("consent_secondaryFindings", clinicalAnalysis.getConsent().getSecondaryFindings().name()); + node.addAttribute("consent_carrierFindings", clinicalAnalysis.getConsent().getCarrierFindings().name()); + node.addAttribute("consent_researchFindings", clinicalAnalysis.getConsent().getResearchFindings().name()); + } + node.addAttribute("release", clinicalAnalysis.getRelease()); + return node; + } + + public static Node newNode(long uid, Comment comment) { + Node node = new Node(uid, "" + uid, "" + uid, Node.Type.COMMENT); + node.addAttribute("author", comment.getAuthor()); + node.addAttribute("type", comment.getType()); + 
node.addAttribute("text", comment.getText()); + node.addAttribute("date", comment.getDate()); + return node; + } + + public static Node newNode(long uid, ClinicalAnalysis.ClinicalAnalyst analyst) { + Node node = new Node(uid, "" + uid, "" + uid, Node.Type.CLINICAL_ANALYST); + node.addAttribute("assignedBy", analyst.getAssignedBy()); + node.addAttribute("assignee", analyst.getAssignee()); + node.addAttribute("date", analyst.getDate()); + return node; + } + + public static Node newNode(long uid, Interpretation interpretation) { + Node node = new Node(uid, "" + uid, "" + uid, Node.Type.INTERPRETATION); + node.addAttribute("uuid", interpretation.getUuid()); + node.addAttribute("description", interpretation.getDescription()); + node.addAttribute("status", interpretation.getStatus().name()); + node.addAttribute("creationDate", interpretation.getCreationDate()); + node.addAttribute("version", interpretation.getVersion()); + addObjectMap(interpretation.getFilters(), node, "filters_"); + return node; + } + + public static Node newNode(long uid, Software software) { + String id = getSoftwareId(software); + Node node = new Node(uid, id, software.getName(), Node.Type.SOFTWARE); + node.addAttribute("version", software.getVersion()); + node.addAttribute("repository", software.getRepository()); + node.addAttribute("commit", software.getCommit()); + node.addAttribute("website", software.getWebsite()); + addMap(software.getParams(), node, "params_"); + return node; + } + + public static Node newNode(long uid, ReportedVariant reportedVariant) { + Node node = new Node(uid, reportedVariant.toStringSimple(), reportedVariant.getId(), Node.Type.REPORTED_VARIANT); + node.addAttribute("deNovoQualityScore", reportedVariant.getDeNovoQualityScore()); + node.addAttribute("status", reportedVariant.getStatus().name()); + addObjectMap(reportedVariant.getAttributes(), node); + return node; + } + + public static Node newNode(long uid, ReportedLowCoverage reportedLowCoverage) { + Node node = new 
Node(uid, reportedLowCoverage.getId(), "" + uid, Node.Type.LOW_COVERAGE_REGION); + node.addAttribute("geneName", reportedLowCoverage.getGeneName()); + node.addAttribute("chromosome", reportedLowCoverage.getChromosome()); + node.addAttribute("start", reportedLowCoverage.getStart()); + node.addAttribute("end", reportedLowCoverage.getEnd()); + node.addAttribute("meanCoverage", reportedLowCoverage.getMeanCoverage()); + node.addAttribute("type", reportedLowCoverage.getType()); + return node; + } + + public static Node newNode(long uid, Analyst analyst) { + Node node = new Node(uid, analyst.getName(), analyst.getName(), Node.Type.ANALYST); + node.addAttribute("company", analyst.getCompany()); + node.addAttribute("email", analyst.getEmail()); + return node; + } + + public static Node newNode(long uid, ReportedEvent reportedEvent) { + Node node = new Node(uid, reportedEvent.getId(), reportedEvent.getId(), Node.Type.REPORTED_EVENT); + if (reportedEvent.getModeOfInheritance() != null) { + node.addAttribute("modeOfInheritance", reportedEvent.getModeOfInheritance().name()); + } + if (reportedEvent.getPenetrance() != null) { + node.addAttribute("penetrance", reportedEvent.getPenetrance().name()); + } + if (CollectionUtils.isNotEmpty(reportedEvent.getCompoundHeterozygousVariantIds())) { + node.addAttribute("compoundHeterozygousVariantIds", StringUtils.join(reportedEvent.getCompoundHeterozygousVariantIds(), ",")); + } + node.addAttribute("score", reportedEvent.getScore()); + node.addAttribute("fullyExplainPhenotypes", reportedEvent.isFullyExplainPhenotypes()); + if (reportedEvent.getRoleInCancer() != null) { + node.addAttribute("roleInCancer", reportedEvent.getRoleInCancer().name()); + } + node.addAttribute("actionable", reportedEvent.isActionable()); + node.addAttribute("justification", reportedEvent.getJustification()); + if (reportedEvent.getClassification() != null) { + VariantClassification classification = reportedEvent.getClassification(); + // Tier + if 
(StringUtils.isNotEmpty(classification.getTier())) { + node.addAttribute("classification_tier", classification.getTier()); + } + // ACMG + if (CollectionUtils.isNotEmpty(classification.getAcmg())) { + node.addAttribute("classification_acmg", StringUtils.join(classification.getAcmg(), ",")); + } + // Clinical significance + if (classification.getClinicalSignificance() != null) { + node.addAttribute("classification_clinicalSignificance", classification.getClinicalSignificance().name()); } - if (sb.length() > 0) { - node.addAttribute("phenotypeNames", sb.toString()); + // Drug response + if (classification.getDrugResponse() != null) { + node.addAttribute("classification_drugResponse", classification.getDrugResponse().name()); + } + // Trait association + if (classification.getTraitAssociation() != null) { + node.addAttribute("classification_traitAssociation", classification.getTraitAssociation().name()); + } + // Functional effect + if (classification.getFunctionalEffect() != null) { + node.addAttribute("classification_functionalEffect", classification.getFunctionalEffect().name()); + } + // Tumorigenesis + if (classification.getTumorigenesis() != null) { + node.addAttribute("classification_tumorigenesis", classification.getTumorigenesis().name()); + } + // Other + if (CollectionUtils.isNotEmpty(classification.getOther())) { + node.addAttribute("classification_other", StringUtils.join(classification.getOther(), "---")); } } - if (panel.getSource() != null) { - node.addAttribute("sourceId", panel.getSource().getId()); - node.addAttribute("sourceName", panel.getSource().getId()); - node.addAttribute("sourceAuthor", panel.getSource().getAuthor()); - node.addAttribute("sourceProject", panel.getSource().getProject()); - node.addAttribute("sourceVersion", panel.getSource().getVersion()); + return node; + } + + public static Node newNode(long uid, GenomicFeature genomicFeature) { + Node node = new Node(uid, genomicFeature.getId(), genomicFeature.getId(), 
Node.Type.GENOMIC_FEATURE); + node.addAttribute("type", genomicFeature.getType()); + node.addAttribute("geneName", genomicFeature.getGeneName()); + node.addAttribute("transcriptId", genomicFeature.getTranscriptId()); + // TODO: xrefs + return node; + } + + public static Node newNode(long uid, Phenotype phenotype) { + // IMPORTANT: ontology term node and relation must be created by the caller! + + Node node = new Node(uid, phenotype.getId(), phenotype.getName(), Node.Type.PHENOTYPE); + node.addAttribute("ageOfOnset", phenotype.getAgeOfOnset()); + if (phenotype.getStatus() != null) { + node.addAttribute("status", phenotype.getStatus().name()); + } + return node; + } + + public static Node newNode(long uid, OntologyTerm ontologyTerm) { + Node node = new Node(uid, ontologyTerm.getId(), ontologyTerm.getName(), Node.Type.ONTOLOGY_TERM); + node.addAttribute("source", ontologyTerm.getSource()); + if (MapUtils.isNotEmpty(ontologyTerm.getAttributes())) { + for (String key : ontologyTerm.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, ontologyTerm.getAttributes().get(key)); + } + } + return node; + } + + public static Node newNode(long uid, Disorder disorder) { + // IMPORTANT: phenotype nodes and relations must be created by the caller of this function!!! + + Node node = new Node(uid, disorder.getId(), disorder.getName(), Node.Type.DISORDER); + node.addAttribute("description", disorder.getDescription()); + node.addAttribute("source", disorder.getSource()); + if (MapUtils.isNotEmpty(disorder.getAttributes())) { + for (String key : disorder.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, disorder.getAttributes().get(key)); + } + } + return node; + } + + public static Node newNode(long uid, Individual individual) { + // IMPORTANT: father, mother, phenotypes, disorders, samples nodes and relations must be created by the caller of this + // function!!! 
+ + Node node = new Node(uid, individual.getId(), individual.getName(), Node.Type.INDIVIDUAL); + node.addAttribute("uuid", individual.getUuid()); + if (individual.getLocation() != null) { + node.addAttribute("location_address", individual.getLocation().getAddress()); + node.addAttribute("location_city", individual.getLocation().getCity()); + node.addAttribute("location_postalCode", individual.getLocation().getPostalCode()); + node.addAttribute("location_state", individual.getLocation().getState()); + node.addAttribute("location_country", individual.getLocation().getCountry()); + } + if (individual.getSex() != null) { + node.addAttribute("sex", individual.getSex().name()); + } + if (individual.getKaryotypicSex() != null) { + node.addAttribute("karyotypicSex", individual.getKaryotypicSex().name()); + } + node.addAttribute("ethnicity", individual.getEthnicity()); + if (individual.getPopulation() != null) { + node.addAttribute("population_name", individual.getPopulation().getName()); + node.addAttribute("population_subpopulation", individual.getPopulation().getSubpopulation()); + node.addAttribute("population_description", individual.getPopulation().getDescription()); + } + if (individual.getMultiples() != null) { + node.addAttribute("multiples_type", individual.getMultiples().getType()); + node.addAttribute("multiples_siblings", StringUtils.join(individual.getMultiples().getSiblings(), ",")); + } + node.addAttribute("dateOfBirth", individual.getDateOfBirth()); + node.addAttribute("release", individual.getRelease()); + node.addAttribute("version", individual.getRelease()); + node.addAttribute("creationDate", individual.getCreationDate()); + node.addAttribute("modificationDate", individual.getModificationDate()); + addStatus(individual.getStatus(), node); + if (individual.getLifeStatus() != null) { + node.addAttribute("lifeStatus", individual.getLifeStatus().name()); + } + node.addAttribute("parentalConsanguinity", individual.isParentalConsanguinity()); + if 
(MapUtils.isNotEmpty(individual.getAttributes())) { + for (String key : individual.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, individual.getAttributes().get(key).toString()); + } + } + return node; + } + + public static Node newNode(long uid, Sample sample) { + // IMPORTANT: phenotypes nodes and relations must be created by the caller of this function!!! + + Node node = new Node(uid, sample.getId(), sample.getName(), Node.Type.SAMPLE); + node.addAttribute("uuid", sample.getUuid()); + node.addAttribute("source", sample.getSource()); + if (sample.getProcessing() != null) { + node.addAttribute("processing_product", sample.getProcessing().getProduct()); + node.addAttribute("processing_preparationMethod", sample.getProcessing().getPreparationMethod()); + node.addAttribute("processing_extractionMethod", sample.getProcessing().getExtractionMethod()); + node.addAttribute("processing_labSampleId", sample.getProcessing().getLabSampleId()); + node.addAttribute("processing_quantity", sample.getProcessing().getQuantity()); + node.addAttribute("processing_date", sample.getProcessing().getDate()); + // TODO: sample.getProcessing().getAttributes() + } + if (sample.getCollection() != null) { + node.addAttribute("collection_tissue", sample.getCollection().getTissue()); + node.addAttribute("collection_organ", sample.getCollection().getOrgan()); + node.addAttribute("collection_quantity", sample.getCollection().getQuantity()); + node.addAttribute("collection_method", sample.getCollection().getMethod()); + node.addAttribute("collection_date", sample.getCollection().getDate()); + // TODO: sample.getCollection().getAttributes() + } + node.addAttribute("release", sample.getRelease()); + node.addAttribute("version", sample.getRelease()); + node.addAttribute("creationDate", sample.getCreationDate()); + node.addAttribute("modificationDate", sample.getModificationDate()); + addStatus(sample.getStatus(), node); + node.addAttribute("description", 
sample.getDescription()); + node.addAttribute("type", sample.getType()); + node.addAttribute("somatic", sample.isSomatic()); + if (MapUtils.isNotEmpty(sample.getAttributes())) { + for (String key : sample.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, sample.getAttributes().get(key).toString()); + } + } + return node; + } + + public static Node newNode(long uid, Alert alert) { + Node node = new Node(uid, null, null, Node.Type.ALERT); + node.addAttribute("author", alert.getAuthor()); + node.addAttribute("date", alert.getDate()); + node.addAttribute("message", alert.getMessage()); + if (alert.getRisk() != null) { + node.addAttribute("risk", alert.getRisk().name()); + } + return node; + } + + public static Node newNode(long uid, Family family) { + // IMPORTANT: phenotypes, disorder and members, nodes and relations must be created by the caller! + + Node node = new Node(uid, family.getId(), family.getName(), Node.Type.FAMILY); + node.addAttribute("uuid", family.getUuid()); + node.addAttribute("creationDate", family.getCreationDate()); + node.addAttribute("modificationDate", family.getModificationDate()); + addStatus(family.getStatus(), node); + node.addAttribute("expectedSize", family.getExpectedSize()); + node.addAttribute("description", family.getDescription()); + node.addAttribute("release", family.getRelease()); + node.addAttribute("version", family.getRelease()); + if (MapUtils.isNotEmpty(family.getAttributes())) { + for (String key : family.getAttributes().keySet()) { + node.addAttribute("attributes_" + key, family.getAttributes().get(key).toString()); + } + } + return node; + } + + public static Node newNode(long uid, DiseasePanel.GenePanel panelGene) { + // IMPORTANT: phenotypes nodes and relations must be created by the caller! 
+ + Node node = new Node(uid, panelGene.getId(), panelGene.getName(), Node.Type.PANEL_GENE); + addDiseasePanelCommon(panelGene, node); + return node; + } + + public static Node newNode(long uid, DiseasePanel.VariantPanel panelVariant) { + // IMPORTANT: phenotypes nodes and relations must be created by the caller! + + Node node = new Node(uid, panelVariant.getId(), panelVariant.getId(), Node.Type.PANEL_VARIANT); + node.addAttribute("alternate", panelVariant.getAlternate()); + node.addAttribute("reference", panelVariant.getReference()); + addDiseasePanelCommon(panelVariant, node); + return node; + } + + public static Node newNode(long uid, DiseasePanel.STR panelStr) { + // IMPORTANT: phenotypes nodes and relations must be created by the caller! + + Node node = new Node(uid, panelStr.getId(), panelStr.getId(), Node.Type.PANEL_STR); + node.addAttribute("repeatedSequence", panelStr.getRepeatedSequence()); + node.addAttribute("normalRepeats", String.valueOf(panelStr.getNormalRepeats())); + node.addAttribute("pathogenicRepeats", String.valueOf(panelStr.getPathogenicRepeats())); + addDiseasePanelCommon(panelStr, node); + return node; + } + + public static Node newNode(long uid, DiseasePanel.RegionPanel panelRegion) { + // IMPORTANT: phenotypes nodes and relations must be created by the caller! 
+ + Node node = new Node(uid, panelRegion.getId(), panelRegion.getId(), Node.Type.PANEL_REGION); + node.addAttribute("description", panelRegion.getDescription()); + if (panelRegion.getTypeOfVariants() != null) { + node.addAttribute("typeOfVariants", panelRegion.getTypeOfVariants().name()); } + node.addAttribute("haploinsufficiencyScore", panelRegion.getHaploinsufficiencyScore()); + node.addAttribute("triplosensitivityScore", panelRegion.getTriplosensitivityScore()); + node.addAttribute("requiredOverlapPercentage", String.valueOf(panelRegion.getRequiredOverlapPercentage())); + addDiseasePanelCommon(panelRegion, node); return node; } + + public static Node newNode(long uid, File file) { + // IMPORTANT: software, experiment and sample nodes and relations must be created by the caller! + + Node node = new Node(uid, file.getId(), file.getName(), Node.Type.FILE); + node.addAttribute("uuid", file.getUuid()); + if (file.getType() != null) { + node.addAttribute("type", file.getType().name()); + } + if (file.getFormat() != null) { + node.addAttribute("format", file.getFormat().name()); + } + if (file.getBioformat() != null) { + node.addAttribute("bioformat", file.getBioformat().name()); + } + node.addAttribute("checksum", file.getChecksum()); + if (file.getUri() != null) { + node.addAttribute("uri", file.getUri().toString()); + } + node.addAttribute("path", file.getPath()); + node.addAttribute("release", String.valueOf(file.getRelease())); + node.addAttribute("creationDate", String.valueOf(file.getCreationDate())); + node.addAttribute("modificationDate", String.valueOf(file.getModificationDate())); + node.addAttribute("description", String.valueOf(file.getDescription())); + addStatus(file.getStatus(), node); + node.addAttribute("external", file.isExternal()); + node.addAttribute("size", String.valueOf(file.getSize())); + if (CollectionUtils.isNotEmpty(file.getTags())) { + node.addAttribute("tags", StringUtils.join(file.getTags(), ",")); + } + // TODO: file.getRelatedFiles(), 
File.RelatedFile + // TODO: file.getIndex(), FileIndex + addObjectMap(file.getStats(), node, "stats_"); + addObjectMap(file.getAttributes(), node); + return node; + } + + public static Node newNode(long uid, Experiment experiment) { + Node node = new Node(uid, "", "", Node.Type.EXPERIMENT); + node.addAttribute("type", experiment.getType()); + node.addAttribute("platform", experiment.getPlatform()); + node.addAttribute("manufacturer", experiment.getManufacturer()); + node.addAttribute("date", experiment.getDate()); + node.addAttribute("lab", experiment.getLab()); + node.addAttribute("center", experiment.getCenter()); + node.addAttribute("responsible", experiment.getResponsible()); + node.addAttribute("description", experiment.getDescription()); + addObjectMap(experiment.getAttributes(), node); + return node; + } + + public static String getSoftwareId(Software software) { + StringBuilder id = new StringBuilder(); + id.append(software.getName() != null ? software.getName() : "").append("-"); + id.append(software.getVersion() != null ? software.getVersion() : "").append("-"); + id.append(software.getCommit() != null ? 
software.getCommit() : ""); + return id.toString(); + } + + //------------------------------------------------------------------------- + // P R I V A T E M E T H O D S + //------------------------------------------------------------------------- + +// private static void addAttributes(Map attributes, Node node) { +// if (MapUtils.isNotEmpty(attributes)) { +// for (String key : attributes.keySet()) { +// node.addAttribute("attributes_" + key, attributes.get(key).toString()); +// } +// } +// } + + private static void addObjectMap(Map attributes, Node node) { + addObjectMap(attributes, node, "attributes_"); + } + + private static void addObjectMap(Map attributes, Node node, String prefix) { + if (MapUtils.isNotEmpty(attributes)) { + for (String key : attributes.keySet()) { + node.addAttribute(prefix + key, attributes.get(key).toString()); + } + } + } + + private static void addMap(Map attributes, Node node) { + addMap(attributes, node, "attributes_"); + } + + private static void addMap(Map attributes, Node node, String prefix) { + if (MapUtils.isNotEmpty(attributes)) { + for (String key : attributes.keySet()) { + node.addAttribute(prefix + key, attributes.get(key)); + } + } + } + + private static void addStatus(Status status, Node node) { + if (status != null) { + node.addAttribute("status_name", status.getName()); + node.addAttribute("status_message", status.getMessage()); + node.addAttribute("status_date", status.getDate()); + } + } + + private static void addDiseasePanelCommon(DiseasePanel.Common common, Node node) { + node.addAttribute("modeOfInheritance", common.getModeOfInheritance()); + if (common.getPenetrance() != null) { + node.addAttribute("penetrance", common.getPenetrance().name()); + } + node.addAttribute("confidence", common.getConfidence()); + if (CollectionUtils.isNotEmpty(common.getEvidences())) { + node.addAttribute("evidences", StringUtils.join(common.getEvidences(), ",")); + } + if (CollectionUtils.isNotEmpty(common.getPublications())) { + 
node.addAttribute("publications", StringUtils.join(common.getPublications(), ",")); + } + if (CollectionUtils.isNotEmpty(common.getCoordinates())) { + node.addAttribute("coordinates", common.getCoordinates().stream().map(c -> c.getAssembly() + "+" + c.getLocation() + "+" + + c.getSource()).collect(Collectors.joining(","))); + } + } } diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/Cache.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/Cache.java index 0d0dee8..1751f59 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/Cache.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/Cache.java @@ -11,29 +11,43 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; +import java.nio.file.Paths; public abstract class Cache { - protected ObjectMapper objMapper; - protected ObjectReader objReader; + protected String objFilename; + protected String xrefObjFilename; protected RocksDB objRocksDb; protected RocksDB xrefObjRocksDb; protected RocksDbManager rocksDbManager; + protected ObjectMapper objMapper; + protected ObjectReader objReader; + private static Logger logger; - public Cache() { + public Cache(String objFilename, String xrefObjFilename) { + this.objFilename = objFilename; + this.xrefObjFilename = xrefObjFilename; + + rocksDbManager = new RocksDbManager(); + + // Delete protein RocksDB files + Paths.get(objFilename).toFile().delete(); + Paths.get(xrefObjFilename).toFile().delete(); + + // Create gene RocksDB files (protein and xrefs) + objRocksDb = rocksDbManager.getDBConnection(objFilename, true); + xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); + objMapper = new ObjectMapper(); objMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); objMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); - rocksDbManager = new RocksDbManager(); - logger = LoggerFactory.getLogger(this.getClass()); } 
- public abstract void index(Path input, Path output) throws IOException; +// public abstract void index(Path input, Path output) throws IOException; public String getPrimaryId(String id) { return rocksDbManager.getString(id, xrefObjRocksDb); @@ -59,4 +73,24 @@ public T get(String id) { return obj; } + + public void saveObject(String id, String json) { + rocksDbManager.putString(id, json, objRocksDb); + } + + public void saveXref(String xref, String id) { + rocksDbManager.putString(xref, id, xrefObjRocksDb); + } + + public RocksDB getObjRocksDb() { + return objRocksDb; + } + + public RocksDB getXrefObjRocksDb() { + return xrefObjRocksDb; + } + + public ObjectReader getObjReader() { + return objReader; + } } diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/GeneCache.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/GeneCache.java index ca5b58e..c19b2e8 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/GeneCache.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/GeneCache.java @@ -1,92 +1,18 @@ package org.opencb.bionetdb.core.utils.cache; -import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.core.Gene; -import org.opencb.biodata.models.core.Transcript; -import org.opencb.biodata.models.core.Xref; -import org.opencb.commons.utils.ListUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.IOException; import java.nio.file.Path; -import java.nio.file.Paths; public class GeneCache extends Cache { - private static Logger logger; - public GeneCache() { - super(); + public GeneCache(Path indexPath) { + super(indexPath + "/genes.rocksdb", indexPath + "/xref.genes.rocksdb"); objReader = objMapper.reader(Gene.class); logger = LoggerFactory.getLogger(this.getClass()); } - - @Override - public void index(Path input, Path output) throws IOException { - String objFilename = output.toString() + 
"/genes.rocksdb"; - String xrefObjFilename = output.toString() + "/xref.genes.rocksdb"; - -// if (Paths.get(objFilename).toFile().exists() -// && Paths.get(xrefObjFilename).toFile().exists()) { -// objRocksDb = rocksDbManager.getDBConnection(objFilename, true); -// xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); -// logger.info("\tGene index already created!"); -// return; -// } - - // Delete protein RocksDB files - Paths.get(objFilename).toFile().delete(); - Paths.get(xrefObjFilename).toFile().delete(); - - // Create gene RocksDB files (protein and xrefs) - objRocksDb = rocksDbManager.getDBConnection(objFilename, true); - xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); - - BufferedReader reader = org.opencb.commons.utils.FileUtils.newBufferedReader(input); - String jsonGene = reader.readLine(); - long geneCounter = 0; - while (jsonGene != null) { - Gene gene = objReader.readValue(jsonGene); - String geneId = gene.getId(); - if (StringUtils.isNotEmpty(geneId)) { - geneCounter++; - if (geneCounter % 5000 == 0) { - logger.info("Indexing {} genes...", geneCounter); - } - // Save gene - rocksDbManager.putString(geneId, jsonGene, objRocksDb); - - // Save xrefs for that gene - rocksDbManager.putString(geneId, geneId, xrefObjRocksDb); - if (StringUtils.isNotEmpty(gene.getName())) { - rocksDbManager.putString(gene.getName(), geneId, xrefObjRocksDb); - } - - if (ListUtils.isNotEmpty(gene.getTranscripts())) { - for (Transcript transcr : gene.getTranscripts()) { - if (StringUtils.isNotEmpty(transcr.getId()) || StringUtils.isNotEmpty(transcr.getName())) { - if (ListUtils.isNotEmpty(transcr.getXrefs())) { - for (Xref xref: transcr.getXrefs()) { - if (StringUtils.isNotEmpty(xref.getId())) { - rocksDbManager.putString(xref.getId(), geneId, xrefObjRocksDb); - } - } - } - } - } - } - } else { - logger.info("Skipping indexing gene: missing gene ID from JSON file"); - } - - // Next line - jsonGene = reader.readLine(); - } - 
logger.info("Indexing {} genes. Done.", geneCounter); - - reader.close(); - } } diff --git a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/ProteinCache.java b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/ProteinCache.java index d5cf3b8..fdee540 100644 --- a/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/ProteinCache.java +++ b/bionetdb-core/src/main/java/org/opencb/bionetdb/core/utils/cache/ProteinCache.java @@ -1,102 +1,19 @@ package org.opencb.bionetdb.core.utils.cache; -import org.apache.commons.lang3.StringUtils; -import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.DbReferenceType; import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.Entry; -import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.EvidencedStringType; -import org.opencb.commons.utils.ListUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.IOException; import java.nio.file.Path; -import java.nio.file.Paths; public class ProteinCache extends Cache { protected static Logger logger; - public ProteinCache() { - super(); + public ProteinCache(Path indexPath) { + super(indexPath + "/proteins.rocksdb", indexPath + "/xref.proteins.rocksdb"); objReader = objMapper.reader(Entry.class); logger = LoggerFactory.getLogger(this.getClass()); } - - @Override - public void index(Path input, Path output) throws IOException { - String objFilename = output.toString() + "/proteins.rocksdb"; - String xrefObjFilename = output.toString() + "/xref.proteins.rocksdb"; - -// if (Paths.get(objFilename).toFile().exists() -// && Paths.get(xrefObjFilename).toFile().exists()) { -// objRocksDb = rocksDbManager.getDBConnection(objFilename, true); -// xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); -// logger.info("\tProtein index already created!"); -// return; -// } - - // Delete protein RocksDB files - Paths.get(objFilename).toFile().delete(); - 
Paths.get(xrefObjFilename).toFile().delete(); - - // Create gene RocksDB files (protein and xrefs) - objRocksDb = rocksDbManager.getDBConnection(objFilename, true); - xrefObjRocksDb = rocksDbManager.getDBConnection(xrefObjFilename, true); - - BufferedReader reader = org.opencb.commons.utils.FileUtils.newBufferedReader(input); - String jsonProtein = reader.readLine(); - long proteinCounter = 0; - while (jsonProtein != null) { - Entry protein = objReader.readValue(jsonProtein); - if (ListUtils.isNotEmpty(protein.getAccession())) { - proteinCounter++; - if (proteinCounter % 5000 == 0) { - logger.info("Indexing {} proteins...", proteinCounter); - } - - // Save protein in RocksDB - String proteinAcc = protein.getAccession().get(0); - rocksDbManager.putString(proteinAcc, jsonProtein, objRocksDb); - - // Save protein xrefs - if (ListUtils.isNotEmpty(protein.getAccession())) { - for (String acc: protein.getAccession()) { - rocksDbManager.putString(acc, proteinAcc, xrefObjRocksDb); - } - } - - try { - for (EvidencedStringType shortName : protein.getProtein().getRecommendedName().getShortName()) { - if (StringUtils.isNotEmpty(shortName.getValue())) { - rocksDbManager.putString(shortName.getValue(), proteinAcc, xrefObjRocksDb); - } - } - } catch (Exception e) { - logger.info(e.getLocalizedMessage()); - } - - if (ListUtils.isNotEmpty(protein.getName())) { - for (String name: protein.getName()) { - rocksDbManager.putString(name, proteinAcc, xrefObjRocksDb); - } - } - - if (ListUtils.isNotEmpty(protein.getDbReference())) { - for (DbReferenceType dbRef: protein.getDbReference()) { - rocksDbManager.putString(dbRef.getId(), proteinAcc, xrefObjRocksDb); - } - } - } else { - logger.info("Skipping indexing protein: missing protein accession from JSON file"); - } - - // Next line - jsonProtein = reader.readLine(); - } - logger.info("Indexing {} proteins. 
Done.", proteinCounter); - - reader.close(); - } } diff --git a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/BioNetDbManagerTest.java b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/BioNetDbManagerTest.java index 7554453..424369e 100644 --- a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/BioNetDbManagerTest.java +++ b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/BioNetDbManagerTest.java @@ -24,9 +24,9 @@ import org.opencb.bionetdb.core.config.BioNetDBConfiguration; import org.opencb.bionetdb.core.config.DatabaseConfiguration; import org.opencb.bionetdb.core.exceptions.BioNetDBException; -import org.opencb.bionetdb.core.neo4j.query.Neo4JQueryParser; import org.opencb.bionetdb.core.models.network.Network; import org.opencb.bionetdb.core.models.network.Node; +import org.opencb.bionetdb.core.neo4j.query.Neo4JQueryParser; import org.opencb.bionetdb.core.utils.CsvInfo; import org.opencb.bionetdb.core.utils.Neo4jCsvImporter; import org.opencb.bionetdb.core.utils.NodeBuilder; @@ -494,76 +494,6 @@ public void getGene() throws IOException { } } - @Test - public void getGenes() throws IOException { - String assembly = "GRCh38"; // "GRCh37", "GRCh38" - // CellBase client - ClientConfiguration clientConfiguration = new ClientConfiguration(); - clientConfiguration.setVersion("v4"); - clientConfiguration.setRest(new RestConfig(Collections.singletonList("http://bioinfo.hpc.cam.ac.uk/cellbase"), 30000)); - CellBaseClient cellBaseClient = new CellBaseClient("hsapiens", assembly, clientConfiguration); - - GeneClient geneClient = cellBaseClient.getGeneClient(); - Query query = new Query(); - QueryOptions options = new QueryOptions(QueryOptions.EXCLUDE, "transcripts.exons,transcripts.cDnaSequence,annotation.expression"); - QueryResponse countResponse = geneClient.count(query); - long numGenes = countResponse.firstResult(); - int bufferSize = 400; - options.put(QueryOptions.LIMIT, bufferSize); - System.out.println("Num. 
genes: " + numGenes); - ObjectMapper mapper = new ObjectMapper(); - mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); - mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); - ObjectWriter writer = mapper.writer(); - PrintWriter pw = new PrintWriter(Paths.get("/tmp/" + assembly + ".genes.json").toString()); - for (int i = 0; i < numGenes; i += bufferSize) { - options.put(QueryOptions.SKIP, i); - QueryResponse geneResponse = geneClient.search(query, options); - for (Gene gene : geneResponse.allResults()) { - String json = writer.writeValueAsString(gene); - pw.println(json); - } - System.out.println("Processing " + i + " of " + numGenes); - } - pw.close(); - } - - @Test - public void getProteins() throws IOException { - String assembly = "GRCh38"; // "GRCh37", "GRCh38" - // CellBase client - ClientConfiguration clientConfiguration = new ClientConfiguration(); - clientConfiguration.setVersion("v4"); - clientConfiguration.setRest(new RestConfig(Collections.singletonList("http://bioinfo.hpc.cam.ac.uk/cellbase"), 30000)); - CellBaseClient cellBaseClient = new CellBaseClient("hsapiens", assembly, clientConfiguration); - - ProteinClient proteinClient = cellBaseClient.getProteinClient(); - Query query = new Query(); - QueryOptions options = new QueryOptions(QueryOptions.EXCLUDE, "reference,comment,sequence,evidence"); - long numProteins = 100000; - int bufferSize = 400; - options.put(QueryOptions.LIMIT, bufferSize); - System.out.println("Num. 
proteins: " + numProteins); - ObjectMapper mapper = new ObjectMapper(); - mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); - mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); - ObjectWriter writer = mapper.writer(); - PrintWriter pw = new PrintWriter(Paths.get("/tmp/" + assembly + ".proteins.json").toString()); - for (int i = 0; i < numProteins; i += bufferSize) { - options.put(QueryOptions.SKIP, i); - QueryResponse proteinResponse = proteinClient.search(query, options); - if (proteinResponse.allResults().size() == 0) { - break; - } - for (Entry entry : proteinResponse.allResults()) { - String json = writer.writeValueAsString(entry); - pw.println(json); - } - System.out.println("Processing " + i + " of " + numProteins); - } - pw.close(); - } - @Test public void countGenes() throws IOException { ObjectMapper mapper = new ObjectMapper(); diff --git a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JLoaderTest.java b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JLoaderTest.java new file mode 100644 index 0000000..7c8ed33 --- /dev/null +++ b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JLoaderTest.java @@ -0,0 +1,149 @@ +package org.opencb.bionetdb.core.neo4j; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.neo4j.driver.v1.AuthTokens; +import org.neo4j.driver.v1.Driver; +import org.neo4j.driver.v1.GraphDatabase; +import org.neo4j.driver.v1.Session; +import org.opencb.biodata.formats.protein.uniprot.v201504jaxb.Entry; +import org.opencb.biodata.models.core.Gene; +import org.opencb.cellbase.client.config.ClientConfiguration; +import org.opencb.cellbase.client.config.RestConfig; +import org.opencb.cellbase.client.rest.CellBaseClient; 
+import org.opencb.cellbase.client.rest.GeneClient; +import org.opencb.cellbase.client.rest.ProteinClient; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.core.QueryResponse; +import org.opencb.commons.utils.FileUtils; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; + +import static org.neo4j.driver.v1.Values.parameters; + +public class Neo4JLoaderTest { + private Driver driver; + private Session session; + + @Before + public void init() { + String uri = "bolt://localhost:7687"; + String user = "neo4j"; + String password = "neo4j;"; + driver = GraphDatabase.driver(uri, AuthTokens.basic(user, password)); + Runtime.getRuntime().addShutdownHook(new Thread(() -> driver.close())); + session = driver.session(); + System.out.println("Init. Done!"); + } + + @After + public void clean() { + session.close(); + System.out.println("Clean. 
Done!"); + } + + @Test + public void testClinicalAnalysis() throws IOException { + long counter = 0; + // Reading file line by line, each line a + // JSON object + Path path = Paths.get("/home/jtarraga/data150/clinicalAnalysis/input/clinicalAnalysis.json"); + BufferedReader reader = FileUtils.newBufferedReader(path); + + String line = reader.readLine(); + while (line != null) { + counter++; + System.out.println("line " + counter + ", length = " + line.length()); + + // Call user defined procedure: loadClinicalAnalysis + session.run( "CALL org.opencb.bionetdb.core.neo4j.loadClinicalAnalysis($caJson)", parameters( "caJson", line)); + + // Read next line + line = reader.readLine(); + } + System.out.println("Clinical analysis: " + counter); + + reader.close(); + } + + @Test + public void getGenes() throws IOException { + String assembly = "GRCh38"; // "GRCh37", "GRCh38" + // CellBase client + ClientConfiguration clientConfiguration = new ClientConfiguration(); + clientConfiguration.setVersion("v4"); + clientConfiguration.setRest(new RestConfig(Collections.singletonList("http://bioinfo.hpc.cam.ac.uk/cellbase"), 30000)); + CellBaseClient cellBaseClient = new CellBaseClient("hsapiens", assembly, clientConfiguration); + + GeneClient geneClient = cellBaseClient.getGeneClient(); + Query query = new Query(); + QueryOptions options = new QueryOptions(QueryOptions.EXCLUDE, "transcripts.exons,transcripts.cDnaSequence,annotation.expression"); + QueryResponse countResponse = geneClient.count(query); + long numGenes = countResponse.firstResult(); + int bufferSize = 400; + options.put(QueryOptions.LIMIT, bufferSize); + System.out.println("Num. 
genes: " + numGenes); + ObjectMapper mapper = new ObjectMapper(); + mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); + mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + ObjectWriter writer = mapper.writer(); + PrintWriter pw = new PrintWriter(Paths.get("/tmp/" + assembly + ".genes.json").toString()); + for (int i = 0; i < numGenes; i += bufferSize) { + options.put(QueryOptions.SKIP, i); + QueryResponse geneResponse = geneClient.search(query, options); + for (Gene gene : geneResponse.allResults()) { + String json = writer.writeValueAsString(gene); + pw.println(json); + } + System.out.println("Processing " + i + " of " + numGenes); + } + pw.close(); + } + + @Test + public void getProteins() throws IOException { + String assembly = "GRCh38"; // "GRCh37", "GRCh38" + // CellBase client + ClientConfiguration clientConfiguration = new ClientConfiguration(); + clientConfiguration.setVersion("v4"); + clientConfiguration.setRest(new RestConfig(Collections.singletonList("http://bioinfo.hpc.cam.ac.uk/cellbase"), 30000)); + CellBaseClient cellBaseClient = new CellBaseClient("hsapiens", assembly, clientConfiguration); + + ProteinClient proteinClient = cellBaseClient.getProteinClient(); + Query query = new Query(); + QueryOptions options = new QueryOptions(QueryOptions.EXCLUDE, "reference,comment,sequence,evidence"); + long numProteins = 100000; + int bufferSize = 400; + options.put(QueryOptions.LIMIT, bufferSize); + System.out.println("Num. 
proteins: " + numProteins); + ObjectMapper mapper = new ObjectMapper(); + mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); + mapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + ObjectWriter writer = mapper.writer(); + PrintWriter pw = new PrintWriter(Paths.get("/tmp/" + assembly + ".proteins.json").toString()); + for (int i = 0; i < numProteins; i += bufferSize) { + options.put(QueryOptions.SKIP, i); + QueryResponse proteinResponse = proteinClient.search(query, options); + if (proteinResponse.allResults().size() == 0) { + break; + } + for (Entry entry : proteinResponse.allResults()) { + String json = writer.writeValueAsString(entry); + pw.println(json); + } + System.out.println("Processing " + i + " of " + numProteins); + } + pw.close(); + } +} \ No newline at end of file diff --git a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JVariantLoaderTest.java b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JVariantLoaderTest.java new file mode 100644 index 0000000..6c39327 --- /dev/null +++ b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4JVariantLoaderTest.java @@ -0,0 +1,94 @@ +package org.opencb.bionetdb.core.neo4j; + +import org.junit.Test; +import org.neo4j.graphdb.*; +import org.neo4j.graphdb.factory.GraphDatabaseFactory; + +import java.io.File; + +import static org.junit.Assert.*; + +public class Neo4JVariantLoaderTest { + + @Test + public void test1() { + GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabase(new File("/home/jtarraga/soft/neo4j/data/databases/graph.db") ); + + Label aaLabel = Label.label("AA"); + Label geneLabel = Label.label("GENE"); + + try (Transaction tx = graphDb.beginTx()) { + + //Node node = graphDb.createNode(aaLabel); + //node.setProperty("name", "toto"); + + int count = 0; + + String geneId = "ENSG00000139567"; + ResourceIterator nodes = graphDb.findNodes(geneLabel, "id", geneId);//, "name", "toto"); + while (nodes.hasNext()) { + 
System.out.println(nodes.next().getAllProperties()); + + if (++count > 10) { + break; + } + } + + tx.success(); + } + graphDb.shutdown(); + } + + @Test + public void testAllXref() { + GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabase(new File("/home/jtarraga/soft/neo4j/data/databases/graph.db") ); + + Label label = Label.label("XREF"); + + int count = 0; + int maxCount = 10; + + try (Transaction tx = graphDb.beginTx()) { + + ResourceIterator nodes = graphDb.findNodes(label); + while (nodes.hasNext()) { + Node next = nodes.next(); + System.out.println(next.getId() + " -> " + next.getAllProperties()); + + if (++count > maxCount) { + break; + } + } + + tx.success(); + } + graphDb.shutdown(); + } + + @Test + public void testCreateRelationship() { + GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabase(new File("/home/jtarraga/soft/neo4j/data/databases/graph.db") ); + + Label aaLabel = Label.label("AA"); + Label bbLabel = Label.label("BB"); + + String relationshipName = aaLabel.toString() + "-" + bbLabel.toString(); + + try (Transaction tx = graphDb.beginTx()) { + + Node aaNode = graphDb.createNode(aaLabel); + aaNode.setProperty("id", 1); + aaNode.setProperty("name", "name-a-1"); + + Node bbNode = graphDb.createNode(bbLabel); + bbNode.setProperty("id", 2); + bbNode.setProperty("name", "name-b-2"); + + Relationship aaBB = aaNode.createRelationshipTo(bbNode, RelationshipType.withName(relationshipName)); + aaBB.setProperty("score", 1.22); + + tx.success(); + } + graphDb.shutdown(); + } +} \ No newline at end of file diff --git a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4jCsvImporterTest.java b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4jCsvImporterTest.java new file mode 100644 index 0000000..9fefbd5 --- /dev/null +++ b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/neo4j/Neo4jCsvImporterTest.java @@ -0,0 +1,52 @@ +package org.opencb.bionetdb.core.neo4j; + +import 
org.junit.Test; +import org.opencb.bionetdb.core.exceptions.BioNetDBException; +import org.opencb.bionetdb.core.utils.CsvInfo; +import org.opencb.bionetdb.core.utils.Neo4jCsvImporter; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.*; + +public class Neo4jCsvImporterTest { + //----------------------------------------- + // Clinical analysis + //----------------------------------------- + + @Test + public void createCsvFilesForClinicalAnalysys() throws BioNetDBException, URISyntaxException, IOException, InterruptedException { + String caFilename = "/home/jtarraga/data150/clinicalAnalysis/input/clinicalAnalysis.json"; + String csvDirname= "/home/jtarraga/data150/clinicalAnalysis/csv"; + + Path input = Paths.get(caFilename); + + Path output = Paths.get(csvDirname); + output.toFile().delete(); + if (!output.toFile().exists()) { + output.toFile().mkdirs(); + } + + // Prepare CSV object + CsvInfo csv = new CsvInfo(input, output); + + // Open CSV files + csv.openCSVFiles(); + + Neo4jCsvImporter importer = new Neo4jCsvImporter(csv); + + List files = new ArrayList<>(); + files.add(input.toFile()); + + importer.addClinicalAnalysisFiles(files); + + // Close CSV files + csv.close(); + } +} \ No newline at end of file diff --git a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/utils/UtilsTest.java b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/utils/UtilsTest.java index 9ad1564..d7c474b 100644 --- a/bionetdb-core/src/test/java/org/opencb/bionetdb/core/utils/UtilsTest.java +++ b/bionetdb-core/src/test/java/org/opencb/bionetdb/core/utils/UtilsTest.java @@ -14,9 +14,15 @@ import org.opencb.biodata.models.variant.avro.ConsequenceType; import org.opencb.biodata.models.variant.avro.SequenceOntologyTerm; import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import 
org.opencb.commons.utils.FileUtils; import org.opencb.commons.utils.StringUtils; +import org.opencb.opencga.core.common.JacksonUtils; +import org.opencb.opencga.core.models.ClinicalAnalysis; +import java.io.BufferedReader; import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; diff --git a/checkstyle.xml b/checkstyle.xml index b7fd971..9b522ff 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -145,9 +145,11 @@ + diff --git a/pom.xml b/pom.xml index f82a19d..141a9ee 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,8 @@ 0.2.0-SNAPSHOT 1.8 3.7.3-SNAPSHOT - 1.4.3-SNAPSHOT + 1.4.5-SNAPSHOT + 1.4.2-dev 4.6.2-SNAPSHOT 2.8.10 2.23