From f996724839b0ae19f589b90ebab94c1fed7c1d52 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Wed, 20 Dec 2023 10:50:39 +0100 Subject: [PATCH 01/13] Prepare next release 2.12.2-SNAPSHOT --- biodata-external/pom.xml | 2 +- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index 347042c9..0c0fb358 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 2.12.1 + 2.12.2-SNAPSHOT ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index f0b0019e..44fad6f3 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.1 + 2.12.2-SNAPSHOT ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index 81a8b17b..34ead415 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.1 + 2.12.2-SNAPSHOT ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index e2438f80..87cfcc16 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.1 + 2.12.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 97688342..5d3d9743 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.1 + 2.12.2-SNAPSHOT pom Biodata @@ -38,7 +38,7 @@ - 4.12.0 + 4.12.1-SNAPSHOT 2.11.4 4.4 1.7.7 From 6c11e006bd51aa10ba5e1379d96752cd567ed7bc Mon Sep 17 00:00:00 2001 From: imedina Date: Mon, 11 Mar 2024 03:12:56 +0000 Subject: [PATCH 02/13] Add new SNP classes to store dbSNP --- .../org/opencb/biodata/models/core/Snp.java | 147 ++++++++++++++++++ .../biodata/models/core/SnpAnnotation.java | 90 +++++++++++ 2 files changed, 237 insertions(+) create mode 100644 biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java 
create mode 100644 biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java new file mode 100644 index 00000000..c2d079ba --- /dev/null +++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java @@ -0,0 +1,147 @@ +/* + * + * + */ + +package org.opencb.biodata.models.core; + +import java.util.List; + +public class Snp { + private String id; + private String chromosome; + private int position; + private String reference; + private List alleles; + private String type; + private String source; + private String version; + private SnpAnnotation annotation; + + public Snp() { + } + + public Snp(String id, String chromosome, int position, String reference, List alleles, String type, + String source, String version, SnpAnnotation annotation) { + this.id = id; + this.chromosome = chromosome; + this.position = position; + this.reference = reference; + this.alleles = alleles; + this.type = type; + this.source = source; + this.version = version; + this.annotation = annotation; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("Snp{"); + sb.append("id='").append(id).append('\''); + sb.append(", chromosome='").append(chromosome).append('\''); + sb.append(", position=").append(position); + sb.append(", reference='").append(reference).append('\''); + sb.append(", alleles=").append(alleles); + sb.append(", type='").append(type).append('\''); + sb.append(", source='").append(source).append('\''); + sb.append(", version='").append(version).append('\''); + sb.append(", annotation=").append(annotation); + sb.append('}'); + return sb.toString(); + } + + public String getId() { + return id; + } + + public Snp setId(String id) { + this.id = id; + return this; + } + + public String getChromosome() { + return chromosome; + } + + public Snp setChromosome(String 
chromosome) { + this.chromosome = chromosome; + return this; + } + + public int getPosition() { + return position; + } + + public Snp setPosition(int position) { + this.position = position; + return this; + } + + public String getReference() { + return reference; + } + + public Snp setReference(String reference) { + this.reference = reference; + return this; + } + + public List getAlleles() { + return alleles; + } + + public Snp setAlleles(List alleles) { + this.alleles = alleles; + return this; + } + + public String getType() { + return type; + } + + public Snp setType(String type) { + this.type = type; + return this; + } + + public String getSource() { + return source; + } + + public Snp setSource(String source) { + this.source = source; + return this; + } + + public String getVersion() { + return version; + } + + public Snp setVersion(String version) { + this.version = version; + return this; + } + + public SnpAnnotation getAnnotation() { + return annotation; + } + + public Snp setAnnotation(SnpAnnotation annotation) { + this.annotation = annotation; + return this; + } +} diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java new file mode 100644 index 00000000..215341ad --- /dev/null +++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java @@ -0,0 +1,90 @@ +/* + * + * + */ + +package org.opencb.biodata.models.core; + +import org.opencb.biodata.models.variant.avro.EvidenceEntry; +import org.opencb.biodata.models.variant.avro.PopulationFrequency; + +import java.util.List; + +public class SnpAnnotation { + + private List flags; + private String gene; + private List populationFrequencies; + private List traitAssociation; + + public SnpAnnotation() { + } + + public SnpAnnotation(List flags, String gene, List populationFrequencies, List traitAssociation) { + this.flags = flags; + this.gene = gene; + 
this.populationFrequencies = populationFrequencies; + this.traitAssociation = traitAssociation; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("SnpAnnotation{"); + sb.append("flags=").append(flags); + sb.append(", gene='").append(gene).append('\''); + sb.append(", populationFrequencies=").append(populationFrequencies); + sb.append(", traitAssociation=").append(traitAssociation); + sb.append('}'); + return sb.toString(); + } + + public List getFlags() { + return flags; + } + + public SnpAnnotation setFlags(List flags) { + this.flags = flags; + return this; + } + + public String getGene() { + return gene; + } + + public SnpAnnotation setGene(String gene) { + this.gene = gene; + return this; + } + + public List getPopulationFrequencies() { + return populationFrequencies; + } + + public SnpAnnotation setPopulationFrequencies(List populationFrequencies) { + this.populationFrequencies = populationFrequencies; + return this; + } + + public List getTraitAssociation() { + return traitAssociation; + } + + public SnpAnnotation setTraitAssociation(List traitAssociation) { + this.traitAssociation = traitAssociation; + return this; + } +} From ef752abc04d234ee16e4a68169c32d8d55d28fcb Mon Sep 17 00:00:00 2001 From: imedina Date: Tue, 12 Mar 2024 02:30:43 +0000 Subject: [PATCH 03/13] Update SnpAnnotation data model --- .../biodata/models/core/SnpAnnotation.java | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java index 215341ad..16fab718 100644 --- a/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java +++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java @@ -19,37 +19,26 @@ package org.opencb.biodata.models.core; -import org.opencb.biodata.models.variant.avro.EvidenceEntry; import 
org.opencb.biodata.models.variant.avro.PopulationFrequency; import java.util.List; +import java.util.Map; public class SnpAnnotation { private List flags; private String gene; private List populationFrequencies; - private List traitAssociation; + private Map additionalAttributes; public SnpAnnotation() { } - public SnpAnnotation(List flags, String gene, List populationFrequencies, List traitAssociation) { + public SnpAnnotation(List flags, String gene, List populationFrequencies, Map additionalAttributes) { this.flags = flags; this.gene = gene; this.populationFrequencies = populationFrequencies; - this.traitAssociation = traitAssociation; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder("SnpAnnotation{"); - sb.append("flags=").append(flags); - sb.append(", gene='").append(gene).append('\''); - sb.append(", populationFrequencies=").append(populationFrequencies); - sb.append(", traitAssociation=").append(traitAssociation); - sb.append('}'); - return sb.toString(); + this.additionalAttributes = additionalAttributes; } public List getFlags() { @@ -79,12 +68,12 @@ public SnpAnnotation setPopulationFrequencies(List populati return this; } - public List getTraitAssociation() { - return traitAssociation; + public Map getAdditionalAttributes() { + return additionalAttributes; } - public SnpAnnotation setTraitAssociation(List traitAssociation) { - this.traitAssociation = traitAssociation; + public SnpAnnotation setAdditionalAttributes(Map additionalAttributes) { + this.additionalAttributes = additionalAttributes; return this; } } From 9ea6c147b64917ff95fed4b8335bd89eae109176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Tue, 12 Mar 2024 11:28:15 +0100 Subject: [PATCH 04/13] models: rename alleles to alternates, #TASK-5813, #TASK-5789 --- .../java/org/opencb/biodata/models/core/Snp.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java index c2d079ba..8f8cc712 100644 --- a/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java +++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java @@ -26,7 +26,7 @@ public class Snp { private String chromosome; private int position; private String reference; - private List alleles; + private List alternates; private String type; private String source; private String version; @@ -35,13 +35,13 @@ public class Snp { public Snp() { } - public Snp(String id, String chromosome, int position, String reference, List alleles, String type, + public Snp(String id, String chromosome, int position, String reference, List alternates, String type, String source, String version, SnpAnnotation annotation) { this.id = id; this.chromosome = chromosome; this.position = position; this.reference = reference; - this.alleles = alleles; + this.alternates = alternates; this.type = type; this.source = source; this.version = version; @@ -55,7 +55,7 @@ public String toString() { sb.append(", chromosome='").append(chromosome).append('\''); sb.append(", position=").append(position); sb.append(", reference='").append(reference).append('\''); - sb.append(", alleles=").append(alleles); + sb.append(", alternates=").append(alternates); sb.append(", type='").append(type).append('\''); sb.append(", source='").append(source).append('\''); sb.append(", version='").append(version).append('\''); @@ -100,12 +100,12 @@ public Snp setReference(String reference) { return this; } - public List getAlleles() { - return alleles; + public List getAlternates() { + return alternates; } - public Snp setAlleles(List alleles) { - this.alleles = alleles; + public Snp setAlternates(List alternates) { + this.alternates = alternates; return this; } From b79bbdca15b81d4f30f1f0f3216c76160f726a60 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Thu, 28 Mar 2024 11:45:19 +0100 Subject: [PATCH 05/13] tools: increase MAX_REGION_COVERAGE; and fix sonnar issues, #TASK-5162 --- .../opencb/biodata/tools/alignment/BamManager.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java index 895767e9..54b84cc0 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java @@ -49,6 +49,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; /** @@ -62,7 +63,7 @@ public class BamManager implements AutoCloseable { public static final int DEFAULT_WINDOW_SIZE = 1; public static final int MAX_NUM_RECORDS = 50000; - public static final int MAX_REGION_COVERAGE = 100000; + public static final int MAX_REGION_COVERAGE = 500000; public static final String COVERAGE_BIGWIG_EXTENSION = ".bw"; private Logger logger; @@ -191,7 +192,10 @@ public Path calculateBigWigCoverage(Path bigWigPath, int windowSize) throws IOEx return bigWigPath; } - + /** + * @deprecated (since getFileHeader().getTextHeader() is deprecated !) 
+ */ + @Deprecated public String header() { return samReader.getFileHeader().getTextHeader(); } @@ -338,7 +342,7 @@ public List getChunks(Region region) { BAMIndex index = samReader.indexing().getIndex(); return index.getSpanOverlapping(sequenceIndex, start, end).getChunks(); } - return null; + return Collections.emptyList(); } public List getBreakpoints(Region region) throws IOException { @@ -378,7 +382,7 @@ public List getBreakpoints(Region region) throws IOException { } } } - return null; + return Collections.emptyList(); } /** @@ -445,7 +449,7 @@ public AlignmentGlobalStats stats(Region region, AlignmentFilters fil return calculateGlobalStats(iterator(region, filters, options)); } - private AlignmentGlobalStats calculateGlobalStats(BamIterator iterator) throws IOException { + private AlignmentGlobalStats calculateGlobalStats(BamIterator iterator) { AlignmentGlobalStats alignmentGlobalStats = new AlignmentGlobalStats(); SamRecordAlignmentGlobalStatsCalculator calculator = new SamRecordAlignmentGlobalStatsCalculator(); while (iterator.hasNext()) { From 941cb51e9721d38da344c10afaa440c3e2aacbbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 25 Apr 2024 15:53:04 +0100 Subject: [PATCH 06/13] tools: Fix normalization of variants wiht ins seq. 
#TASK-6122 --- .../opencb/biodata/tools/variant/VariantNormalizer.java | 2 ++ .../biodata/tools/variant/VariantNormalizerTest.java | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index 990d61e8..e3bb3d6e 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -1394,6 +1394,8 @@ private Variant newVariant(Variant variant, VariantKeyFields keyFields, Structur normalizedVariant.getSv().setCiStartRight(sv.getCiStartRight()); normalizedVariant.getSv().setCiEndLeft(sv.getCiEndLeft()); normalizedVariant.getSv().setCiEndRight(sv.getCiEndRight()); + normalizedVariant.getSv().setLeftSvInsSeq(sv.getLeftSvInsSeq()); + normalizedVariant.getSv().setRightSvInsSeq(sv.getRightSvInsSeq()); // Variant will never have CopyNumber, because the Alternate is normalized from to normalizedVariant.getSv().setCopyNumber(keyFields.getCopyNumber()); diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index 95265190..a4a62f06 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -784,6 +784,14 @@ public void testNormalizeSvToIndel() throws NonStandardCompliantSampleField { } + @Test + public void testNormalizeWithInsSeq() throws NonStandardCompliantSampleField { + Variant variant = new Variant("1:799984<800001<800022:-:ACCACACCCACACAACACACA...TGTGGTGTGTGTGGTGTG"); + Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); + assertEquals(variant, normVar); + 
assertEquals(variant.toString(), normVar.toString()); + } + @Test public void testNormalizeBND() throws NonStandardCompliantSampleField { normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, 99, "A", "A[chr9:10[")); From 5678bd3ed24a641d74b1aac180db27c68f3ebe77 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Tue, 30 Apr 2024 10:27:48 +0200 Subject: [PATCH 07/13] Prepare release 2.12.2 --- biodata-external/pom.xml | 2 +- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index 0c0fb358..3a98861e 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 2.12.2-SNAPSHOT + 2.12.2 ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index 44fad6f3..fe80d521 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2-SNAPSHOT + 2.12.2 ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index 34ead415..79a1954e 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2-SNAPSHOT + 2.12.2 ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index 87cfcc16..45d41e63 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2-SNAPSHOT + 2.12.2 ../pom.xml diff --git a/pom.xml b/pom.xml index 5d3d9743..a051029f 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2-SNAPSHOT + 2.12.2 pom Biodata @@ -38,7 +38,7 @@ - 4.12.1-SNAPSHOT + 4.12.0 2.11.4 4.4 1.7.7 From c139406bbaeba0a77a4c1d545d75572613476882 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Tue, 30 Apr 2024 10:28:15 +0200 Subject: [PATCH 08/13] Prepare next release 2.12.3-SNAPSHOT --- biodata-external/pom.xml | 2 
+- biodata-formats/pom.xml | 2 +- biodata-models/pom.xml | 2 +- biodata-tools/pom.xml | 2 +- pom.xml | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml index 3a98861e..3c1b2425 100644 --- a/biodata-external/pom.xml +++ b/biodata-external/pom.xml @@ -6,7 +6,7 @@ biodata org.opencb.biodata - 2.12.2 + 2.12.3-SNAPSHOT ../pom.xml diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml index fe80d521..24182d67 100644 --- a/biodata-formats/pom.xml +++ b/biodata-formats/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2 + 2.12.3-SNAPSHOT ../pom.xml diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml index 79a1954e..1eb260d6 100644 --- a/biodata-models/pom.xml +++ b/biodata-models/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2 + 2.12.3-SNAPSHOT ../pom.xml diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index 45d41e63..a283a477 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2 + 2.12.3-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index a051029f..c4ba58fc 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.opencb.biodata biodata - 2.12.2 + 2.12.3-SNAPSHOT pom Biodata @@ -38,7 +38,7 @@ - 4.12.0 + 4.12.1-SNAPSHOT 2.11.4 4.4 1.7.7 From c52673185b0aa2cf31f8915689519d935e39a271 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Thu, 30 May 2024 17:52:46 +0200 Subject: [PATCH 09/13] cicd: Update action version to test for compatibility with test and release process #TASK-6264 --- .github/workflows/test-analysis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-analysis.yml b/.github/workflows/test-analysis.yml index d460871a..e9df51ec 100644 --- a/.github/workflows/test-analysis.yml +++ b/.github/workflows/test-analysis.yml @@ -10,11 +10,11 @@ jobs: name: Test and push Sonar analysis runs-on: ubuntu-22.04 steps: - - uses: 
actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: '0' - name: Set up JDK 11 - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '11' From b37690016e70f7cf0b2abe72d13bbdbeba733f88 Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Mon, 8 Jul 2024 15:01:10 +0200 Subject: [PATCH 10/13] exclude distlib dependency --- pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pom.xml b/pom.xml index c4ba58fc..6166063d 100644 --- a/pom.xml +++ b/pom.xml @@ -198,6 +198,12 @@ com.databricks SnpEff ${SnpEff.version} + + + distlib + distlib + + com.google.guava From aacf0bbe1bacc7ce0f5cd0decb7986236348616a Mon Sep 17 00:00:00 2001 From: JuanfeSanahuja Date: Mon, 8 Jul 2024 15:51:26 +0200 Subject: [PATCH 11/13] exclude distlib dependency --- biodata-tools/pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml index a283a477..fdb21cbf 100644 --- a/biodata-tools/pom.xml +++ b/biodata-tools/pom.xml @@ -53,6 +53,12 @@ com.databricks SnpEff + + + distlib + distlib + + org.rocksdb From 75345478c664b4fdae9803ab8138e370313717b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 17 Jul 2024 10:57:11 +0100 Subject: [PATCH 12/13] tools: Centralise SV normalization at VariantKeyFields.sv #TASK-6558 --- .../tools/variant/VariantNormalizer.java | 110 +++++-- .../variant/VariantNormalizerGenericTest.java | 12 +- .../tools/variant/VariantNormalizerTest.java | 309 +++++++++++++----- 3 files changed, 306 insertions(+), 125 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index e3bb3d6e..1b8992c0 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ 
-306,19 +306,18 @@ public List normalize(List batch, boolean reuse) throws NonSta Integer start = variant.getStart(); Integer end = variant.getEnd(); String chromosome = variant.getChromosome(); - StructuralVariation sv = variant.getSv(); if (variant.getStudies() == null || variant.getStudies().isEmpty()) { List keyFieldsList; if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternate, sv); + keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv()); } else { keyFieldsList = normalize(chromosome, start, reference, alternate); } // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele()); - Variant normalizedVariant = newVariant(variant, keyFields, sv); + Variant normalizedVariant = newVariant(variant, keyFields); if (keyFields.getPhaseSet() != null) { StudyEntry studyEntry = new StudyEntry(); studyEntry.setSamples( @@ -346,7 +345,7 @@ public List normalize(List batch, boolean reuse) throws NonSta List keyFieldsList; List originalKeyFieldsList; if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternates, sv); + keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv()); } else { keyFieldsList = normalize(chromosome, start, reference, alternates); } @@ -400,6 +399,9 @@ public List normalize(List batch, boolean reuse) throws NonSta variant.setEnd(keyFields.getEnd()); variant.setReference(keyFields.getReference()); variant.setAlternate(keyFields.getAlternate()); + if (keyFields.getSv() != null) { + variant.setSv(keyFields.getSv()); + } variant.reset(); // Variant is being reused, must ensure the SV field si appropriately created // if (isSymbolic(variant)) { @@ -415,7 +417,7 @@ public List normalize(List batch, boolean reuse) throws NonSta } 
samples = entry.getSamples(); } else { - normalizedVariant = newVariant(variant, keyFields, sv); + normalizedVariant = newVariant(variant, keyFields); normalizedEntry = new StudyEntry(); normalizedEntry.setStudyId(entry.getStudyId()); @@ -624,6 +626,36 @@ public List normalizeSymbolic(final Integer start, final Integ Integer copyNumber = sv == null ? null : sv.getCopyNumber(); keyFields = normalizeSymbolic(start, end, reference, alternate, alternates, copyNumber, numAllelesIdx); } + + if (alternate.equals(VariantBuilder.DUP_TANDEM_ALT)) { + if (keyFields.getSv() == null) { + keyFields.setSv(new StructuralVariation()); + } + keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION); + } + + if (sv != null) { + StructuralVariation normalizedSv = keyFields.getSv(); + if (normalizedSv == null) { + normalizedSv = new StructuralVariation(); + } + // CI positions may change during the normalization. Update them. + normalizedSv.setCiStartLeft(sv.getCiStartLeft()); + normalizedSv.setCiStartRight(sv.getCiStartRight()); + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); + normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); + + if (keyFields.getSv() == null) { + if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null + || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null + || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { + keyFields.setSv(normalizedSv); + } + } + } + list.add(keyFields); } @@ -695,7 +727,7 @@ private static VariantKeyFields normalizeMateBreakend( } VariantKeyFields keyFields = new VariantKeyFields(newStart, newStart - 1, numAllelesIdx, newReference, newAlternate); - keyFields.getSv().setBreakend(breakend); + keyFields.setBreakend(breakend); return keyFields; } @@ -718,20 +750,23 @@ private VariantKeyFields normalizeSymbolic( + "contain 0 or 1 nt, but no 
more. Please, check."); } - Integer cn = VariantBuilder.getCopyNumberFromAlternate(alternate); // if (cn != null) { // // Alternate with the form , being xxx the number of copies, must be normalized into "" // newAlternate = ""; // } String newAlternate; + Integer newCn; if (alternate.equals("") && copyNumber != null) { // Alternate must be of the form , being xxx the number of copies newAlternate = ""; + newCn = copyNumber; } else { newAlternate = alternate; + newCn = VariantBuilder.getCopyNumberFromAlternate(alternate); } + return new VariantKeyFields(newStart, end, numAllelesIdx, newReference, newAlternate, - null, cn, false); + null, newCn, false); } @@ -1380,34 +1415,24 @@ private int[] getGenotypesReorderingMap(int numAllele, int[] alleleMap) { } } - - private Variant newVariant(Variant variant, VariantKeyFields keyFields, StructuralVariation sv) { + private Variant newVariant(Variant variant, VariantKeyFields keyFields) { Variant normalizedVariant = new Variant(variant.getChromosome(), keyFields.getStart(), keyFields.getEnd(), keyFields.getReference(), keyFields.getAlternate()) .setId(variant.getId()) .setNames(variant.getNames()) .setStrand(variant.getStrand()); - if (sv != null) { - if (normalizedVariant.getSv() != null) { - // CI positions may change during the normalization. Update them. 
- normalizedVariant.getSv().setCiStartLeft(sv.getCiStartLeft()); - normalizedVariant.getSv().setCiStartRight(sv.getCiStartRight()); - normalizedVariant.getSv().setCiEndLeft(sv.getCiEndLeft()); - normalizedVariant.getSv().setCiEndRight(sv.getCiEndRight()); - normalizedVariant.getSv().setLeftSvInsSeq(sv.getLeftSvInsSeq()); - normalizedVariant.getSv().setRightSvInsSeq(sv.getRightSvInsSeq()); - - // Variant will never have CopyNumber, because the Alternate is normalized from to - normalizedVariant.getSv().setCopyNumber(keyFields.getCopyNumber()); - VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber()); - if (cnvSubtype != null) { - normalizedVariant.setType(cnvSubtype); - } - } + if (keyFields.getSv() != null) { + normalizedVariant.setSv(keyFields.getSv()); } - normalizedVariant.setAnnotation(variant.getAnnotation()); + if (keyFields.getCopyNumber() != null) { + VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber()); + if (cnvSubtype != null) { + normalizedVariant.setType(cnvSubtype); + } + } + return normalizedVariant; // normalizedVariant.setAnnotation(variant.getAnnotation()); // if (isSymbolic(variant)) { @@ -1527,8 +1552,10 @@ public VariantKeyFields(int start, int end, int numAllele, String reference, Str this.alternate = alternate; this.originalKeyFields = originalKeyFields == null ? 
this : originalKeyFields; this.referenceBlock = referenceBlock; - this.sv = new StructuralVariation(); - setCopyNumber(copyNumber); + this.sv = null; + if (copyNumber != null) { + setCopyNumber(copyNumber); + } } @@ -1604,7 +1631,28 @@ public Integer getCopyNumber() { } public VariantKeyFields setCopyNumber(Integer copyNumber) { - sv.setCopyNumber(copyNumber); + if (sv == null) { + if (copyNumber != null) { + sv = new StructuralVariation(); + sv.setCopyNumber(copyNumber); + sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); + } + } else { + sv.setCopyNumber(copyNumber); + sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); + } + return this; + } + + public VariantKeyFields setBreakend(Breakend breakend) { + if (sv == null) { + if (breakend != null) { + sv = new StructuralVariation(); + sv.setBreakend(breakend); + } + } else { + sv.setBreakend(breakend); + } return this; } diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java index f097d1e1..e59ad530 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java @@ -230,7 +230,7 @@ protected Variant newVariant(int position, String ref, String altsCsv) { return newVariant(position, position, ref, Arrays.asList(altsCsv.split(",")), "2"); } - protected Variant newVariant(int start, int end, String ref, String altsCsv) { + protected Variant newVariant(int start, Integer end, String ref, String altsCsv) { return newVariant(start, end, ref, Arrays.asList(altsCsv.split(",")), "2"); } @@ -238,12 +238,16 @@ protected Variant newVariant(int position, String ref, List altsList, St return newVariant(position, position, ref, altsList, studyId); } - protected Variant newVariant(int start, int end, String ref, List altsList, String 
studyId) { + protected Variant newVariant(int start, Integer end, String ref, List altsList, String studyId) { return newVariantBuilder(start, end, ref, altsList, studyId).build(); } - protected VariantBuilder newVariantBuilder(int position, int end, String ref, List altsList, String studyId) { - return Variant.newBuilder("1", position, end, ref, String.join(",", altsList)) + protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, List altsList, String studyId) { + return newVariantBuilder(position, end, ref, String.join(",", altsList), studyId); + } + + protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, String alts, String studyId) { + return Variant.newBuilder("1", position, end, ref, alts) .setStudyId(studyId) .setSampleDataKeys("GT") .setSamples(new ArrayList<>()) diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index a4a62f06..4253d940 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -9,6 +9,7 @@ import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import java.util.*; +import java.util.function.Consumer; import java.util.stream.Collectors; import static org.junit.Assert.*; @@ -582,9 +583,7 @@ public void testMultiSNP() throws NonStandardCompliantSampleField { public void testNormalizeMultiAllelicPL() throws NonStandardCompliantSampleField { Variant variant = generateVariantWithFormat("X:100:A:T", "GT:GL", "S01", "0/0", "1,2,3", "S02", "0", "1,2"); - List normalize1 = normalizer.normalize(Collections.singletonList(variant), false); - assertEquals("1,2,3", normalize1.get(0).getStudies().get(0).getSampleData("S01", "GL")); - assertEquals("1,2", 
normalize1.get(0).getStudies().get(0).getSampleData("S02", "GL")); + normalizeUnmodified(variant); Variant variant2 = generateVariantWithFormat("X:100:A:T,C", "GT:GL", "S01", "0/0", "1,2,3,4,5,6", "S02", "A", "1,2,3"); List normalize2 = normalizer.normalize(Collections.singletonList(variant2), false); @@ -614,14 +613,138 @@ public void testCNVsNormalization() throws Exception { .addSample("HG00096", "0|0") .build(); - List normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true); - assertEquals(1, normalizedVariantList.size()); - assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null, - StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariantList.get(0).getSv()); - // Normalize CNV alternate - assertEquals("", normalizedVariantList.get(0).getAlternate()); - assertEquals("1:86<100<150-150<200<211:C:", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null, + StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationNoNumber() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, 
null, null, null, null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationNoNumberNoCipos() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(null, null, null, null, null, null, null, null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationUnmodified() throws Exception { + Variant variant = newVariantBuilder(101, 200, "-", Collections.singletonList(""), "2") + .addSample("HG00096", "0|0") + .build(); + + normalizeUnmodified(variant); + } + + @Test + public void testINSsNormalizationWithCIEND() throws Exception { + Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addFileData("LEFT_SVINSSEQ", "AAAA") + .addFileData("RIGHT_SVINSSEQ", "CCCC") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, null, null, null, "AAAA", "CCCC", null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + 
assertEquals("1:86<100<150-50<100<111:C:AAAA...CCCC", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals("1:86<101<150:-:AAAA...CCCC", normalizedVariant.toString()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testDUPTANDEMNormalization() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, null, null, null, StructuralVariantType.TANDEM_DUPLICATION, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals("1:86<101<150-150<200<211:-:", normalizedVariant.toString()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + public void normalizeUnmodified(Variant variant) throws NonStandardCompliantSampleField { + normalizer.setGenerateReferenceBlocks(false); + + int hashCode = variant.hashCode(); + List list = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, list.size()); + Variant normVar = list.get(0); + + assertEquals(variant.toString(), normVar.toString()); + assertEquals("Ensure input variant is not modified", hashCode, variant.hashCode()); + assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode()); + + list = normalizer.normalize(Collections.singletonList(variant), true); + assertEquals(1, list.size()); + normVar = list.get(0); + + assertEquals(variant.toString(), normVar.toString()); + assertEquals("Ensure input 
variant is not modified", hashCode, variant.hashCode()); + assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode()); + + } + + public void normalizeOne(Variant variant, Consumer consumer) throws NonStandardCompliantSampleField { + normalizer.setGenerateReferenceBlocks(false); + + int hashCode = variant.hashCode(); + List list = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, list.size()); + consumer.accept(list.get(0)); + + int hashCode2 = variant.hashCode(); + + // Check that the original variant has not been modified, and check again, but reusing the input variant + assertEquals("Ensure input variant is not modified", hashCode, hashCode2); + + + list = normalizer.normalize(Collections.singletonList(variant), true); + assertEquals(1, list.size()); + assertSame(variant, list.get(0)); + consumer.accept(variant); + consumer.accept(list.get(0)); + + int hashCode3 = variant.hashCode(); + assertNotEquals(hashCode3, hashCode); } @Test @@ -670,23 +793,40 @@ public void testVNCNormalizationMultiallelic() throws NonStandardCompliantSample @Test public void testCNVsNormalizationCopyNumber() throws NonStandardCompliantSampleField { Variant variant; - List normalizedVariantList; variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2") .setSampleDataKeys("GT", "CN") .addSample("HG00096", "0|1","3") .build(); - normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true); - assertEquals(1, normalizedVariantList.size()); - Variant normalizedVariant = normalizedVariantList.get(0); - assertEquals(new StructuralVariation(null, null, null, null, 3, null, null, - StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); - // Normalize CNV alternate - assertEquals("", normalizedVariant.getAlternate()); - assertEquals(101, normalizedVariant.getStart().intValue()); - assertEquals("", normalizedVariant.getReference()); - assertEquals("1:100-200:C:", 
normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant->{ + assertEquals(new StructuralVariation(null, null, null, null, 3, null, null, + StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals(101, normalizedVariant.getStart().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + @Test + public void testCNVsNormalizationCopyNumberWithCipos() throws NonStandardCompliantSampleField { + Variant variant; + variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2") + .addFileData("CIPOS", "-10,50") + .setSampleDataKeys("GT", "CN") + .addSample("HG00096", "0|1","3") + .build(); + normalizeOne(variant, normalizedVariant->{ + assertEquals(new StructuralVariation(90, 150, null, null, 3, null, null, + StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals(101, normalizedVariant.getStart().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals("1:90<100<150-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); } @Test @@ -725,38 +865,33 @@ public void testNormalizeSV() throws NonStandardCompliantSampleField { @Test public void testNormalizeDEL() throws NonStandardCompliantSampleField { - Variant variant = newVariant(100, 200, 
"N", Collections.singletonList(""), STUDY_ID); - List normalized = normalizer.normalize(Collections.singletonList(variant), false); - - assertEquals(1, normalized.size()); - assertEquals(101, normalized.get(0).getStart().intValue()); - assertEquals(200, normalized.get(0).getEnd().intValue()); - assertEquals(new StructuralVariation(), normalized.get(0).getSv()); - System.out.println(normalized.get(0).toJson()); + normalizeOne(variant, normalized -> { + assertEquals(101, normalized.getStart().intValue()); + assertEquals(200, normalized.getEnd().intValue()); + assertEquals(new StructuralVariation(), normalized.getSv()); +// System.out.println(normalized.toJson()); + }); } @Test public void testNormalizeINS() throws NonStandardCompliantSampleField { - String seq = "ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTG"; Variant variant = newVariantBuilder(100, 100, "N", Collections.singletonList(""), STUDY_ID) .addFileData("SVINSSEQ", seq) .build(); - List list = new VariantNormalizer().normalize(Collections.singletonList(variant), false); - - assertEquals(1, list.size()); - Variant normalized = list.get(0); - assertEquals(101, normalized.getStart().intValue()); - assertEquals(100, normalized.getEnd().intValue()); - assertEquals(seq.length(), normalized.getLength().intValue()); - assertEquals(seq.length(), normalized.getLengthAlternate().intValue()); - assertEquals(0, normalized.getLengthReference().intValue()); - assertEquals("", normalized.getReference()); - assertEquals(seq, normalized.getAlternate()); - assertEquals(new StructuralVariation(), normalized.getSv()); - assertEquals("1:100-100:N:", normalized.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalized.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant -> { + assertEquals(101, normalizedVariant.getStart().intValue()); + 
assertEquals(100, normalizedVariant.getEnd().intValue()); + assertEquals(seq.length(), normalizedVariant.getLength().intValue()); + assertEquals(seq.length(), normalizedVariant.getLengthAlternate().intValue()); + assertEquals(0, normalizedVariant.getLengthReference().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals(seq, normalizedVariant.getAlternate()); + assertEquals(new StructuralVariation(), normalizedVariant.getSv()); + assertEquals("1:100-100:N:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); } @Test @@ -768,66 +903,60 @@ public void testNormalizeSvToIndel() throws NonStandardCompliantSampleField { assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue()); assertNotNull(variant.getSv()); - Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); - assertEquals(VariantType.INDEL, normVar.getType()); - assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue()); - assertNull(normVar.getSv()); - - // Check that the original variant has not been modified, and check again, but reusing the input variant - assertEquals(VariantType.INSERTION, variant.getType()); - assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue()); - assertNotNull(variant.getSv()); - Variant normVarReuse = new VariantNormalizer().normalize(Collections.singletonList(variant), true).get(0); - assertEquals(VariantType.INDEL, normVarReuse.getType()); - assertEquals(Variant.SV_THRESHOLD, normVarReuse.getLengthAlternate().intValue()); - assertNull(normVarReuse.getSv()); - + normalizeOne(variant, normVar -> { + assertEquals(VariantType.INDEL, normVar.getType()); + assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue()); + assertNull(normVar.getSv()); + }); } @Test public void 
testNormalizeWithInsSeq() throws NonStandardCompliantSampleField { Variant variant = new Variant("1:799984<800001<800022:-:ACCACACCCACACAACACACA...TGTGGTGTGTGTGGTGTG"); - Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); - assertEquals(variant, normVar); - assertEquals(variant.toString(), normVar.toString()); + normalizeUnmodified(variant); } @Test public void testNormalizeBND() throws NonStandardCompliantSampleField { - normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, 99, "A", "A[chr9:10[")); - normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, 99, "A", "[chr22:10[A")); - normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, 99, "A", "A]chr9:10]")); - normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, 99, "A", "]chr22:10]A")); - normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, 99, "A", "]chr22:10]NNNA")); + normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, null, "A", "A[chr9:10[")); + normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, null, "A", "[chr22:10[A")); + normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, null, "A", "A]chr9:10]")); + normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, null, "A", "]chr22:10]A")); + normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, null, "A", "]chr22:10]NNNA")); - normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "A", "[1:10[TA")); - normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "AC", "[1:10[TAC")); + normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "A", "[1:10[TA")); + normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "AC", "[1:10[TAC")); - normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, 99, "TAC", "[1:10[AC")); - normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, 99, "TAC", "TA[1:10[")); + 
normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, null, "TAC", "[1:10[AC")); + normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, null, "TAC", "TA[1:10[")); + + normalizeBnd(newVariantBuilder(101, 100, "", ".[9:10[", "s1").setCiStart(95, 105).build(), + newVariantBuilder(100, null, "A", "A[chr9:10[", "s1").setCiStart(95,105).setCiEnd(95,105).build()); } private void normalizeBnd(Variant expectedVariant, Variant variant) throws NonStandardCompliantSampleField { - System.out.println("---"); +// System.out.println("---"); boolean expectsNormalization = !expectedVariant.equals(variant); - System.out.println(" - Actual"); - System.out.println(" " + variant.toString()); - System.out.println(" " + variant.toJson()); - System.out.println(" - Expected"); - System.out.println(" " + expectedVariant.toString()); - System.out.println(" " + expectedVariant.toJson()); - System.out.println(" - Normalized (same = " + !expectsNormalization + ")"); - List normalized = normalizer.normalize(Collections.singletonList(variant), false); - - for (Variant v : normalized) { - System.out.println(" " + v.toString()); - System.out.println(" " + v.toJson()); - if (expectsNormalization) { - assertNotNull(v.getStudies().get(0).getFiles().get(0).getCall()); - v.getStudies().get(0).getFiles().get(0).setCall(null); - } - assertEquals(expectedVariant, v); +// System.out.println(" - Actual"); +// System.out.println(" " + variant.toString()); +// System.out.println(" " + variant.toJson()); +// System.out.println(" - Expected"); +// System.out.println(" " + expectedVariant.toString()); +// System.out.println(" " + expectedVariant.toJson()); +// System.out.println(" - Normalized (same = " + !expectsNormalization + ")"); + if (expectsNormalization) { + normalizeOne(variant, normVar -> { + System.out.println(" " + normVar.toString()); + System.out.println(" " + normVar.toJson()); + OriginalCall call = normVar.getStudies().get(0).getFiles().get(0).getCall(); + 
assertNotNull(call); + normVar.getStudies().get(0).getFiles().get(0).setCall(null); + assertEquals(expectedVariant, normVar); + normVar.getStudies().get(0).getFiles().get(0).setCall(call); + }); + } else { + normalizeUnmodified(variant); } } From a6abc515055de2f42805197c0859852441b91317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 17 Jul 2024 11:03:56 +0100 Subject: [PATCH 13/13] tools: Remove sv.ciEnd from INSERTION and BREAKEND variants. #TASK-6558 --- .../biodata/tools/variant/VariantNormalizer.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index 1b8992c0..e902ce99 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -642,8 +642,18 @@ public List normalizeSymbolic(final Integer start, final Integ // CI positions may change during the normalization. Update them. normalizedSv.setCiStartLeft(sv.getCiStartLeft()); normalizedSv.setCiStartRight(sv.getCiStartRight()); - normalizedSv.setCiEndLeft(sv.getCiEndLeft()); - normalizedSv.setCiEndRight(sv.getCiEndRight()); + + // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. + // At this point, we're removing the CIEND from the normalized variant. + // Do not remove the value from the INFO field (if any). 
+ // The END is the same as the start (which, in 1-based coordinates, means that "end == start - 1", so "end < start") + if (keyFields.getEnd() < keyFields.getStart()) { + normalizedSv.setCiEndLeft(null); + normalizedSv.setCiEndRight(null); + } else { + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + } normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq());