From 75345478c664b4fdae9803ab8138e370313717b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 17 Jul 2024 10:57:11 +0100 Subject: [PATCH 1/3] tools: Centralise SV normalization at VariantKeyFields.sv #TASK-6558 --- .../tools/variant/VariantNormalizer.java | 110 +++++-- .../variant/VariantNormalizerGenericTest.java | 12 +- .../tools/variant/VariantNormalizerTest.java | 309 +++++++++++++----- 3 files changed, 306 insertions(+), 125 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index e3bb3d6e2..1b8992c0c 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -306,19 +306,18 @@ public List normalize(List batch, boolean reuse) throws NonSta Integer start = variant.getStart(); Integer end = variant.getEnd(); String chromosome = variant.getChromosome(); - StructuralVariation sv = variant.getSv(); if (variant.getStudies() == null || variant.getStudies().isEmpty()) { List keyFieldsList; if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternate, sv); + keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv()); } else { keyFieldsList = normalize(chromosome, start, reference, alternate); } // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele()); - Variant normalizedVariant = newVariant(variant, keyFields, sv); + Variant normalizedVariant = newVariant(variant, keyFields); if (keyFields.getPhaseSet() != null) { StudyEntry studyEntry = new StudyEntry(); studyEntry.setSamples( @@ -346,7 +345,7 @@ public List normalize(List batch, boolean reuse) throws NonSta List keyFieldsList; List originalKeyFieldsList; if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternates, sv); + keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv()); } else { keyFieldsList = normalize(chromosome, start, reference, alternates); } @@ -400,6 +399,9 @@ public List normalize(List batch, boolean reuse) throws NonSta variant.setEnd(keyFields.getEnd()); variant.setReference(keyFields.getReference()); variant.setAlternate(keyFields.getAlternate()); + if (keyFields.getSv() != null) { + variant.setSv(keyFields.getSv()); + } variant.reset(); // Variant is being reused, must ensure the SV field si appropriately created // if (isSymbolic(variant)) { @@ -415,7 +417,7 @@ public List normalize(List batch, boolean reuse) throws NonSta } samples = entry.getSamples(); } else { - normalizedVariant = newVariant(variant, keyFields, sv); + normalizedVariant = newVariant(variant, keyFields); normalizedEntry = new StudyEntry(); normalizedEntry.setStudyId(entry.getStudyId()); @@ -624,6 +626,36 @@ public List normalizeSymbolic(final Integer start, final Integ Integer copyNumber = sv == null ? null : sv.getCopyNumber(); keyFields = normalizeSymbolic(start, end, reference, alternate, alternates, copyNumber, numAllelesIdx); } + + if (alternate.equals(VariantBuilder.DUP_TANDEM_ALT)) { + if (keyFields.getSv() == null) { + keyFields.setSv(new StructuralVariation()); + } + keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION); + } + + if (sv != null) { + StructuralVariation normalizedSv = keyFields.getSv(); + if (normalizedSv == null) { + normalizedSv = new StructuralVariation(); + } + // CI positions may change during the normalization. Update them. + normalizedSv.setCiStartLeft(sv.getCiStartLeft()); + normalizedSv.setCiStartRight(sv.getCiStartRight()); + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); + normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); + + if (keyFields.getSv() == null) { + if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null + || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null + || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { + keyFields.setSv(normalizedSv); + } + } + } + list.add(keyFields); } @@ -695,7 +727,7 @@ private static VariantKeyFields normalizeMateBreakend( } VariantKeyFields keyFields = new VariantKeyFields(newStart, newStart - 1, numAllelesIdx, newReference, newAlternate); - keyFields.getSv().setBreakend(breakend); + keyFields.setBreakend(breakend); return keyFields; } @@ -718,20 +750,23 @@ private VariantKeyFields normalizeSymbolic( + "contain 0 or 1 nt, but no more. Please, check."); } - Integer cn = VariantBuilder.getCopyNumberFromAlternate(alternate); // if (cn != null) { // // Alternate with the form , being xxx the number of copies, must be normalized into "" // newAlternate = ""; // } String newAlternate; + Integer newCn; if (alternate.equals("") && copyNumber != null) { // Alternate must be of the form , being xxx the number of copies newAlternate = ""; + newCn = copyNumber; } else { newAlternate = alternate; + newCn = VariantBuilder.getCopyNumberFromAlternate(alternate); } + return new VariantKeyFields(newStart, end, numAllelesIdx, newReference, newAlternate, - null, cn, false); + null, newCn, false); } @@ -1380,34 +1415,24 @@ private int[] getGenotypesReorderingMap(int numAllele, int[] alleleMap) { } } - - private Variant newVariant(Variant variant, VariantKeyFields keyFields, StructuralVariation sv) { + private Variant newVariant(Variant variant, VariantKeyFields keyFields) { Variant normalizedVariant = new Variant(variant.getChromosome(), keyFields.getStart(), keyFields.getEnd(), keyFields.getReference(), keyFields.getAlternate()) .setId(variant.getId()) .setNames(variant.getNames()) .setStrand(variant.getStrand()); - if (sv != null) { - if (normalizedVariant.getSv() != null) { - // CI positions may change during the normalization. Update them. - normalizedVariant.getSv().setCiStartLeft(sv.getCiStartLeft()); - normalizedVariant.getSv().setCiStartRight(sv.getCiStartRight()); - normalizedVariant.getSv().setCiEndLeft(sv.getCiEndLeft()); - normalizedVariant.getSv().setCiEndRight(sv.getCiEndRight()); - normalizedVariant.getSv().setLeftSvInsSeq(sv.getLeftSvInsSeq()); - normalizedVariant.getSv().setRightSvInsSeq(sv.getRightSvInsSeq()); - - // Variant will never have CopyNumber, because the Alternate is normalized from to - normalizedVariant.getSv().setCopyNumber(keyFields.getCopyNumber()); - VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber()); - if (cnvSubtype != null) { - normalizedVariant.setType(cnvSubtype); - } - } + if (keyFields.getSv() != null) { + normalizedVariant.setSv(keyFields.getSv()); } - normalizedVariant.setAnnotation(variant.getAnnotation()); + if (keyFields.getCopyNumber() != null) { + VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber()); + if (cnvSubtype != null) { + normalizedVariant.setType(cnvSubtype); + } + } + return normalizedVariant; // normalizedVariant.setAnnotation(variant.getAnnotation()); // if (isSymbolic(variant)) { @@ -1527,8 +1552,10 @@ public VariantKeyFields(int start, int end, int numAllele, String reference, Str this.alternate = alternate; this.originalKeyFields = originalKeyFields == null ? this : originalKeyFields; this.referenceBlock = referenceBlock; - this.sv = new StructuralVariation(); - setCopyNumber(copyNumber); + this.sv = null; + if (copyNumber != null) { + setCopyNumber(copyNumber); + } } @@ -1604,7 +1631,28 @@ public Integer getCopyNumber() { } public VariantKeyFields setCopyNumber(Integer copyNumber) { - sv.setCopyNumber(copyNumber); + if (sv == null) { + if (copyNumber != null) { + sv = new StructuralVariation(); + sv.setCopyNumber(copyNumber); + sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); + } + } else { + sv.setCopyNumber(copyNumber); + sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); + } + return this; + } + + public VariantKeyFields setBreakend(Breakend breakend) { + if (sv == null) { + if (breakend != null) { + sv = new StructuralVariation(); + sv.setBreakend(breakend); + } + } else { + sv.setBreakend(breakend); + } return this; } diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java index f097d1e1a..e59ad530f 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java @@ -230,7 +230,7 @@ protected Variant newVariant(int position, String ref, String altsCsv) { return newVariant(position, position, ref, Arrays.asList(altsCsv.split(",")), "2"); } - protected Variant newVariant(int start, int end, String ref, String altsCsv) { + protected Variant newVariant(int start, Integer end, String ref, String altsCsv) { return newVariant(start, end, ref, Arrays.asList(altsCsv.split(",")), "2"); } @@ -238,12 +238,16 @@ protected Variant newVariant(int position, String ref, List altsList, St return newVariant(position, position, ref, altsList, studyId); } - protected Variant newVariant(int start, int end, String ref, List altsList, String studyId) { + protected Variant newVariant(int start, Integer end, String ref, List altsList, String studyId) { return newVariantBuilder(start, end, ref, altsList, studyId).build(); } - protected VariantBuilder newVariantBuilder(int position, int end, String ref, List altsList, String studyId) { - return Variant.newBuilder("1", position, end, ref, String.join(",", altsList)) + protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, List altsList, String studyId) { + return newVariantBuilder(position, end, ref, String.join(",", altsList), studyId); + } + + protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, String alts, String studyId) { + return Variant.newBuilder("1", position, end, ref, alts) .setStudyId(studyId) .setSampleDataKeys("GT") .setSamples(new ArrayList<>()) diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index a4a62f061..4253d9405 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -9,6 +9,7 @@ import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import java.util.*; +import java.util.function.Consumer; import java.util.stream.Collectors; import static org.junit.Assert.*; @@ -582,9 +583,7 @@ public void testMultiSNP() throws NonStandardCompliantSampleField { public void testNormalizeMultiAllelicPL() throws NonStandardCompliantSampleField { Variant variant = generateVariantWithFormat("X:100:A:T", "GT:GL", "S01", "0/0", "1,2,3", "S02", "0", "1,2"); - List normalize1 = normalizer.normalize(Collections.singletonList(variant), false); - assertEquals("1,2,3", normalize1.get(0).getStudies().get(0).getSampleData("S01", "GL")); - assertEquals("1,2", normalize1.get(0).getStudies().get(0).getSampleData("S02", "GL")); + normalizeUnmodified(variant); Variant variant2 = generateVariantWithFormat("X:100:A:T,C", "GT:GL", "S01", "0/0", "1,2,3,4,5,6", "S02", "A", "1,2,3"); List normalize2 = normalizer.normalize(Collections.singletonList(variant2), false); @@ -614,14 +613,138 @@ public void testCNVsNormalization() throws Exception { .addSample("HG00096", "0|0") .build(); - List normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true); - assertEquals(1, normalizedVariantList.size()); - assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null, - StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariantList.get(0).getSv()); - // Normalize CNV alternate - assertEquals("", normalizedVariantList.get(0).getAlternate()); - assertEquals("1:86<100<150-150<200<211:C:", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null, + StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationNoNumber() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, null, null, null, null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationNoNumberNoCipos() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(null, null, null, null, null, null, null, null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testCNVsNormalizationUnmodified() throws Exception { + Variant variant = newVariantBuilder(101, 200, "-", Collections.singletonList(""), "2") + .addSample("HG00096", "0|0") + .build(); + + normalizeUnmodified(variant); + } + + @Test + public void testINSsNormalizationWithCIEND() throws Exception { + Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addFileData("LEFT_SVINSSEQ", "AAAA") + .addFileData("RIGHT_SVINSSEQ", "CCCC") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, null, null, null, "AAAA", "CCCC", null, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-50<100<111:C:AAAA...CCCC", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals("1:86<101<150:-:AAAA...CCCC", normalizedVariant.toString()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + @Test + public void testDUPTANDEMNormalization() throws Exception { + Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 150, 211, null, null, null, StructuralVariantType.TANDEM_DUPLICATION, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals("1:86<101<150-150<200<211:-:", normalizedVariant.toString()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + + public void normalizeUnmodified(Variant variant) throws NonStandardCompliantSampleField { + normalizer.setGenerateReferenceBlocks(false); + + int hashCode = variant.hashCode(); + List list = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, list.size()); + Variant normVar = list.get(0); + + assertEquals(variant.toString(), normVar.toString()); + assertEquals("Ensure input variant is not modified", hashCode, variant.hashCode()); + assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode()); + + list = normalizer.normalize(Collections.singletonList(variant), true); + assertEquals(1, list.size()); + normVar = list.get(0); + + assertEquals(variant.toString(), normVar.toString()); + assertEquals("Ensure input variant is not modified", hashCode, variant.hashCode()); + assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode()); + + } + + public void normalizeOne(Variant variant, Consumer consumer) throws NonStandardCompliantSampleField { + normalizer.setGenerateReferenceBlocks(false); + + int hashCode = variant.hashCode(); + List list = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, list.size()); + consumer.accept(list.get(0)); + + int hashCode2 = variant.hashCode(); + + // Check that the original variant has not been modified, and check again, but reusing the input variant + assertEquals("Ensure input variant is not modified", hashCode, hashCode2); + + + list = normalizer.normalize(Collections.singletonList(variant), true); + assertEquals(1, list.size()); + assertSame(variant, list.get(0)); + consumer.accept(variant); + consumer.accept(list.get(0)); + + int hashCode3 = variant.hashCode(); + assertNotEquals(hashCode3, hashCode); } @Test @@ -670,23 +793,40 @@ public void testVNCNormalizationMultiallelic() throws NonStandardCompliantSample @Test public void testCNVsNormalizationCopyNumber() throws NonStandardCompliantSampleField { Variant variant; - List normalizedVariantList; variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2") .setSampleDataKeys("GT", "CN") .addSample("HG00096", "0|1","3") .build(); - normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true); - assertEquals(1, normalizedVariantList.size()); - Variant normalizedVariant = normalizedVariantList.get(0); - assertEquals(new StructuralVariation(null, null, null, null, 3, null, null, - StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); - // Normalize CNV alternate - assertEquals("", normalizedVariant.getAlternate()); - assertEquals(101, normalizedVariant.getStart().intValue()); - assertEquals("", normalizedVariant.getReference()); - assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant->{ + assertEquals(new StructuralVariation(null, null, null, null, 3, null, null, + StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals(101, normalizedVariant.getStart().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); + } + @Test + public void testCNVsNormalizationCopyNumberWithCipos() throws NonStandardCompliantSampleField { + Variant variant; + variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2") + .addFileData("CIPOS", "-10,50") + .setSampleDataKeys("GT", "CN") + .addSample("HG00096", "0|1","3") + .build(); + normalizeOne(variant, normalizedVariant->{ + assertEquals(new StructuralVariation(90, 150, null, null, 3, null, null, + StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); + // Normalize CNV alternate + assertEquals("", normalizedVariant.getAlternate()); + assertEquals(101, normalizedVariant.getStart().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals("1:90<100<150-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); } @Test @@ -725,38 +865,33 @@ public void testNormalizeSV() throws NonStandardCompliantSampleField { @Test public void testNormalizeDEL() throws NonStandardCompliantSampleField { - Variant variant = newVariant(100, 200, "N", Collections.singletonList(""), STUDY_ID); - List normalized = normalizer.normalize(Collections.singletonList(variant), false); - - assertEquals(1, normalized.size()); - assertEquals(101, normalized.get(0).getStart().intValue()); - assertEquals(200, normalized.get(0).getEnd().intValue()); - assertEquals(new StructuralVariation(), normalized.get(0).getSv()); - System.out.println(normalized.get(0).toJson()); + normalizeOne(variant, normalized -> { + assertEquals(101, normalized.getStart().intValue()); + assertEquals(200, normalized.getEnd().intValue()); + assertEquals(new StructuralVariation(), normalized.getSv()); +// System.out.println(normalized.toJson()); + }); } @Test public void testNormalizeINS() throws NonStandardCompliantSampleField { - String seq = "ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTG"; Variant variant = newVariantBuilder(100, 100, "N", Collections.singletonList(""), STUDY_ID) .addFileData("SVINSSEQ", seq) .build(); - List list = new VariantNormalizer().normalize(Collections.singletonList(variant), false); - - assertEquals(1, list.size()); - Variant normalized = list.get(0); - assertEquals(101, normalized.getStart().intValue()); - assertEquals(100, normalized.getEnd().intValue()); - assertEquals(seq.length(), normalized.getLength().intValue()); - assertEquals(seq.length(), normalized.getLengthAlternate().intValue()); - assertEquals(0, normalized.getLengthReference().intValue()); - assertEquals("", normalized.getReference()); - assertEquals(seq, normalized.getAlternate()); - assertEquals(new StructuralVariation(), normalized.getSv()); - assertEquals("1:100-100:N:", normalized.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); - assertEquals(0, normalized.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + normalizeOne(variant, normalizedVariant -> { + assertEquals(101, normalizedVariant.getStart().intValue()); + assertEquals(100, normalizedVariant.getEnd().intValue()); + assertEquals(seq.length(), normalizedVariant.getLength().intValue()); + assertEquals(seq.length(), normalizedVariant.getLengthAlternate().intValue()); + assertEquals(0, normalizedVariant.getLengthReference().intValue()); + assertEquals("", normalizedVariant.getReference()); + assertEquals(seq, normalizedVariant.getAlternate()); + assertEquals(new StructuralVariation(), normalizedVariant.getSv()); + assertEquals("1:100-100:N:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); + assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue()); + }); } @Test @@ -768,66 +903,60 @@ public void testNormalizeSvToIndel() throws NonStandardCompliantSampleField { assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue()); assertNotNull(variant.getSv()); - Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); - assertEquals(VariantType.INDEL, normVar.getType()); - assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue()); - assertNull(normVar.getSv()); - - // Check that the original variant has not been modified, and check again, but reusing the input variant - assertEquals(VariantType.INSERTION, variant.getType()); - assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue()); - assertNotNull(variant.getSv()); - Variant normVarReuse = new VariantNormalizer().normalize(Collections.singletonList(variant), true).get(0); - assertEquals(VariantType.INDEL, normVarReuse.getType()); - assertEquals(Variant.SV_THRESHOLD, normVarReuse.getLengthAlternate().intValue()); - assertNull(normVarReuse.getSv()); - + normalizeOne(variant, normVar -> { + assertEquals(VariantType.INDEL, normVar.getType()); + assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue()); + assertNull(normVar.getSv()); + }); } @Test public void testNormalizeWithInsSeq() throws NonStandardCompliantSampleField { Variant variant = new Variant("1:799984<800001<800022:-:ACCACACCCACACAACACACA...TGTGGTGTGTGTGGTGTG"); - Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); - assertEquals(variant, normVar); - assertEquals(variant.toString(), normVar.toString()); + normalizeUnmodified(variant); } @Test public void testNormalizeBND() throws NonStandardCompliantSampleField { - normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, 99, "A", "A[chr9:10[")); - normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, 99, "A", "[chr22:10[A")); - normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, 99, "A", "A]chr9:10]")); - normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, 99, "A", "]chr22:10]A")); - normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, 99, "A", "]chr22:10]NNNA")); + normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, null, "A", "A[chr9:10[")); + normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, null, "A", "[chr22:10[A")); + normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, null, "A", "A]chr9:10]")); + normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, null, "A", "]chr22:10]A")); + normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, null, "A", "]chr22:10]NNNA")); - normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "A", "[1:10[TA")); - normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "AC", "[1:10[TAC")); + normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "A", "[1:10[TA")); + normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "AC", "[1:10[TAC")); - normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, 99, "TAC", "[1:10[AC")); - normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, 99, "TAC", "TA[1:10[")); + normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, null, "TAC", "[1:10[AC")); + normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, null, "TAC", "TA[1:10[")); + + normalizeBnd(newVariantBuilder(101, 100, "", ".[9:10[", "s1").setCiStart(95, 105).build(), + newVariantBuilder(100, null, "A", "A[chr9:10[", "s1").setCiStart(95,105).setCiEnd(95,105).build()); } private void normalizeBnd(Variant expectedVariant, Variant variant) throws NonStandardCompliantSampleField { - System.out.println("---"); +// System.out.println("---"); boolean expectsNormalization = !expectedVariant.equals(variant); - System.out.println(" - Actual"); - System.out.println(" " + variant.toString()); - System.out.println(" " + variant.toJson()); - System.out.println(" - Expected"); - System.out.println(" " + expectedVariant.toString()); - System.out.println(" " + expectedVariant.toJson()); - System.out.println(" - Normalized (same = " + !expectsNormalization + ")"); - List normalized = normalizer.normalize(Collections.singletonList(variant), false); - - for (Variant v : normalized) { - System.out.println(" " + v.toString()); - System.out.println(" " + v.toJson()); - if (expectsNormalization) { - assertNotNull(v.getStudies().get(0).getFiles().get(0).getCall()); - v.getStudies().get(0).getFiles().get(0).setCall(null); - } - assertEquals(expectedVariant, v); +// System.out.println(" - Actual"); +// System.out.println(" " + variant.toString()); +// System.out.println(" " + variant.toJson()); +// System.out.println(" - Expected"); +// System.out.println(" " + expectedVariant.toString()); +// System.out.println(" " + expectedVariant.toJson()); +// System.out.println(" - Normalized (same = " + !expectsNormalization + ")"); + if (expectsNormalization) { + normalizeOne(variant, normVar -> { + System.out.println(" " + normVar.toString()); + System.out.println(" " + normVar.toJson()); + OriginalCall call = normVar.getStudies().get(0).getFiles().get(0).getCall(); + assertNotNull(call); + normVar.getStudies().get(0).getFiles().get(0).setCall(null); + assertEquals(expectedVariant, normVar); + normVar.getStudies().get(0).getFiles().get(0).setCall(call); + }); + } else { + normalizeUnmodified(variant); } } From a6abc515055de2f42805197c0859852441b91317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 17 Jul 2024 11:03:56 +0100 Subject: [PATCH 2/3] tools: Remove sv.ciEnd from INSERTION and BREAKEND variants. #TASK-6558 --- .../biodata/tools/variant/VariantNormalizer.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index 1b8992c0c..e902ce99f 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -642,8 +642,18 @@ public List normalizeSymbolic(final Integer start, final Integ // CI positions may change during the normalization. Update them. normalizedSv.setCiStartLeft(sv.getCiStartLeft()); normalizedSv.setCiStartRight(sv.getCiStartRight()); - normalizedSv.setCiEndLeft(sv.getCiEndLeft()); - normalizedSv.setCiEndRight(sv.getCiEndRight()); + + // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. + // At this point, we're removing the CIEND from the normalized variant. + // Do not remove the value from the INFO field (if any). + // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start") + if (keyFields.getEnd() < keyFields.getStart()) { + normalizedSv.setCiEndLeft(null); + normalizedSv.setCiEndRight(null); + } else { + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + } normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); From 58bee081bda9506b26f224aad484ad8ad4c38a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 18 Jul 2024 12:38:37 +0100 Subject: [PATCH 3/3] tools: Normalize sv for non-symbolic variants. #TASK-6558 --- .../tools/variant/VariantNormalizer.java | 160 +++++++++++------- .../tools/variant/VariantNormalizerTest.java | 26 +++ .../variant/merge/VariantMergerTest.java | 3 +- 3 files changed, 126 insertions(+), 63 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index e902ce99f..3e16977f5 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -301,19 +301,16 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedVariants.add(variant); continue; } - String reference = variant.getReference(); //Save original values, as they can be changed + //Save original values, as they can be changed + String reference = variant.getReference(); String alternate = variant.getAlternate(); Integer start = variant.getStart(); Integer end = variant.getEnd(); String chromosome = variant.getChromosome(); if (variant.getStudies() == null || variant.getStudies().isEmpty()) { - List keyFieldsList; - if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv()); - } else { - keyFieldsList = normalize(chromosome, start, reference, alternate); - } + List keyFieldsList = normalizeAlleles(variant); + // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele()); @@ -331,25 +328,16 @@ public List normalize(List batch, boolean reuse) throws NonSta normalizedVariants.add(normalizedVariant); } } else { - for (StudyEntry entry : variant.getStudies()) { - List originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); - List alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); - alternates.add(alternate); - originalAlternates.add(alternate); - for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) { - alternates.add(secondaryAlternatesAllele); - originalAlternates.add(secondaryAlternatesAllele); - } + if (variant.getStudies().size() != 1) { + throw new IllegalStateException("Only one study per variant is supported when normalizing variants. Found " + + variant.getStudies().size() + " studies. Variant: " + variant); + } else { + StudyEntry entry = variant.getStudies().get(0); + List alternates = getAllAlternates(variant); // FIXME: assumes there wont be multinucleotide positions with CNVs and short variants mixed - List keyFieldsList; - List originalKeyFieldsList; - if (isSymbolic(variant)) { - keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv()); - } else { - keyFieldsList = normalize(chromosome, start, reference, alternates); - } - originalKeyFieldsList = keyFieldsList + List keyFieldsList = normalizeAlleles(variant); + List originalKeyFieldsList = keyFieldsList .stream() .filter(k -> !k.isReferenceBlock()) .map(k -> k.originalKeyFields) @@ -372,8 +360,8 @@ public List normalize(List batch, boolean reuse) throws NonSta originalCall = entry.getFiles().get(0).getCall().getVariantId(); } else { StringBuilder sb = new StringBuilder(variant.toString()); - for (int i = 1; i < originalAlternates.size(); i++) { - sb.append(",").append(originalAlternates.get(i)); + for (int i = 1; i < alternates.size(); i++) { + sb.append(",").append(alternates.get(i)); } originalCall = sb.toString(); } @@ -600,17 +588,54 @@ private Collection sortByPosition(List keyFi // } // } + protected List normalizeAlleles(Variant variant) { + List alternates = getAllAlternates(variant); + + List keyFieldsList; + if (isSymbolic(variant)) { + keyFieldsList = normalizeSymbolic(variant.getStart(), variant.getEnd(), variant.getReference(), alternates, variant.getSv()); + } else { + keyFieldsList = normalize(variant.getChromosome(), variant.getStart(), variant.getReference(), alternates, variant.getSv()); + } + return keyFieldsList; + } + + private static List getAllAlternates(Variant variant) { + List alternates; + if (variant.getStudies() != null && !variant.getStudies().isEmpty()) { + StudyEntry entry = variant.getStudies().get(0); + String alternate = variant.getAlternate(); + alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); + alternates.add(alternate); + for (AlternateCoordinate secondaryAlternate : entry.getSecondaryAlternates()) { + if (secondaryAlternate.getStart() != null && !secondaryAlternate.getStart().equals(variant.getStart())) { + throw new IllegalStateException("Unable to normalize variant where secondary alternates do not start at the same position. " + + "Variant: " + variant + " , secondaryAlternate: " + secondaryAlternate); + } + if (secondaryAlternate.getEnd() != null && !secondaryAlternate.getEnd().equals(variant.getEnd())) { + throw new IllegalStateException("Unable to normalize variant where secondary alternates do not end at the same position. " + + "Variant: " + variant + " (end=" + variant.getEnd() + ") , secondaryAlternate: " + secondaryAlternate); + } + alternates.add(secondaryAlternate.getAlternate()); + } + } else { + alternates = Collections.singletonList(variant.getAlternate()); + } + return Collections.unmodifiableList(alternates); + } + + @Deprecated // Test purposes only public List normalizeSymbolic(Integer start, Integer end, String reference, String alternate, StructuralVariation sv) { return normalizeSymbolic(start, end, reference, Collections.singletonList(alternate), sv); } - @Deprecated + @Deprecated // Test purposes only public List normalizeSymbolic(final Integer start, final Integer end, final String reference, final List alternates) { return normalizeSymbolic(start, end, reference, alternates, null); } - public List normalizeSymbolic(final Integer start, final Integer end, final String reference, + protected List normalizeSymbolic(final Integer start, final Integer end, final String reference, final List alternates, StructuralVariation sv) { List list = new ArrayList<>(alternates.size()); @@ -634,37 +659,7 @@ public List normalizeSymbolic(final Integer start, final Integ keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION); } - if (sv != null) { - StructuralVariation normalizedSv = keyFields.getSv(); - if (normalizedSv == null) { - normalizedSv = new StructuralVariation(); - } - // CI positions may change during the normalization. Update them. - normalizedSv.setCiStartLeft(sv.getCiStartLeft()); - normalizedSv.setCiStartRight(sv.getCiStartRight()); - - // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. - // At this point, we're removing the CIEND from the normalized variant. - // Do not remove the value from the INFO field (if any). - // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start") - if (keyFields.getEnd() < keyFields.getStart()) { - normalizedSv.setCiEndLeft(null); - normalizedSv.setCiEndRight(null); - } else { - normalizedSv.setCiEndLeft(sv.getCiEndLeft()); - normalizedSv.setCiEndRight(sv.getCiEndRight()); - } - normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); - normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); - - if (keyFields.getSv() == null) { - if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null - || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null - || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { - keyFields.setSv(normalizedSv); - } - } - } + normalizeSvField(sv, keyFields); list.add(keyFields); } @@ -672,6 +667,40 @@ public List normalizeSymbolic(final Integer start, final Integ return list; } + private static void normalizeSvField(StructuralVariation sv, VariantKeyFields keyFields) { + if (sv != null) { + StructuralVariation normalizedSv = keyFields.getSv(); + if (normalizedSv == null) { + normalizedSv = new StructuralVariation(); + } + // CI positions may change during the normalization. Update them. + normalizedSv.setCiStartLeft(sv.getCiStartLeft()); + normalizedSv.setCiStartRight(sv.getCiStartRight()); + + // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND. + // At this point, we're removing the CIEND from the normalized variant. + // Do not remove the value from the INFO field (if any). + // The END is the same as the start (which, in base-1 means that "end == start -1" , so "end < start") + if (keyFields.getEnd() < keyFields.getStart()) { + normalizedSv.setCiEndLeft(null); + normalizedSv.setCiEndRight(null); + } else { + normalizedSv.setCiEndLeft(sv.getCiEndLeft()); + normalizedSv.setCiEndRight(sv.getCiEndRight()); + } + normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq()); + normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq()); + + if (keyFields.getSv() == null) { + if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null + || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null + || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) { + keyFields.setSv(normalizedSv); + } + } + } + } + private boolean isNonRef(String alternate) { return alternate.equals(Allele.NO_CALL_STRING) || alternate.equals(VariantBuilder.NON_REF_ALT) @@ -780,12 +809,17 @@ private VariantKeyFields normalizeSymbolic( } + @Deprecated // Test purposes only public List normalize(String chromosome, int position, String reference, String alternate) { - return normalize(chromosome, position, reference, Collections.singletonList(alternate)); + return normalize(chromosome, position, reference, Collections.singletonList(alternate), null); } - public List normalize(String chromosome, int position, String reference, List alternates) - { + @Deprecated // Test purposes only + public List normalize(String chromosome, int position, String reference, List alternates) { + return normalize(chromosome, position, reference, alternates, null); + } + + protected List normalize(String chromosome, int position, String reference, List alternates, StructuralVariation sv) { List list = new ArrayList<>(alternates.size()); int numAllelesIdx = 0; // This index is necessary for getting the samples where the mutated allele is present @@ -829,6 +863,8 @@ public List normalize(String chromosome, int position, String } } + normalizeSvField(sv, keyFields); + if (keyFields != null) { // To deal with cases such as A>GT diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index 4253d9405..a1faf4869 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -684,6 +684,32 @@ public void testINSsNormalizationWithCIEND() throws Exception { }); } + @Test + public void testNormalizeNonSymbolicInsertion() throws Exception { + Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList("CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"), "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-50,11") + .addSample("HG00096", "0|0") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, null, null, null, null, null, null, null), normalizedVariant.getSv()); + }); + } + + @Test + public void testNormalizeNonSymbolicDeletion() throws Exception { + Variant variant = newVariantBuilder(100, null, "CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "C", "2") + .addFileData("CIPOS", "-14,50") + .addFileData("CIEND", "-1,1") + .addSample("HG00096", "0|1") + .build(); + + normalizeOne(variant, normalizedVariant -> { + assertEquals(new StructuralVariation(86, 150, 179, 181, null, null, null, null, null), normalizedVariant.getSv()); + }); + } + @Test public void testDUPTANDEMNormalization() throws Exception { Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2") diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java index 46ab5800e..07533ab5b 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/merge/VariantMergerTest.java @@ -498,7 +498,8 @@ public void testMergeIndelCase1() throws NonStandardCompliantSampleField { Variant v1 = VariantTestUtils.generateVariantWithFormat("1:328:CTT:C", VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY, "S1", "1/2","PASS"); - v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null,null,331,"CTT", "CTTTC", VariantType.INDEL)); + + v1.getStudies().get(0).getSecondaryAlternates().add(new AlternateCoordinate(null, null, 330, "CTT", "CTTTC", VariantType.INDEL)); Variant v2 = VariantTestUtils.generateVariantWithFormat("1:331:T:TCT", VCFConstants.GENOTYPE_KEY + "," + VCFConstants.GENOTYPE_FILTER_KEY,