diff --git a/.github/workflows/test-analysis.yml b/.github/workflows/test-analysis.yml
index d460871a..e9df51ec 100644
--- a/.github/workflows/test-analysis.yml
+++ b/.github/workflows/test-analysis.yml
@@ -10,11 +10,11 @@ jobs:
name: Test and push Sonar analysis
runs-on: ubuntu-22.04
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: '0'
- name: Set up JDK 11
- uses: actions/setup-java@v3
+ uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: '11'
diff --git a/biodata-external/pom.xml b/biodata-external/pom.xml
index 347042c9..3c1b2425 100644
--- a/biodata-external/pom.xml
+++ b/biodata-external/pom.xml
@@ -6,7 +6,7 @@
biodata
org.opencb.biodata
- 2.12.1
+ 2.12.3-SNAPSHOT
../pom.xml
diff --git a/biodata-formats/pom.xml b/biodata-formats/pom.xml
index f0b0019e..24182d67 100644
--- a/biodata-formats/pom.xml
+++ b/biodata-formats/pom.xml
@@ -22,7 +22,7 @@
org.opencb.biodata
biodata
- 2.12.1
+ 2.12.3-SNAPSHOT
../pom.xml
diff --git a/biodata-models/pom.xml b/biodata-models/pom.xml
index 81a8b17b..1eb260d6 100644
--- a/biodata-models/pom.xml
+++ b/biodata-models/pom.xml
@@ -22,7 +22,7 @@
org.opencb.biodata
biodata
- 2.12.1
+ 2.12.3-SNAPSHOT
../pom.xml
diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java
new file mode 100644
index 00000000..8f8cc712
--- /dev/null
+++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/Snp.java
@@ -0,0 +1,147 @@
+/*
+ *
+ *
+ */
+
+package org.opencb.biodata.models.core;
+
+import java.util.List;
+
+ public class Snp { // Mutable data holder; every field has a getter and a chainable setter
+ private String id;
+ private String chromosome;
+ private int position; // primitive int, so it defaults to 0 when never set
+ private String reference;
+ private List alternates; // NOTE(review): element type is not visible in this view — confirm
+ private String type;
+ private String source;
+ private String version;
+ private SnpAnnotation annotation;
+
+ public Snp() { // no-arg constructor: leaves every field at its Java default
+ }
+
+ public Snp(String id, String chromosome, int position, String reference, List alternates, String type,
+ String source, String version, SnpAnnotation annotation) { // all-fields constructor: stores arguments as given, no copy or validation
+ this.id = id;
+ this.chromosome = chromosome;
+ this.position = position;
+ this.reference = reference;
+ this.alternates = alternates;
+ this.type = type;
+ this.source = source;
+ this.version = version;
+ this.annotation = annotation;
+ }
+
+ @Override
+ public String toString() { // renders all fields as "Snp{...}", wrapping String fields in single quotes
+ final StringBuilder sb = new StringBuilder("Snp{");
+ sb.append("id='").append(id).append('\'');
+ sb.append(", chromosome='").append(chromosome).append('\'');
+ sb.append(", position=").append(position);
+ sb.append(", reference='").append(reference).append('\'');
+ sb.append(", alternates=").append(alternates);
+ sb.append(", type='").append(type).append('\'');
+ sb.append(", source='").append(source).append('\'');
+ sb.append(", version='").append(version).append('\'');
+ sb.append(", annotation=").append(annotation);
+ sb.append('}');
+ return sb.toString();
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public Snp setId(String id) { // like every setter here, returns this so calls can be chained
+ this.id = id;
+ return this;
+ }
+
+ public String getChromosome() {
+ return chromosome;
+ }
+
+ public Snp setChromosome(String chromosome) {
+ this.chromosome = chromosome;
+ return this;
+ }
+
+ public int getPosition() {
+ return position;
+ }
+
+ public Snp setPosition(int position) {
+ this.position = position;
+ return this;
+ }
+
+ public String getReference() {
+ return reference;
+ }
+
+ public Snp setReference(String reference) {
+ this.reference = reference;
+ return this;
+ }
+
+ public List getAlternates() { // returns the stored list reference itself, not a copy
+ return alternates;
+ }
+
+ public Snp setAlternates(List alternates) {
+ this.alternates = alternates;
+ return this;
+ }
+
+ public String getType() {
+ return type;
+ }
+
+ public Snp setType(String type) {
+ this.type = type;
+ return this;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public Snp setSource(String source) {
+ this.source = source;
+ return this;
+ }
+
+ public String getVersion() {
+ return version;
+ }
+
+ public Snp setVersion(String version) {
+ this.version = version;
+ return this;
+ }
+
+ public SnpAnnotation getAnnotation() {
+ return annotation;
+ }
+
+ public Snp setAnnotation(SnpAnnotation annotation) {
+ this.annotation = annotation;
+ return this;
+ }
+ }
diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java
new file mode 100644
index 00000000..16fab718
--- /dev/null
+++ b/biodata-models/src/main/java/org/opencb/biodata/models/core/SnpAnnotation.java
@@ -0,0 +1,79 @@
+/*
+ *
+ *
+ */
+
+package org.opencb.biodata.models.core;
+
+import org.opencb.biodata.models.variant.avro.PopulationFrequency;
+
+import java.util.List;
+import java.util.Map;
+
+ public class SnpAnnotation { // Mutable data holder used as the annotation field of Snp; setters are chainable
+
+ private List flags; // NOTE(review): element type is not visible in this view — confirm
+ private String gene;
+ private List populationFrequencies; // presumably a list of the imported PopulationFrequency type — confirm
+ private Map additionalAttributes; // NOTE(review): key/value types are not visible in this view — confirm
+
+ public SnpAnnotation() { // no-arg constructor: all fields stay null
+ }
+
+ public SnpAnnotation(List flags, String gene, List populationFrequencies, Map additionalAttributes) { // stores arguments as given, no copy or validation
+ this.flags = flags;
+ this.gene = gene;
+ this.populationFrequencies = populationFrequencies;
+ this.additionalAttributes = additionalAttributes;
+ }
+
+ public List getFlags() { // like the other collection getters, returns the stored reference, not a copy
+ return flags;
+ }
+
+ public SnpAnnotation setFlags(List flags) { // like every setter here, returns this so calls can be chained
+ this.flags = flags;
+ return this;
+ }
+
+ public String getGene() {
+ return gene;
+ }
+
+ public SnpAnnotation setGene(String gene) {
+ this.gene = gene;
+ return this;
+ }
+
+ public List getPopulationFrequencies() {
+ return populationFrequencies;
+ }
+
+ public SnpAnnotation setPopulationFrequencies(List populationFrequencies) {
+ this.populationFrequencies = populationFrequencies;
+ return this;
+ }
+
+ public Map getAdditionalAttributes() {
+ return additionalAttributes;
+ }
+
+ public SnpAnnotation setAdditionalAttributes(Map additionalAttributes) {
+ this.additionalAttributes = additionalAttributes;
+ return this;
+ }
+ }
diff --git a/biodata-tools/pom.xml b/biodata-tools/pom.xml
index e2438f80..fdb21cbf 100644
--- a/biodata-tools/pom.xml
+++ b/biodata-tools/pom.xml
@@ -22,7 +22,7 @@
org.opencb.biodata
biodata
- 2.12.1
+ 2.12.3-SNAPSHOT
../pom.xml
@@ -53,6 +53,12 @@
com.databricks
SnpEff
+
+
+ distlib
+ distlib
+
+
org.rocksdb
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java
index 895767e9..54b84cc0 100644
--- a/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/alignment/BamManager.java
@@ -49,6 +49,7 @@
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
/**
@@ -62,7 +63,7 @@ public class BamManager implements AutoCloseable {
public static final int DEFAULT_WINDOW_SIZE = 1;
public static final int MAX_NUM_RECORDS = 50000;
- public static final int MAX_REGION_COVERAGE = 100000;
+ public static final int MAX_REGION_COVERAGE = 500000;
public static final String COVERAGE_BIGWIG_EXTENSION = ".bw";
private Logger logger;
@@ -191,7 +192,10 @@ public Path calculateBigWigCoverage(Path bigWigPath, int windowSize) throws IOEx
return bigWigPath;
}
-
+ /** Returns the text header obtained from the reader's file header.
+ * @deprecated because it relies on getFileHeader().getTextHeader(), which is itself deprecated.
+ */
+ @Deprecated
public String header() {
return samReader.getFileHeader().getTextHeader();
}
@@ -338,7 +342,7 @@ public List getChunks(Region region) {
BAMIndex index = samReader.indexing().getIndex();
return index.getSpanOverlapping(sequenceIndex, start, end).getChunks();
}
- return null;
+ return Collections.emptyList();
}
public List getBreakpoints(Region region) throws IOException {
@@ -378,7 +382,7 @@ public List getBreakpoints(Region region) throws IOException {
}
}
}
- return null;
+ return Collections.emptyList();
}
/**
@@ -445,7 +449,7 @@ public AlignmentGlobalStats stats(Region region, AlignmentFilters fil
return calculateGlobalStats(iterator(region, filters, options));
}
- private AlignmentGlobalStats calculateGlobalStats(BamIterator iterator) throws IOException {
+ private AlignmentGlobalStats calculateGlobalStats(BamIterator iterator) {
AlignmentGlobalStats alignmentGlobalStats = new AlignmentGlobalStats();
SamRecordAlignmentGlobalStatsCalculator calculator = new SamRecordAlignmentGlobalStatsCalculator();
while (iterator.hasNext()) {
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java
index 990d61e8..e902ce99 100644
--- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java
@@ -306,19 +306,18 @@ public List normalize(List batch, boolean reuse) throws NonSta
Integer start = variant.getStart();
Integer end = variant.getEnd();
String chromosome = variant.getChromosome();
- StructuralVariation sv = variant.getSv();
if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
List keyFieldsList;
if (isSymbolic(variant)) {
- keyFieldsList = normalizeSymbolic(start, end, reference, alternate, sv);
+ keyFieldsList = normalizeSymbolic(start, end, reference, alternate, variant.getSv());
} else {
keyFieldsList = normalize(chromosome, start, reference, alternate);
}
// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
OriginalCall call = new OriginalCall(variant.toString(), keyFields.getNumAllele());
- Variant normalizedVariant = newVariant(variant, keyFields, sv);
+ Variant normalizedVariant = newVariant(variant, keyFields);
if (keyFields.getPhaseSet() != null) {
StudyEntry studyEntry = new StudyEntry();
studyEntry.setSamples(
@@ -346,7 +345,7 @@ public List normalize(List batch, boolean reuse) throws NonSta
List keyFieldsList;
List originalKeyFieldsList;
if (isSymbolic(variant)) {
- keyFieldsList = normalizeSymbolic(start, end, reference, alternates, sv);
+ keyFieldsList = normalizeSymbolic(start, end, reference, alternates, variant.getSv());
} else {
keyFieldsList = normalize(chromosome, start, reference, alternates);
}
@@ -400,6 +399,9 @@ public List normalize(List batch, boolean reuse) throws NonSta
variant.setEnd(keyFields.getEnd());
variant.setReference(keyFields.getReference());
variant.setAlternate(keyFields.getAlternate());
+ if (keyFields.getSv() != null) {
+ variant.setSv(keyFields.getSv());
+ }
variant.reset();
// Variant is being reused, must ensure the SV field si appropriately created
// if (isSymbolic(variant)) {
@@ -415,7 +417,7 @@ public List normalize(List batch, boolean reuse) throws NonSta
}
samples = entry.getSamples();
} else {
- normalizedVariant = newVariant(variant, keyFields, sv);
+ normalizedVariant = newVariant(variant, keyFields);
normalizedEntry = new StudyEntry();
normalizedEntry.setStudyId(entry.getStudyId());
@@ -624,6 +626,46 @@ public List normalizeSymbolic(final Integer start, final Integ
Integer copyNumber = sv == null ? null : sv.getCopyNumber();
keyFields = normalizeSymbolic(start, end, reference, alternate, alternates, copyNumber, numAllelesIdx);
}
+
+ if (alternate.equals(VariantBuilder.DUP_TANDEM_ALT)) {
+ if (keyFields.getSv() == null) {
+ keyFields.setSv(new StructuralVariation());
+ }
+ keyFields.getSv().setType(StructuralVariantType.TANDEM_DUPLICATION);
+ }
+
+ if (sv != null) {
+ StructuralVariation normalizedSv = keyFields.getSv();
+ if (normalizedSv == null) {
+ normalizedSv = new StructuralVariation();
+ }
+ // CI positions may change during the normalization. Update them.
+ normalizedSv.setCiStartLeft(sv.getCiStartLeft());
+ normalizedSv.setCiStartRight(sv.getCiStartRight());
+
+ // Structural variants that affect a single point (INSERTIONS or Breakends) should not have CIEND.
+ // At this point, we're removing the CIEND from the normalized variant.
+ // Do not remove the value from the INFO field (if any).
+ // Here END and START denote the same point which, in 1-based coordinates, is encoded as "end == start - 1", i.e. "end < start".
+ if (keyFields.getEnd() < keyFields.getStart()) {
+ normalizedSv.setCiEndLeft(null);
+ normalizedSv.setCiEndRight(null);
+ } else {
+ normalizedSv.setCiEndLeft(sv.getCiEndLeft());
+ normalizedSv.setCiEndRight(sv.getCiEndRight());
+ }
+ normalizedSv.setLeftSvInsSeq(sv.getLeftSvInsSeq());
+ normalizedSv.setRightSvInsSeq(sv.getRightSvInsSeq());
+
+ if (keyFields.getSv() == null) {
+ if (normalizedSv.getCiStartLeft() != null || normalizedSv.getCiStartRight() != null
+ || normalizedSv.getCiEndLeft() != null || normalizedSv.getCiEndRight() != null
+ || normalizedSv.getLeftSvInsSeq() != null || normalizedSv.getRightSvInsSeq() != null) {
+ keyFields.setSv(normalizedSv);
+ }
+ }
+ }
+
list.add(keyFields);
}
@@ -695,7 +737,7 @@ private static VariantKeyFields normalizeMateBreakend(
}
VariantKeyFields keyFields = new VariantKeyFields(newStart, newStart - 1, numAllelesIdx, newReference, newAlternate);
- keyFields.getSv().setBreakend(breakend);
+ keyFields.setBreakend(breakend);
return keyFields;
}
@@ -718,20 +760,23 @@ private VariantKeyFields normalizeSymbolic(
+ "contain 0 or 1 nt, but no more. Please, check.");
}
- Integer cn = VariantBuilder.getCopyNumberFromAlternate(alternate);
// if (cn != null) {
// // Alternate with the form , being xxx the number of copies, must be normalized into ""
// newAlternate = "";
// }
String newAlternate;
+ Integer newCn;
if (alternate.equals("") && copyNumber != null) {
// Alternate must be of the form , being xxx the number of copies
newAlternate = "";
+ newCn = copyNumber;
} else {
newAlternate = alternate;
+ newCn = VariantBuilder.getCopyNumberFromAlternate(alternate);
}
+
return new VariantKeyFields(newStart, end, numAllelesIdx, newReference, newAlternate,
- null, cn, false);
+ null, newCn, false);
}
@@ -1380,32 +1425,24 @@ private int[] getGenotypesReorderingMap(int numAllele, int[] alleleMap) {
}
}
-
- private Variant newVariant(Variant variant, VariantKeyFields keyFields, StructuralVariation sv) {
+ private Variant newVariant(Variant variant, VariantKeyFields keyFields) {
Variant normalizedVariant = new Variant(variant.getChromosome(), keyFields.getStart(), keyFields.getEnd(), keyFields.getReference(), keyFields.getAlternate())
.setId(variant.getId())
.setNames(variant.getNames())
.setStrand(variant.getStrand());
- if (sv != null) {
- if (normalizedVariant.getSv() != null) {
- // CI positions may change during the normalization. Update them.
- normalizedVariant.getSv().setCiStartLeft(sv.getCiStartLeft());
- normalizedVariant.getSv().setCiStartRight(sv.getCiStartRight());
- normalizedVariant.getSv().setCiEndLeft(sv.getCiEndLeft());
- normalizedVariant.getSv().setCiEndRight(sv.getCiEndRight());
-
- // Variant will never have CopyNumber, because the Alternate is normalized from to
- normalizedVariant.getSv().setCopyNumber(keyFields.getCopyNumber());
- VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber());
- if (cnvSubtype != null) {
- normalizedVariant.setType(cnvSubtype);
- }
- }
+ if (keyFields.getSv() != null) {
+ normalizedVariant.setSv(keyFields.getSv());
}
-
normalizedVariant.setAnnotation(variant.getAnnotation());
+ if (keyFields.getCopyNumber() != null) {
+ VariantType cnvSubtype = VariantBuilder.getCopyNumberSubtype(keyFields.getCopyNumber());
+ if (cnvSubtype != null) {
+ normalizedVariant.setType(cnvSubtype);
+ }
+ }
+
return normalizedVariant;
// normalizedVariant.setAnnotation(variant.getAnnotation());
// if (isSymbolic(variant)) {
@@ -1525,8 +1562,10 @@ public VariantKeyFields(int start, int end, int numAllele, String reference, Str
this.alternate = alternate;
this.originalKeyFields = originalKeyFields == null ? this : originalKeyFields;
this.referenceBlock = referenceBlock;
- this.sv = new StructuralVariation();
- setCopyNumber(copyNumber);
+ this.sv = null;
+ if (copyNumber != null) {
+ setCopyNumber(copyNumber);
+ }
}
@@ -1602,7 +1641,28 @@ public Integer getCopyNumber() {
}
public VariantKeyFields setCopyNumber(Integer copyNumber) {
- sv.setCopyNumber(copyNumber);
+ if (sv == null) { // no StructuralVariation yet: create one only if there is a value to store
+ if (copyNumber != null) {
+ sv = new StructuralVariation();
+ sv.setCopyNumber(copyNumber);
+ sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); // the SV type is derived from the copy number
+ }
+ } else { // an SV already exists: overwrite copy number and derived type, even when copyNumber is null
+ sv.setCopyNumber(copyNumber);
+ sv.setType(VariantBuilder.getCNVSubtype(copyNumber)); // NOTE(review): reachable with a null copyNumber and replaces any previously set type — confirm getCNVSubtype handles null
+ }
+ return this; // chainable
+ }
+
+ public VariantKeyFields setBreakend(Breakend breakend) { // stores the breakend in sv, creating sv lazily; chainable
+ if (sv == null) {
+ if (breakend != null) { // a null breakend never forces creation of a StructuralVariation
+ sv = new StructuralVariation();
+ sv.setBreakend(breakend);
+ }
+ } else {
+ sv.setBreakend(breakend); // an SV already exists: store the value as given, including null
+ }
return this;
}
diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java
index f097d1e1..e59ad530 100644
--- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java
+++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerGenericTest.java
@@ -230,7 +230,7 @@ protected Variant newVariant(int position, String ref, String altsCsv) {
return newVariant(position, position, ref, Arrays.asList(altsCsv.split(",")), "2");
}
- protected Variant newVariant(int start, int end, String ref, String altsCsv) {
+ protected Variant newVariant(int start, Integer end, String ref, String altsCsv) {
return newVariant(start, end, ref, Arrays.asList(altsCsv.split(",")), "2");
}
@@ -238,12 +238,16 @@ protected Variant newVariant(int position, String ref, List altsList, St
return newVariant(position, position, ref, altsList, studyId);
}
- protected Variant newVariant(int start, int end, String ref, List altsList, String studyId) {
+ protected Variant newVariant(int start, Integer end, String ref, List altsList, String studyId) {
return newVariantBuilder(start, end, ref, altsList, studyId).build();
}
- protected VariantBuilder newVariantBuilder(int position, int end, String ref, List altsList, String studyId) {
- return Variant.newBuilder("1", position, end, ref, String.join(",", altsList))
+ protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, List altsList, String studyId) {
+ return newVariantBuilder(position, end, ref, String.join(",", altsList), studyId);
+ }
+
+ protected VariantBuilder newVariantBuilder(int position, Integer end, String ref, String alts, String studyId) {
+ return Variant.newBuilder("1", position, end, ref, alts)
.setStudyId(studyId)
.setSampleDataKeys("GT")
.setSamples(new ArrayList<>())
diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java
index 95265190..4253d940 100644
--- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java
+++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java
@@ -9,6 +9,7 @@
import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField;
import java.util.*;
+import java.util.function.Consumer;
import java.util.stream.Collectors;
import static org.junit.Assert.*;
@@ -582,9 +583,7 @@ public void testMultiSNP() throws NonStandardCompliantSampleField {
public void testNormalizeMultiAllelicPL() throws NonStandardCompliantSampleField {
Variant variant = generateVariantWithFormat("X:100:A:T", "GT:GL", "S01", "0/0", "1,2,3", "S02", "0", "1,2");
- List normalize1 = normalizer.normalize(Collections.singletonList(variant), false);
- assertEquals("1,2,3", normalize1.get(0).getStudies().get(0).getSampleData("S01", "GL"));
- assertEquals("1,2", normalize1.get(0).getStudies().get(0).getSampleData("S02", "GL"));
+ normalizeUnmodified(variant);
Variant variant2 = generateVariantWithFormat("X:100:A:T,C", "GT:GL", "S01", "0/0", "1,2,3,4,5,6", "S02", "A", "1,2,3");
List normalize2 = normalizer.normalize(Collections.singletonList(variant2), false);
@@ -614,14 +613,138 @@ public void testCNVsNormalization() throws Exception {
.addSample("HG00096", "0|0")
.build();
- List normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true);
- assertEquals(1, normalizedVariantList.size());
- assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null,
- StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariantList.get(0).getSv());
- // Normalize CNV alternate
- assertEquals("", normalizedVariantList.get(0).getAlternate());
- assertEquals("1:86<100<150-150<200<211:C:", normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getVariantId());
- assertEquals(0, normalizedVariantList.get(0).getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(new StructuralVariation(86, 150, 150, 211, 0, null, null,
+ StructuralVariantType.COPY_NUMBER_LOSS, null), normalizedVariant.getSv());
+ // Normalize CNV alternate
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+
+ @Test
+ public void testCNVsNormalizationNoNumber() throws Exception { // start=100, end=200 with CIPOS/CIEND file data
+ Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2")
+ .addFileData("CIPOS", "-14,50")
+ .addFileData("CIEND", "-50,11")
+ .addSample("HG00096", "0|0")
+ .build();
+
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(new StructuralVariation(86, 150, 150, 211, null, null, null, null, null), normalizedVariant.getSv()); // 86/150 = 100-14/100+50, 150/211 = 200-50/200+11; all other SV fields expected null
+ // Normalize CNV alternate
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); // the original call keeps the un-normalized start (100) and reference (C)
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+
+ @Test
+ public void testCNVsNormalizationNoNumberNoCipos() throws Exception { // same input shape as above but without CIPOS/CIEND file data
+ Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2")
+ .addSample("HG00096", "0|0")
+ .build();
+
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(new StructuralVariation(null, null, null, null, null, null, null, null, null), normalizedVariant.getSv()); // expects a non-null StructuralVariation whose fields are all null
+ // Normalize CNV alternate
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+
+ @Test
+ public void testCNVsNormalizationUnmodified() throws Exception { // input built with start=101 and reference "-"; expected to pass through normalization unchanged
+ Variant variant = newVariantBuilder(101, 200, "-", Collections.singletonList(""), "2")
+ .addSample("HG00096", "0|0")
+ .build();
+
+ normalizeUnmodified(variant); // asserts toString() and hashCode() are preserved for reuse=false and reuse=true
+ }
+
+ @Test
+ public void testINSsNormalizationWithCIEND() throws Exception { // input has a null end plus CIPOS, CIEND and left/right SVINSSEQ file data
+ Variant variant = newVariantBuilder(100, null, "C", Collections.singletonList(""), "2")
+ .addFileData("CIPOS", "-14,50")
+ .addFileData("CIEND", "-50,11")
+ .addFileData("LEFT_SVINSSEQ", "AAAA")
+ .addFileData("RIGHT_SVINSSEQ", "CCCC")
+ .addSample("HG00096", "0|0")
+ .build();
+
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(new StructuralVariation(86, 150, null, null, null, "AAAA", "CCCC", null, null), normalizedVariant.getSv()); // CIEND was supplied, yet the 3rd/4th arguments (presumably the end CI) are expected to be null
+ // Alternate allele expected after normalization
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals("1:86<100<150-50<100<111:C:AAAA...CCCC", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId()); // the original call string still carries the CIEND-derived bounds (50, 111)
+ assertEquals("1:86<101<150:-:AAAA...CCCC", normalizedVariant.toString()); // the normalized variant string has no end-CI section
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+
+ @Test
+ public void testDUPTANDEMNormalization() throws Exception { // start=100, end=200 with CIPOS/CIEND file data; expects the TANDEM_DUPLICATION SV type
+ Variant variant = newVariantBuilder(100, 200, "C", Collections.singletonList(""), "2")
+ .addFileData("CIPOS", "-14,50")
+ .addFileData("CIEND", "-50,11")
+ .addSample("HG00096", "0|0")
+ .build();
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(new StructuralVariation(86, 150, 150, 211, null, null, null, StructuralVariantType.TANDEM_DUPLICATION, null), normalizedVariant.getSv()); // CI bounds as computed from CIPOS/CIEND, plus the tandem-duplication type
+ // Alternate allele expected after normalization
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals("1:86<100<150-150<200<211:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals("1:86<101<150-150<200<211:-:", normalizedVariant.toString()); // normalized form: start moves from 100 to 101 and the reference is shown as "-"
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+
+ public void normalizeUnmodified(Variant variant) throws NonStandardCompliantSampleField { // helper: asserts that normalization leaves the variant unchanged, with and without reuse
+ normalizer.setGenerateReferenceBlocks(false);
+
+ int hashCode = variant.hashCode(); // baseline hash taken before any normalization
+ List list = normalizer.normalize(Collections.singletonList(variant), false); // first pass: reuse=false
+ assertEquals(1, list.size());
+ Variant normVar = list.get(0);
+
+ assertEquals(variant.toString(), normVar.toString());
+ assertEquals("Ensure input variant is not modified", hashCode, variant.hashCode());
+ assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode());
+
+ list = normalizer.normalize(Collections.singletonList(variant), true); // second pass: reuse=true, same expectations
+ assertEquals(1, list.size());
+ normVar = list.get(0);
+
+ assertEquals(variant.toString(), normVar.toString());
+ assertEquals("Ensure input variant is not modified", hashCode, variant.hashCode());
+ assertEquals("Ensure norm variant is not modified", hashCode, normVar.hashCode());
+
+ }
+
+ public void normalizeOne(Variant variant, Consumer consumer) throws NonStandardCompliantSampleField { // helper: normalizes one variant twice (reuse=false, then reuse=true) and runs the consumer's assertions on each result
+ normalizer.setGenerateReferenceBlocks(false);
+
+ int hashCode = variant.hashCode(); // baseline hash of the input before any normalization
+ List list = normalizer.normalize(Collections.singletonList(variant), false); // first pass: reuse=false
+ assertEquals(1, list.size());
+ consumer.accept(list.get(0));
+
+ int hashCode2 = variant.hashCode();
+
+ // Check that the original variant has not been modified, and check again, but reusing the input variant
+ assertEquals("Ensure input variant is not modified", hashCode, hashCode2);
+
+
+ list = normalizer.normalize(Collections.singletonList(variant), true); // second pass: reuse=true
+ assertEquals(1, list.size());
+ assertSame(variant, list.get(0)); // with reuse the returned element must be the input instance itself
+ consumer.accept(variant);
+ consumer.accept(list.get(0)); // same instance as above (see assertSame), so the assertions run a second time on it
+
+ int hashCode3 = variant.hashCode();
+ assertNotEquals(hashCode3, hashCode); // the reused input must now differ from its baseline, so this helper only suits inputs that normalization changes
}
@Test
@@ -670,23 +793,40 @@ public void testVNCNormalizationMultiallelic() throws NonStandardCompliantSample
@Test
public void testCNVsNormalizationCopyNumber() throws NonStandardCompliantSampleField {
Variant variant;
- List normalizedVariantList;
variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2")
.setSampleDataKeys("GT", "CN")
.addSample("HG00096", "0|1","3")
.build();
- normalizedVariantList = normalizer.normalize(Collections.singletonList(variant), true);
- assertEquals(1, normalizedVariantList.size());
- Variant normalizedVariant = normalizedVariantList.get(0);
- assertEquals(new StructuralVariation(null, null, null, null, 3, null, null,
- StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv());
- // Normalize CNV alternate
- assertEquals("", normalizedVariant.getAlternate());
- assertEquals(101, normalizedVariant.getStart().intValue());
- assertEquals("", normalizedVariant.getReference());
- assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
- assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ normalizeOne(variant, normalizedVariant->{
+ assertEquals(new StructuralVariation(null, null, null, null, 3, null, null,
+ StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv());
+ // Normalize CNV alternate
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals(101, normalizedVariant.getStart().intValue());
+ assertEquals("", normalizedVariant.getReference());
+ assertEquals("1:100-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
+ }
+ @Test
+ public void testCNVsNormalizationCopyNumberWithCipos() throws NonStandardCompliantSampleField { // copy number supplied through the CN sample field, plus CIPOS only (no CIEND)
+ Variant variant;
+ variant = newVariantBuilder(100, 200, "C", Arrays.asList(""), "2")
+ .addFileData("CIPOS", "-10,50")
+ .setSampleDataKeys("GT", "CN")
+ .addSample("HG00096", "0|1","3")
+ .build();
+ normalizeOne(variant, normalizedVariant->{
+ assertEquals(new StructuralVariation(90, 150, null, null, 3, null, null,
+ StructuralVariantType.COPY_NUMBER_GAIN, null), normalizedVariant.getSv()); // 90/150 = 100-10/100+50; copy number 3 is expected to yield COPY_NUMBER_GAIN
+ // Normalize CNV alternate
+ assertEquals("", normalizedVariant.getAlternate());
+ assertEquals(101, normalizedVariant.getStart().intValue()); // start moves from 100 to 101 and the reference becomes empty
+ assertEquals("", normalizedVariant.getReference());
+ assertEquals("1:90<100<150-200:C:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
}
@Test
@@ -725,38 +865,33 @@ public void testNormalizeSV() throws NonStandardCompliantSampleField {
@Test
public void testNormalizeDEL() throws NonStandardCompliantSampleField {
-
Variant variant = newVariant(100, 200, "N", Collections.singletonList(""), STUDY_ID);
- List normalized = normalizer.normalize(Collections.singletonList(variant), false);
-
- assertEquals(1, normalized.size());
- assertEquals(101, normalized.get(0).getStart().intValue());
- assertEquals(200, normalized.get(0).getEnd().intValue());
- assertEquals(new StructuralVariation(), normalized.get(0).getSv());
- System.out.println(normalized.get(0).toJson());
+ normalizeOne(variant, normalized -> {
+ assertEquals(101, normalized.getStart().intValue());
+ assertEquals(200, normalized.getEnd().intValue());
+ assertEquals(new StructuralVariation(), normalized.getSv());
+// System.out.println(normalized.toJson());
+ });
}
@Test
public void testNormalizeINS() throws NonStandardCompliantSampleField {
-
String seq = "ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTG";
Variant variant = newVariantBuilder(100, 100, "N", Collections.singletonList(""), STUDY_ID)
.addFileData("SVINSSEQ", seq)
.build();
- List list = new VariantNormalizer().normalize(Collections.singletonList(variant), false);
-
- assertEquals(1, list.size());
- Variant normalized = list.get(0);
- assertEquals(101, normalized.getStart().intValue());
- assertEquals(100, normalized.getEnd().intValue());
- assertEquals(seq.length(), normalized.getLength().intValue());
- assertEquals(seq.length(), normalized.getLengthAlternate().intValue());
- assertEquals(0, normalized.getLengthReference().intValue());
- assertEquals("", normalized.getReference());
- assertEquals(seq, normalized.getAlternate());
- assertEquals(new StructuralVariation(), normalized.getSv());
- assertEquals("1:100-100:N:", normalized.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
- assertEquals(0, normalized.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ normalizeOne(variant, normalizedVariant -> {
+ assertEquals(101, normalizedVariant.getStart().intValue());
+ assertEquals(100, normalizedVariant.getEnd().intValue());
+ assertEquals(seq.length(), normalizedVariant.getLength().intValue());
+ assertEquals(seq.length(), normalizedVariant.getLengthAlternate().intValue());
+ assertEquals(0, normalizedVariant.getLengthReference().intValue());
+ assertEquals("", normalizedVariant.getReference());
+ assertEquals(seq, normalizedVariant.getAlternate());
+ assertEquals(new StructuralVariation(), normalizedVariant.getSv());
+ assertEquals("1:100-100:N:", normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getVariantId());
+ assertEquals(0, normalizedVariant.getStudies().get(0).getFiles().get(0).getCall().getAlleleIndex().intValue());
+ });
}
@Test
@@ -768,58 +903,60 @@ public void testNormalizeSvToIndel() throws NonStandardCompliantSampleField {
assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue());
assertNotNull(variant.getSv());
- Variant normVar = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0);
- assertEquals(VariantType.INDEL, normVar.getType());
- assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue());
- assertNull(normVar.getSv());
-
- // Check that the original variant has not been modified, and check again, but reusing the input variant
- assertEquals(VariantType.INSERTION, variant.getType());
- assertEquals(Variant.SV_THRESHOLD + 1, variant.getLengthAlternate().intValue());
- assertNotNull(variant.getSv());
- Variant normVarReuse = new VariantNormalizer().normalize(Collections.singletonList(variant), true).get(0);
- assertEquals(VariantType.INDEL, normVarReuse.getType());
- assertEquals(Variant.SV_THRESHOLD, normVarReuse.getLengthAlternate().intValue());
- assertNull(normVarReuse.getSv());
+ normalizeOne(variant, normVar -> {
+ assertEquals(VariantType.INDEL, normVar.getType());
+ assertEquals(Variant.SV_THRESHOLD, normVar.getLengthAlternate().intValue());
+ assertNull(normVar.getSv());
+ });
+ }
+ @Test
+ public void testNormalizeWithInsSeq() throws NonStandardCompliantSampleField {
+ Variant variant = new Variant("1:799984<800001<800022:-:ACCACACCCACACAACACACA...TGTGGTGTGTGTGGTGTG");
+ normalizeUnmodified(variant);
}
@Test
public void testNormalizeBND() throws NonStandardCompliantSampleField {
- normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, 99, "A", "A[chr9:10["));
- normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, 99, "A", "[chr22:10[A"));
- normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, 99, "A", "A]chr9:10]"));
- normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, 99, "A", "]chr22:10]A"));
- normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, 99, "A", "]chr22:10]NNNA"));
+ normalizeBnd(newVariant(101, 100, "", ".[9:10["), newVariant(100, null, "A", "A[chr9:10["));
+ normalizeBnd(newVariant(100, 99, "", "[22:10[."), newVariant(100, null, "A", "[chr22:10[A"));
+ normalizeBnd(newVariant(101, 100, "", ".]9:10]"), newVariant(100, null, "A", "A]chr9:10]"));
+ normalizeBnd(newVariant(100, 99, "", "]22:10]."), newVariant(100, null, "A", "]chr22:10]A"));
+ normalizeBnd(newVariant(100, 99, "", "]22:10]NNN"), newVariant(100, null, "A", "]chr22:10]NNNA"));
- normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "A", "[1:10[TA"));
- normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, 99, "AC", "[1:10[TAC"));
+ normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "A", "[1:10[TA"));
+ normalizeBnd(newVariant(100, 99, "", "[1:10[T"), newVariant(100, null, "AC", "[1:10[TAC"));
- normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, 99, "TAC", "[1:10[AC"));
- normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, 99, "TAC", "TA[1:10["));
+ normalizeBnd(newVariant(100, 99, "TAC", "[1:10[AC"), newVariant(100, null, "TAC", "[1:10[AC"));
+ normalizeBnd(newVariant(100, 99, "TAC", "TA[1:10["), newVariant(100, null, "TAC", "TA[1:10["));
+
+ normalizeBnd(newVariantBuilder(101, 100, "", ".[9:10[", "s1").setCiStart(95, 105).build(),
+ newVariantBuilder(100, null, "A", "A[chr9:10[", "s1").setCiStart(95,105).setCiEnd(95,105).build());
}
private void normalizeBnd(Variant expectedVariant, Variant variant) throws NonStandardCompliantSampleField {
- System.out.println("---");
+// System.out.println("---");
boolean expectsNormalization = !expectedVariant.equals(variant);
- System.out.println(" - Actual");
- System.out.println(" " + variant.toString());
- System.out.println(" " + variant.toJson());
- System.out.println(" - Expected");
- System.out.println(" " + expectedVariant.toString());
- System.out.println(" " + expectedVariant.toJson());
- System.out.println(" - Normalized (same = " + !expectsNormalization + ")");
- List normalized = normalizer.normalize(Collections.singletonList(variant), false);
-
- for (Variant v : normalized) {
- System.out.println(" " + v.toString());
- System.out.println(" " + v.toJson());
- if (expectsNormalization) {
- assertNotNull(v.getStudies().get(0).getFiles().get(0).getCall());
- v.getStudies().get(0).getFiles().get(0).setCall(null);
- }
- assertEquals(expectedVariant, v);
+// System.out.println(" - Actual");
+// System.out.println(" " + variant.toString());
+// System.out.println(" " + variant.toJson());
+// System.out.println(" - Expected");
+// System.out.println(" " + expectedVariant.toString());
+// System.out.println(" " + expectedVariant.toJson());
+// System.out.println(" - Normalized (same = " + !expectsNormalization + ")");
+ if (expectsNormalization) {
+ normalizeOne(variant, normVar -> {
+ System.out.println(" " + normVar.toString());
+ System.out.println(" " + normVar.toJson());
+ OriginalCall call = normVar.getStudies().get(0).getFiles().get(0).getCall();
+ assertNotNull(call);
+ normVar.getStudies().get(0).getFiles().get(0).setCall(null);
+ assertEquals(expectedVariant, normVar);
+ normVar.getStudies().get(0).getFiles().get(0).setCall(call);
+ });
+ } else {
+ normalizeUnmodified(variant);
}
}
diff --git a/pom.xml b/pom.xml
index 97688342..6166063d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,7 +22,7 @@
org.opencb.biodata
biodata
- 2.12.1
+ 2.12.3-SNAPSHOT
pom
Biodata
@@ -38,7 +38,7 @@
- 4.12.0
+ 4.12.1-SNAPSHOT
2.11.4
4.4
1.7.7
@@ -198,6 +198,12 @@
com.databricks
SnpEff
${SnpEff.version}
+
+
+ distlib
+ distlib
+
+
com.google.guava