From a42ad75d20cb8ae18f8f0c47940cdceaa97f10cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 4 Nov 2024 22:20:36 +0000 Subject: [PATCH 1/4] tools: Fix "Duplicate allele" while exporting to VCF. #TASK-4682 --- .../converters/VariantContextConverter.java | 54 ++++++++++++++++-- .../VariantAvroToVariantContextConverter.java | 5 +- ...VariantProtoToVariantContextConverter.java | 5 +- .../VariantContextConverterTest.java | 57 +++++++++++++++++-- 4 files changed, 109 insertions(+), 12 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java index e6ab3ad9a..c8c28e4fa 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java @@ -27,7 +27,10 @@ import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.commons.Converter; +import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter; import org.opencb.commons.datastore.core.ObjectMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.text.DecimalFormat; import java.util.*; @@ -38,6 +41,7 @@ * Created by jtarraga on 07/02/17. */ public abstract class VariantContextConverter implements Converter { + private final Logger logger = LoggerFactory.getLogger(VariantAvroToVariantContextConverter.class); public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######"); public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###"); @@ -246,7 +250,7 @@ protected double getQuality(List> fileAttributes) { } } catch (NumberFormatException e) { // Nothing to do - e.getMessage(); + logger.warn("Invalid QUAL value found: " + attrs.get(StudyEntry.QUAL)); } } } @@ -254,9 +258,10 @@ protected double getQuality(List> fileAttributes) { return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual)); } - protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData) { + protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData, Set duplicatedAlleles) { String refAllele = alleleList.get(0); Set noCallAlleles = getNoCallAlleleIdx(alleleList); + Map finalAlleleMap = getDedupAlleleMap(alleleList); List genotypes = new ArrayList<>(); if (this.sampleNames != null) { @@ -289,6 +294,14 @@ protected List getGenotypes(List alleleList, List samp case "AD": if (StringUtils.isNotEmpty(value) && !value.equals(".")) { int[] ad = getInts(value); + if (!duplicatedAlleles.isEmpty() && ad.length == alleleList.size()) { + int[] finalAD = new int[finalAlleleMap.size()]; + for (int i = 0; i < ad.length; i++) { + finalAD[finalAlleleMap.get(alleleList.get(i))] += ad[i]; + } + genotypeBuilder.AD(finalAD); + ad = finalAD; + } genotypeBuilder.AD(ad); } else { genotypeBuilder.noAD(); @@ -341,7 +354,7 @@ private int[] getInts(String value) { return ints; } - protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, boolean isNoVariation, Set filters, double qual, ObjectMap attributes, List genotypes) { + protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, boolean isNoVariation, Set filters, double qual, ObjectMap attributes, List genotypes, Set duplicatedAlleles) { String refAllele = alleleList.get(0); VariantContextBuilder variantContextBuilder = new VariantContextBuilder() .chr(chromosome) @@ -355,7 +368,12 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en if (isNoVariation && alleleList.get(1).isEmpty()) { variantContextBuilder.alleles(refAllele); } else { - variantContextBuilder.alleles(alleleList.stream().filter(a -> !a.equals(NO_CALL_ALLELE)).collect(Collectors.toList())); + List finalAlleles = alleleList.stream() + .filter(a -> !a.equals(NO_CALL_ALLELE)) + .collect(Collectors.toList()); + // Remove first occurrence of duplicated allele + duplicatedAlleles.forEach(finalAlleles::remove); + variantContextBuilder.alleles(finalAlleles); } if (genotypes.isEmpty()) { @@ -372,6 +390,34 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en return variantContextBuilder.make(); } + protected Map getDedupAlleleMap(List alleleList) { + Map finalAlleleIdxMap = new HashMap<>(); + for (String allele : alleleList) { + // Assign an index to each unique allele + finalAlleleIdxMap.putIfAbsent(allele, finalAlleleIdxMap.size()); + } + return finalAlleleIdxMap; + } + + protected Set getDuplicatedAlleles(String chromosome, int start, List alleleList) { + Set duplicatedAlleles; + if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) { + Set allelesSet = new HashSet<>(); + + duplicatedAlleles = new HashSet<>(); + for (String allele : alleleList) { + if (!allelesSet.add(allele)) { + duplicatedAlleles.add(allele); + } + } + logger.warn("Duplicated alleles found in variant " + chromosome + ":" + start + " : Denormalized alleles" + alleleList + + " , duplicated alleles: " + duplicatedAlleles); + } else { + duplicatedAlleles = Collections.emptySet(); + } + return duplicatedAlleles; + } + protected abstract Object getStudy(T variant); protected abstract Iterator getStudiesId(T variant); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java index 78fde6101..74c24b072 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java @@ -63,6 +63,7 @@ public VariantContext convert(Variant variant) { int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); + Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); boolean isNoVariation = type.equals(VariantType.NO_VARIATION); // ID @@ -102,9 +103,9 @@ public VariantContext convert(Variant variant) { } // SAMPLES - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, duplicatedAlleles); - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); } /** diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java index 7904f8e23..61d862452 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java @@ -60,6 +60,7 @@ public VariantContext convert(VariantProto.Variant variant) { int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); + Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION); // ID @@ -96,9 +97,9 @@ public VariantContext convert(VariantProto.Variant variant) { // SAMPLES BiFunction getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id); - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, duplicatedAlleles); - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); } public String getSampleData(VariantProto.StudyEntry studyEntry, Map formatPositions, String sampleName, String field) { diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index 942090f10..e54c5141c 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -1,23 +1,27 @@ package org.opencb.biodata.tools.variant.converters; -import org.apache.commons.lang3.StringUtils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang3.tuple.Pair; import org.junit.Test; +import org.opencb.biodata.formats.variant.vcf4.VcfUtils; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.AlternateCoordinate; -import org.opencb.biodata.models.variant.avro.FileEntry; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter; +import org.opencb.biodata.tools.variant.merge.VariantMerger; +import java.io.ByteArrayOutputStream; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.*; /** * Created on 29/11/17. @@ -26,6 +30,51 @@ */ public class VariantContextConverterTest { + @Test + public void testDuplicatedAllele() throws NonStandardCompliantSampleField { + String studyId = "s"; + Variant variant = Variant.newBuilder("1", 1000, null, "AGTATATTGT", "A") + .setStudyId(studyId) + .setSampleDataKeys("GT", "AD") + .addSample("s1", "1/1", "10,10") + .addSample("s2", "0/1", "0,10") + .build(); + Variant variant2 = Variant.newBuilder("1", 1002, null, "TATATTGTGT", "TT,T") + .setStudyId(studyId) + .setSampleDataKeys("GT", "AD") + .addSample("s3", "0/2", "1,1,10") + .addSample("s4", "1/1", "1,10,1") + .build(); + + + Variant normalized = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); + Variant normalized2 = new VariantNormalizer().normalize(Collections.singletonList(variant2), false).get(0); + + Variant merged = new VariantMerger().merge(normalized, normalized2); + +// System.out.println("merged = " + merged.toJson()); + + // Convert to VariantContext + List sampleNames = merged.getSampleNames(studyId); + VariantAvroToVariantContextConverter converter = new VariantAvroToVariantContextConverter(studyId, sampleNames, Collections.emptyList()); + VariantContext context = converter.convert(merged); + +// System.out.println("context = " + context); + + // Print as VCF + ByteArrayOutputStream os = new ByteArrayOutputStream(); + VariantContextWriter writer = VcfUtils.createVariantContextWriter(os, null, Options.ALLOW_MISSING_FIELDS_IN_HEADER); + writer.setHeader(new VCFHeader(Collections.emptySet(), sampleNames)); + writer.add(context); + writer.close(); + System.out.println(os); + assertArrayEquals(new String[]{"1", "1000", ".", "AGTATATTGTG", "AG,AGT", ".", ".", ".", "GT:AD", + "1/1:10,10,0", + "0/1:0,10,0", + "0/1:1,10,1", + "2/2:1,1,10"}, os.toString().trim().split("\t")); + } + @Test public void adjustedVariantStart() throws Exception { testBuildAllele("1:824337:TGC:TC,TG"); From a6e638caeb899b7e7a182539a456de4261fbeac0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 6 Nov 2024 12:00:59 +0000 Subject: [PATCH 2/4] tools: Remove duplicated and incompatible alleles writing into VCF. #TASK-4682 --- .../converters/VariantContextConverter.java | 91 ++++++++----- .../VariantAvroToVariantContextConverter.java | 123 +++++++++++++----- ...VariantProtoToVariantContextConverter.java | 9 +- .../VariantContextConverterTest.java | 103 ++++++++++++--- 4 files changed, 238 insertions(+), 88 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java index c8c28e4fa..db58fcf91 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java @@ -27,7 +27,6 @@ import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.commons.Converter; -import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter; import org.opencb.commons.datastore.core.ObjectMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +40,7 @@ * Created by jtarraga on 07/02/17. */ public abstract class VariantContextConverter implements Converter { - private final Logger logger = LoggerFactory.getLogger(VariantAvroToVariantContextConverter.class); + private final Logger logger = LoggerFactory.getLogger(VariantContextConverter.class); public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######"); public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###"); @@ -152,8 +151,7 @@ protected static MutablePair adjustedVariantStart( protected static String getReferenceBase(String chromosome, int from, int to, Map referenceAlleles) { int length = to - from; if (length < 0) { - throw new IllegalStateException( - "Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to); + return ""; } StringBuilder sb = new StringBuilder(length); for (int i = from; i < to; i++) { @@ -258,10 +256,9 @@ protected double getQuality(List> fileAttributes) { return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual)); } - protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData, Set duplicatedAlleles) { + protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData, Set discardedAlleles) { String refAllele = alleleList.get(0); Set noCallAlleles = getNoCallAlleleIdx(alleleList); - Map finalAlleleMap = getDedupAlleleMap(alleleList); List genotypes = new ArrayList<>(); if (this.sampleNames != null) { @@ -270,7 +267,7 @@ protected List getGenotypes(List alleleList, List samp sampleDataKeys = this.sampleFormats; } - for (String sampleName : this.sampleNames) { + samplesLoop: for (String sampleName : this.sampleNames) { GenotypeBuilder genotypeBuilder = new GenotypeBuilder().name(sampleName); for (String key : sampleDataKeys) { String value = getSampleData.apply(sampleName, key); @@ -284,7 +281,20 @@ protected List getGenotypes(List alleleList, List samp List alleles = new ArrayList<>(); for (int gtIdx : genotype.getAllelesIdx()) { if (gtIdx < alleleList.size() && gtIdx >= 0 && !noCallAlleles.contains(gtIdx)) { // .. AND NOT a nocall allele - alleles.add(Allele.create(alleleList.get(gtIdx), gtIdx == 0)); // allele is ref. if the alleleIndex is 0 + String alternate = alleleList.get(gtIdx); + if (discardedAlleles.contains(gtIdx)) { + // If this genotype contains a duplicated allele, and it is not the first occurrence, skip it + logger.warn("Skipping allele '" + alleleToString(alternate) + "' for sample '" + sampleName + "'"); + genotypes.add(new GenotypeBuilder().name(sampleName) + .alleles(Arrays.asList( + Allele.create(NO_CALL_ALLELE, false), + Allele.create(NO_CALL_ALLELE, false) + )).make() + ); + // skip the rest of the sample data + continue samplesLoop; + } + alleles.add(Allele.create(alternate, gtIdx == 0)); // allele is ref. if the alleleIndex is 0 } else { alleles.add(Allele.create(NO_CALL_ALLELE, false)); // genotype of a secondary alternate, or an actual missing } @@ -294,14 +304,6 @@ protected List getGenotypes(List alleleList, List samp case "AD": if (StringUtils.isNotEmpty(value) && !value.equals(".")) { int[] ad = getInts(value); - if (!duplicatedAlleles.isEmpty() && ad.length == alleleList.size()) { - int[] finalAD = new int[finalAlleleMap.size()]; - for (int i = 0; i < ad.length; i++) { - finalAD[finalAlleleMap.get(alleleList.get(i))] += ad[i]; - } - genotypeBuilder.AD(finalAD); - ad = finalAD; - } genotypeBuilder.AD(ad); } else { genotypeBuilder.noAD(); @@ -354,7 +356,9 @@ private int[] getInts(String value) { return ints; } - protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, boolean isNoVariation, Set filters, double qual, ObjectMap attributes, List genotypes, Set duplicatedAlleles) { + protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, + boolean isNoVariation, Set filters, double qual, ObjectMap attributes, + List genotypes, Set discardedAlleles) { String refAllele = alleleList.get(0); VariantContextBuilder variantContextBuilder = new VariantContextBuilder() .chr(chromosome) @@ -368,11 +372,14 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en if (isNoVariation && alleleList.get(1).isEmpty()) { variantContextBuilder.alleles(refAllele); } else { - List finalAlleles = alleleList.stream() + List finalAlleles = new ArrayList<>(alleleList); + for (Integer i : discardedAlleles) { + finalAlleles.set(i, null); + } + finalAlleles = finalAlleles.stream() + .filter(Objects::nonNull) .filter(a -> !a.equals(NO_CALL_ALLELE)) .collect(Collectors.toList()); - // Remove first occurrence of duplicated allele - duplicatedAlleles.forEach(finalAlleles::remove); variantContextBuilder.alleles(finalAlleles); } @@ -382,24 +389,35 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en variantContextBuilder.genotypes(genotypes); } - + if (isSymbolic(alleleList.get(1))) { + attributes.append(VCFConstants.END_KEY, end); + } variantContextBuilder.attributes(attributes); variantContextBuilder.id(idForVcf); - return variantContextBuilder.make(); + try { + return variantContextBuilder.make(); + } catch (RuntimeException e) { + throw new IllegalArgumentException( + "Error creating VariantContext: " + chromosome + ":" + start + "-" + end + ":" + alleleList, e); + } } - protected Map getDedupAlleleMap(List alleleList) { - Map finalAlleleIdxMap = new HashMap<>(); - for (String allele : alleleList) { - // Assign an index to each unique allele - finalAlleleIdxMap.putIfAbsent(allele, finalAlleleIdxMap.size()); - } - return finalAlleleIdxMap; + /** + * Check if the allele is a symbolic allele other than or <*> + * @param allele Allele + * @return True if the allele is a symbolic allele + */ + protected static boolean isSymbolic(String allele) { + return allele.startsWith("<") && allele.endsWith(">") && !isNonRef(allele); + } + + protected static boolean isNonRef(String allele) { + return allele.equals(Allele.NON_REF_STRING) || allele.equals(Allele.UNSPECIFIED_ALTERNATE_ALLELE_STRING); } - protected Set getDuplicatedAlleles(String chromosome, int start, List alleleList) { + protected Set getDuplicatedAlleles(List alleleList) { Set duplicatedAlleles; if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) { Set allelesSet = new HashSet<>(); @@ -410,8 +428,6 @@ protected Set getDuplicatedAlleles(String chromosome, int start, List getDuplicatedAlleles(String chromosome, int start, List getStudiesId(T variant); + + protected static String allelesToString(Collection alleles) { + return alleles.stream() + .map(VariantContextConverter::alleleToString) + .collect(Collectors.joining(",")); + } + + private static String alleleToString(String a) { + return a.length() > 10 ? (a.substring(0, 10) + "...[" + a.length() + "]") : a; + } + } diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java index 74c24b072..050781fe8 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java @@ -45,6 +45,20 @@ public VariantAvroToVariantContextConverter(String study, List sampleNam @Override public VariantContext convert(Variant variant) { + try { + return createContext(variant); + } catch (RuntimeException e) { + try { + logger.warn("Error creating VariantContext for variant " + variant); + logger.warn("JSON : " + variant.toJson()); + } catch (RuntimeException e2) { + e.addSuppressed(e2); + } + throw e; + } + } + + protected VariantContext createContext(Variant variant) { init(variant); StudyEntry studyEntry = getStudy(variant); @@ -59,11 +73,32 @@ public VariantContext convert(Variant variant) { .stream() .map(entry -> entry.getCall() == null ? null : entry.getCall().getVariantId()) .iterator()); - Pair adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles); + Pair adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, Collections.emptySet()); int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); - Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); + Set discardedAlleleIdx = getDiscardedAlleles(variant, alleleList); + if (!discardedAlleleIdx.isEmpty()) { + Set duplicatedAlleles = getDuplicatedAlleles(alleleList); + List otherDiscardedAlleles = new ArrayList<>(discardedAlleleIdx.size()); + for (Integer alleleIdx: discardedAlleleIdx) { + String allele = alleleList.get(alleleIdx); + if (!duplicatedAlleles.contains(allele)) { + otherDiscardedAlleles.add(allele); + } + } + // If there are duplicated alleles, we need to re-adjust the start/end positions and the alleles + adjustedStartEndPositions = adjustedVariantStart(variant, studyEntry, referenceAlleles, discardedAlleleIdx); + start = adjustedStartEndPositions.getLeft(); + end = adjustedStartEndPositions.getRight(); + alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); + + logger.warn("Discard alleles from variant " + chromosome + ":" + start + "-" + end + " , " + + "Alleles : " + allelesToString(alleleList) + " , " + + "discarded allele indexes : " + discardedAlleleIdx + ", " + + "duplicated alleles: " + allelesToString(duplicatedAlleles) + ", " + + "other discarded alleles: " + allelesToString(otherDiscardedAlleles)); + } boolean isNoVariation = type.equals(VariantType.NO_VARIATION); // ID @@ -103,31 +138,40 @@ public VariantContext convert(Variant variant) { } // SAMPLES - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, duplicatedAlleles); - - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, discardedAlleleIdx); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, discardedAlleleIdx); } /** * Adjust start/end if a reference base is required due to an empty allele. All variants are checked due to SecAlts. - * @param variant {@link Variant} object. - * @param study Study + * + * @param variant {@link Variant} object. + * @param study Study + * @param discardedAlleles Set of alleles that are going to be discarded * @return Pair The adjusted (or same) start/end position e.g. SV and MNV as SecAlt, INDEL, etc. */ - public static Pair adjustedVariantStart(Variant variant, StudyEntry study, Map referenceAlleles) { + public static Pair adjustedVariantStart(Variant variant, StudyEntry study, Map referenceAlleles, + Set discardedAlleles) { if (variant.getType().equals(VariantType.NO_VARIATION)) { return new ImmutablePair<>(variant.getStart(), variant.getEnd()); } MutablePair pos = adjustedVariantStart(variant.getStart(), variant.getEnd(), variant.getReference(), variant.getAlternate(), referenceAlleles, null); + int alleleIdx = 2; for (AlternateCoordinate alternateCoordinate : study.getSecondaryAlternates()) { - int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue(); - int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue(); + if (discardedAlleles.contains(alleleIdx)) { + // Do not adjust start/end based on discarded alleles + continue; + } else { + int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue(); + int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue(); - String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference(); - String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate(); + String reference = alternateCoordinate.getReference() == null ? variant.getReference() : alternateCoordinate.getReference(); + String alternate = alternateCoordinate.getAlternate() == null ? variant.getAlternate() : alternateCoordinate.getAlternate(); - adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos); + adjustedVariantStart(alternateStart, alternateEnd, reference, alternate, referenceAlleles, pos); + } + alleleIdx++; } return pos; } @@ -164,31 +208,40 @@ public List buildAlleles(Variant variant, Pair adjuste return alleles; } -/* - // this function was moved to the parent class: VariantContextConverter - public String buildAllele(String chromosome, Integer start, Integer end, String allele, Pair adjustedRange) { - if (start.equals(adjustedRange.getLeft()) && end.equals(adjustedRange.getRight())) { - return allele; // same start / end + protected Set getDiscardedAlleles(Variant variant, List alleleList) { + if (alleleList.size() <= 2) { + return Collections.emptySet(); } - if (StringUtils.startsWith(allele, "*")) { - return allele; // no need - } - return getReferenceBase(chromosome, adjustedRange.getLeft(), start) + allele - + getReferenceBase(chromosome, end, adjustedRange.getRight()); - } -*/ - - /* - // this function was moved to the parent class: VariantContextConverter - private String getReferenceBase(String chromosome, Integer from, Integer to) { - int length = to - from; - if (length < 0) { - throw new IllegalStateException( - "Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to); + + String mainAlternate = alleleList.get(1); + Set allelesSet = new HashSet<>(); + allelesSet.add(alleleList.get(0)); + allelesSet.add(mainAlternate); + + boolean symbolicMainAlt = isSymbolic(mainAlternate); + Set discardedAlleles = new HashSet<>(); + for (int i = 2; i < alleleList.size(); i++) { + String allele = alleleList.get(i); + if (!allelesSet.add(allele)) { + // Remove duplicated + discardedAlleles.add(i); + } else if (symbolicMainAlt != isSymbolic(allele) && !isNonRef(allele)) { + // If the main alternate is symbolic, all other alleles must be symbolic + // If the main alternate is not symbolic, all other alleles must not be symbolic + // Do not discard or <*> + discardedAlleles.add(i); + } else if (symbolicMainAlt && isSymbolic(allele)) { + // If the main alternate is symbolic, and the allele is symbolic, check if they have the same coordinates + AlternateCoordinate alternateCoordinate = variant.getStudies().get(0).getSecondaryAlternates().get(i - 2); + if ((alternateCoordinate.getStart() != null && !alternateCoordinate.getStart().equals(variant.getStart())) + || (alternateCoordinate.getEnd() != null && !alternateCoordinate.getEnd().equals(variant.getEnd()))) { + discardedAlleles.add(i); + } + } } - return StringUtils.repeat('N', length); // current return default base TODO load reference sequence + + return discardedAlleles; } -*/ private void addCohortStatsMultiInfoField(StudyEntry studyEntry, Map attributes) { if (studyEntry.getStats() == null || studyEntry.getStats().size() == 0) { diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java index 61d862452..6cb434ace 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java @@ -23,7 +23,8 @@ /** * Created by jtarraga on 07/02/17. */ -public class VariantProtoToVariantContextConverter extends VariantContextConverter { +@Deprecated +public abstract class VariantProtoToVariantContextConverter extends VariantContextConverter { private final Logger logger = LoggerFactory.getLogger(VariantProtoToVariantContextConverter.class); @@ -60,7 +61,7 @@ public VariantContext convert(VariantProto.Variant variant) { int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); - Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); + Set duplicatedAlleles = getDuplicatedAlleles(alleleList); boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION); // ID @@ -97,9 +98,9 @@ public VariantContext convert(VariantProto.Variant variant) { // SAMPLES BiFunction getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id); - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, duplicatedAlleles); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, null); - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, null); } public String getSampleData(VariantProto.StudyEntry studyEntry, Map formatPositions, String sampleName, String field) { diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index e54c5141c..80165c23d 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -4,6 +4,7 @@ import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; +import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.junit.Test; import org.opencb.biodata.formats.variant.vcf4.VcfUtils; @@ -46,33 +47,101 @@ public void testDuplicatedAllele() throws NonStandardCompliantSampleField { .addSample("s4", "1/1", "1,10,1") .build(); + checkVcf("1 1000 . AGTATATTGT A,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2))); + checkVcf("1 1001 . GTATATTGTG G,GT . . . GT:AD ./.:. ./.:. 0/1:1,10,1,0 2/2:1,1,10,0", merge(norm(variant2), norm(variant))); + checkVcf("1 1000 . AGTATATTGT A,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2, 1))); + } + + @Test + public void testDuplicatedAlleleSV() throws NonStandardCompliantSampleField { + Variant variant1 = Variant.newBuilder("1", 1000, 1020, "A", "") + .setStudyId("s") + .setSampleDataKeys("GT", "AD") + .addSample("s1", "1/1", "10,10") + .addSample("s2", "0/1", "0,10") + .build(); + Variant variant2 = Variant.newBuilder("1", 1002, 1022, "T", "") + .setStudyId("s") + .setSampleDataKeys("GT", "AD") + .addSample("s3", "0/1", "1,10") + .addSample("s4", "1/1", "10,1") + .build(); + Variant variantDup = Variant.newBuilder("1", 1002, 1022, "T", "") + .setStudyId("s") + .setSampleDataKeys("GT", "AD") + .addSample("s3", "0/1", "1,10") + .addSample("s4", "1/1", "10,1") + .build(); + Variant variantDupMatch = Variant.newBuilder("1", 1000, 1020, "T", "") + .setStudyId("s") + .setSampleDataKeys("GT", "AD") + .addSample("s3", "0/1", "1,10") + .addSample("s4", "1/1", "10,1") + .build(); + Variant variant3 = Variant.newBuilder("1", 1002, null, "TATATTGTGT", "TT,T") + .setStudyId("s") + .setSampleDataKeys("GT", "AD") + .addSample("s5", "0/2", "1,1,10") + .addSample("s6", "1/1", "1,10,1") + .build(); - Variant normalized = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); - Variant normalized2 = new VariantNormalizer().normalize(Collections.singletonList(variant2), false).get(0); + checkVcf("1 1000 . A . . END=1020 GT:AD 1/1:10,10 0/1:0,10", variant1); + checkVcf("1 1002 . T . . END=1022 GT:AD 0/1:1,10 1/1:10,1", variant2); + checkVcf("1 1002 . TATATTGTGT TT,T . . . GT:AD 0/2:1,1,10 1/1:1,10,1", variant3); + checkVcf("1 1002 . T . . END=1022 GT:AD 0/1:1,10 1/1:10,1", variantDup); + checkVcf("1 1000 . T . . END=1020 GT:AD 0/1:1,10 1/1:10,1", variantDupMatch); + checkVcf("variant1 + variant2", "1 1000 . A . . END=1020 GT:AD 1/1:10,10,0 0/1:0,10,0 ./.:. ./.:.", merge(norm(variant1), norm(variant2))); + checkVcf("variant2 + variant1", "1 1002 . T . . END=1022 GT:AD ./.:. ./.:. 0/1:1,10,0 1/1:10,1,0", merge(norm(variant2), norm(variant1))); + checkVcf("variant2 + variant3", "1 1002 . T . . END=1022 GT:AD 0/1:1,10,0,0 1/1:10,1,0,0 ./.:. ./.:.", merge(norm(variant2), norm(variant3))); + checkVcf("variant3 + variant2", "1 1002 . TATATTGTGT T,TT . . . GT:AD ./.:. ./.:. 0/1:1,10,1,0 2/2:1,1,10,0", merge(norm(variant3), norm(variant2))); + checkVcf("variant1 + variantDup", "1 1000 . A . . END=1020 GT:AD 1/1:10,10,0 0/1:0,10,0 ./.:. ./.:.", merge(norm(variant1), norm(variantDup))); + checkVcf("variant1 + variantDupMatch","1 1000 . A , . . END=1020 GT:AD 1/1:10,10,0 0/1:0,10,0 0/2:1,0,10 2/2:10,0,1", merge(norm(variant1), norm(variantDupMatch))); + } - Variant merged = new VariantMerger().merge(normalized, normalized2); + public void checkVcf(String expectedVcf, Variant variant) { + checkVcf(null, expectedVcf, variant); + } + public void checkVcf(String message, String expectedVcf, Variant variant) { + if (message == null) { + message = variant.toString(); + } + String vcf = toVcf(variant).replace("\n", ""); +// System.out.println(message + " = " + vcf.replace("\t", " ")); + expectedVcf = expectedVcf.replaceAll(" +", " "); + assertEquals(message, expectedVcf.replace(" ", "\t"), vcf); + } -// System.out.println("merged = " + merged.toJson()); + private static Variant merge(Variant variant, Variant variant2) { + return new VariantMerger().merge(variant, variant2); + } - // Convert to VariantContext - List sampleNames = merged.getSampleNames(studyId); + private static VariantContext toContext(Variant variant) { + String studyId = variant.getStudies().get(0).getStudyId(); + List sampleNames = variant.getSampleNames(studyId); VariantAvroToVariantContextConverter converter = new VariantAvroToVariantContextConverter(studyId, sampleNames, Collections.emptyList()); - VariantContext context = converter.convert(merged); + VariantContext context = converter.convert(variant); + return context; + } -// System.out.println("context = " + context); + private static String toVcf(Variant variant) { + return toVcf(toContext(variant)); + } - // Print as VCF + private static String toVcf(VariantContext context) { ByteArrayOutputStream os = new ByteArrayOutputStream(); VariantContextWriter writer = VcfUtils.createVariantContextWriter(os, null, Options.ALLOW_MISSING_FIELDS_IN_HEADER); - writer.setHeader(new VCFHeader(Collections.emptySet(), sampleNames)); + writer.setHeader(new VCFHeader(Collections.emptySet(), context.getSampleNamesOrderedByName())); writer.add(context); writer.close(); - System.out.println(os); - assertArrayEquals(new String[]{"1", "1000", ".", "AGTATATTGTG", "AG,AGT", ".", ".", ".", "GT:AD", - "1/1:10,10,0", - "0/1:0,10,0", - "0/1:1,10,1", - "2/2:1,1,10"}, os.toString().trim().split("\t")); + return os.toString(); + } + + private static Variant norm(Variant variant) throws NonStandardCompliantSampleField { + return norm(variant, 0); + } + + private static Variant norm(Variant variant, int idx) throws NonStandardCompliantSampleField { + return new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(idx); } @Test @@ -128,7 +197,7 @@ private void testBuildAllele(String varStr) throws NonStandardCompliantSampleFie .map(entry -> entry.getCall() == null ? null : entry.getCall().getVariantId()) .iterator()); - Pair adjustedRange = VariantAvroToVariantContextConverter.adjustedVariantStart(v, v.getStudy("S"), referenceMap); + Pair adjustedRange = VariantAvroToVariantContextConverter.adjustedVariantStart(v, v.getStudy("S"), referenceMap, Collections.emptySet()); System.out.println(""); System.out.println(varStr + " -> " + v + " ( " + normalized.stream().map(Object::toString).collect(Collectors.joining(" , ")) + " )"); System.out.println(adjustedRange); From 2187ae8f3e7005ab50499719006d4a292c4b0b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 10:52:48 +0000 Subject: [PATCH 3/4] tools: Fix NPE discarding intermediate secondary alternates. #TASK-4682 --- .../VariantAvroToVariantContextConverter.java | 1 - .../VariantContextConverterTest.java | 37 ++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java index 050781fe8..7767e051a 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java @@ -161,7 +161,6 @@ public static Pair adjustedVariantStart(Variant variant, Study for (AlternateCoordinate alternateCoordinate : study.getSecondaryAlternates()) { if (discardedAlleles.contains(alleleIdx)) { // Do not adjust start/end based on discarded alleles - continue; } else { int alternateStart = alternateCoordinate.getStart() == null ? variant.getStart() : alternateCoordinate.getStart().intValue(); int alternateEnd = alternateCoordinate.getEnd() == null ? variant.getEnd() : alternateCoordinate.getEnd().intValue(); diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index 80165c23d..4c32c8493 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -6,10 +6,12 @@ import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.codehaus.jackson.map.ObjectMapper; import org.junit.Test; import org.opencb.biodata.formats.variant.vcf4.VcfUtils; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.AlternateCoordinate; +import org.opencb.biodata.models.variant.avro.VariantAvro; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.tools.variant.VariantNormalizer; @@ -17,7 +19,9 @@ import org.opencb.biodata.tools.variant.merge.VariantMerger; import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -53,7 +57,7 @@ public void testDuplicatedAllele() throws NonStandardCompliantSampleField { } @Test - public void testDuplicatedAlleleSV() throws NonStandardCompliantSampleField { + public void testDuplicatedAlleleSV() throws NonStandardCompliantSampleField, IOException { Variant variant1 = Variant.newBuilder("1", 1000, 1020, "A", "") .setStudyId("s") .setSampleDataKeys("GT", "AD") @@ -98,6 +102,37 @@ public void testDuplicatedAlleleSV() throws NonStandardCompliantSampleField { checkVcf("variant1 + variantDupMatch","1 1000 . A , . . END=1020 GT:AD 1/1:10,10,0 0/1:0,10,0 0/2:1,0,10 2/2:10,0,1", merge(norm(variant1), norm(variantDupMatch))); } + @Test + public void testNonNullAllele() throws IOException { + + String jsonVariant = "{\"id\": \"11:555177:G:A\", \"names\": [], \"chromosome\": \"11\", \"start\": 555177, \"end\": 555177," + + " \"reference\": \"G\", \"alternate\": \"A\", \"strand\": \"+\", \"sv\": null, \"length\": 1, \"type\": \"SNV\", " + + "\"studies\": [{\"studyId\": \"test@HG38:SJD-hg38\", " + + "\"files\": [" + + "{\"fileId\": \"f1.vcf.gz\", \"call\": null, \"data\": {\"FILTER\": \"PASS\", \"QUAL\": \"30.0\"}}, " + + "{\"fileId\": \"f2.vcf.gz\", \"call\": {\"variantId\": \"11:555177:GA:-\", \"alleleIndex\": 0}, \"data\": {}}, " + +// "{\"fileId\": \"f2.vcf.gz\", \"call\": {\"variantId\": \"11:555176:GGA:G\", \"alleleIndex\": 0}, \"data\": {}}, " + + "{\"fileId\": \"f3.vcf.gz\", \"call\": {\"variantId\": \"11:554942-556408:-:\", \"alleleIndex\": 0}, \"data\": {}}]," + + " \"secondaryAlternates\": [" + + "{\"chromosome\": \"11\", \"start\": 554942, \"end\": 556408, \"reference\": \"\", \"alternate\": \"\", \"type\": \"DELETION\"}, " + + "{\"chromosome\": \"11\", \"start\": 555177, \"end\": 555178, \"reference\": \"GA\", \"alternate\": \"\", \"type\": \"INDEL\"}]," + + " \"sampleDataKeys\": [\"GT\"], " + + "\"samples\": [" + + "{\"sampleId\": null, \"fileIndex\": 0, \"data\": [\"0/1\"]}, " + + "{\"sampleId\": null, \"fileIndex\": null, \"data\": [\"0/2\"]}, " + + "{\"sampleId\": null, \"fileIndex\": null, \"data\": [\"3/3\"]}], " + + "\"issues\": [], \"stats\": [], \"scores\": []}], \"annotation\": null}"; + Variant variant4 = new Variant(new ObjectMapper().readValue(jsonVariant, VariantAvro.class)); + + HashMap samplesPosition = new HashMap<>(); + samplesPosition.put("s1", 0); + samplesPosition.put("s2", 1); + samplesPosition.put("s3", 2); + variant4.getStudies().get(0).setSamplesPosition(samplesPosition); + + checkVcf("11 555176 . NGA NAA,N 30 PASS . GT 0/1 ./. 2/2", variant4); + } + public void checkVcf(String expectedVcf, Variant variant) { checkVcf(null, expectedVcf, variant); } From 252082121d06841ff1c4865bf39a71c1220b7b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 11:07:49 +0000 Subject: [PATCH 4/4] tools: Fix VariantContextConverterTest. #TASK-4682 --- .../variant/converters/VariantContextConverter.java | 2 +- .../converters/VariantContextConverterTest.java | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java index db58fcf91..2cc2e0329 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java @@ -445,7 +445,7 @@ protected static String allelesToString(Collection alleles) { } private static String alleleToString(String a) { - return a.length() > 10 ? (a.substring(0, 10) + "...[" + a.length() + "]") : a; + return a.length() > 20 ? (a.substring(0, 10) + "...[" + a.length() + "]") : a; } } diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index 4c32c8493..79318c65b 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -37,6 +37,11 @@ public class VariantContextConverterTest { @Test public void testDuplicatedAllele() throws NonStandardCompliantSampleField { + // REF : AGTATATTGTGT AGTATATTGTG + // V1 : 1000:AGTATATTGT/A A---------GT -> A---------G -> AG + // V2 : 1002:TATATTGTGT/TT AGT--------T -> AGT-------- -> AGT + // V3 : 1002:TATATTGTGT/T AG---------T -> AG--------- -> AG + String studyId = "s"; Variant variant = Variant.newBuilder("1", 1000, null, "AGTATATTGT", "A") .setStudyId(studyId) @@ -51,9 +56,9 @@ public void testDuplicatedAllele() throws NonStandardCompliantSampleField { .addSample("s4", "1/1", "1,10,1") .build(); - checkVcf("1 1000 . AGTATATTGT A,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2))); - checkVcf("1 1001 . GTATATTGTG G,GT . . . GT:AD ./.:. ./.:. 0/1:1,10,1,0 2/2:1,1,10,0", merge(norm(variant2), norm(variant))); - checkVcf("1 1000 . AGTATATTGT A,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2, 1))); + checkVcf("1 1000 . AGTATATTGTG AG,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2))); + checkVcf("1 1001 . GTATATTGTG G,GT . . . GT:AD ./.:. ./.:. 0/1:1,10,1,0 2/2:1,1,10,0", merge(norm(variant2), norm(variant))); + checkVcf("1 1000 . AGTATATTGTG AG,AGT . . . GT:AD 1/1:10,10,0,0 0/1:0,10,0,0 ./.:. 2/2:1,0,1,10", merge(norm(variant), norm(variant2, 1))); } @Test