Skip to content

Commit

Permalink
tools: Fix "Duplicate allele" while exporting to VCF. #TASK-4682
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Nov 4, 2024
1 parent a561fac commit a42ad75
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.tools.commons.Converter;
import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter;
import org.opencb.commons.datastore.core.ObjectMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.DecimalFormat;
import java.util.*;
Expand All @@ -38,6 +41,7 @@
* Created by jtarraga on 07/02/17.
*/
public abstract class VariantContextConverter<T> implements Converter<T, VariantContext> {
private final Logger logger = LoggerFactory.getLogger(VariantAvroToVariantContextConverter.class);

public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######");
public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###");
Expand Down Expand Up @@ -246,17 +250,18 @@ protected double getQuality(List<Map<String, String>> fileAttributes) {
}
} catch (NumberFormatException e) {
// Nothing to do
e.getMessage();
logger.warn("Invalid QUAL value found: " + attrs.get(StudyEntry.QUAL));
}
}
}
}
return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual));
}

protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData) {
protected List<Genotype> getGenotypes(List<String> alleleList, List<String> sampleDataKeys, BiFunction<String, String, String> getSampleData, Set<String> duplicatedAlleles) {
String refAllele = alleleList.get(0);
Set<Integer> noCallAlleles = getNoCallAlleleIdx(alleleList);
Map<String, Integer> finalAlleleMap = getDedupAlleleMap(alleleList);

List<Genotype> genotypes = new ArrayList<>();
if (this.sampleNames != null) {
Expand Down Expand Up @@ -289,6 +294,14 @@ protected List<Genotype> getGenotypes(List<String> alleleList, List<String> samp
case "AD":
if (StringUtils.isNotEmpty(value) && !value.equals(".")) {
int[] ad = getInts(value);
if (!duplicatedAlleles.isEmpty() && ad.length == alleleList.size()) {
int[] finalAD = new int[finalAlleleMap.size()];
for (int i = 0; i < ad.length; i++) {
finalAD[finalAlleleMap.get(alleleList.get(i))] += ad[i];
}
genotypeBuilder.AD(finalAD);
ad = finalAD;
}
genotypeBuilder.AD(ad);
} else {
genotypeBuilder.noAD();
Expand Down Expand Up @@ -341,7 +354,7 @@ private int[] getInts(String value) {
return ints;
}

protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList, boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes, List<Genotype> genotypes) {
protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List<String> alleleList, boolean isNoVariation, Set<String> filters, double qual, ObjectMap attributes, List<Genotype> genotypes, Set<String> duplicatedAlleles) {
String refAllele = alleleList.get(0);
VariantContextBuilder variantContextBuilder = new VariantContextBuilder()
.chr(chromosome)
Expand All @@ -355,7 +368,12 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
if (isNoVariation && alleleList.get(1).isEmpty()) {
variantContextBuilder.alleles(refAllele);
} else {
variantContextBuilder.alleles(alleleList.stream().filter(a -> !a.equals(NO_CALL_ALLELE)).collect(Collectors.toList()));
List<String> finalAlleles = alleleList.stream()
.filter(a -> !a.equals(NO_CALL_ALLELE))
.collect(Collectors.toList());
// Remove first occurrence of duplicated allele
duplicatedAlleles.forEach(finalAlleles::remove);
variantContextBuilder.alleles(finalAlleles);
}

if (genotypes.isEmpty()) {
Expand All @@ -372,6 +390,34 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en
return variantContextBuilder.make();
}

protected Map<String, Integer> getDedupAlleleMap(List<String> alleleList) {
Map<String, Integer> finalAlleleIdxMap = new HashMap<>();
for (String allele : alleleList) {
// Assign an index to each unique allele
finalAlleleIdxMap.putIfAbsent(allele, finalAlleleIdxMap.size());
}
return finalAlleleIdxMap;
}

protected Set<String> getDuplicatedAlleles(String chromosome, int start, List<String> alleleList) {
Set<String> duplicatedAlleles;
if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) {
Set<String> allelesSet = new HashSet<>();

duplicatedAlleles = new HashSet<>();
for (String allele : alleleList) {
if (!allelesSet.add(allele)) {
duplicatedAlleles.add(allele);
}
}
logger.warn("Duplicated alleles found in variant " + chromosome + ":" + start + " : Denormalized alleles" + alleleList
+ " , duplicated alleles: " + duplicatedAlleles);
} else {
duplicatedAlleles = Collections.emptySet();
}
return duplicatedAlleles;
}

protected abstract Object getStudy(T variant);

protected abstract Iterator<String> getStudiesId(T variant);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public VariantContext convert(Variant variant) {
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<String> duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList);
boolean isNoVariation = type.equals(VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -102,9 +103,9 @@ public VariantContext convert(Variant variant) {
}

// SAMPLES
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, duplicatedAlleles);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public VariantContext convert(VariantProto.Variant variant) {
int start = adjustedStartEndPositions.getLeft();
int end = adjustedStartEndPositions.getRight();
List<String> alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles);
Set<String> duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList);
boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION);

// ID
Expand Down Expand Up @@ -96,9 +97,9 @@ public VariantContext convert(VariantProto.Variant variant) {
// SAMPLES
BiFunction<String, String, String> getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id);

List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData);
List<Genotype> genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, duplicatedAlleles);

return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes);
return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles);
}

public String getSampleData(VariantProto.StudyEntry studyEntry, Map<String, Integer> formatPositions, String sampleName, String field) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
package org.opencb.biodata.tools.variant.converters;

import org.apache.commons.lang3.StringUtils;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang3.tuple.Pair;
import org.junit.Test;
import org.opencb.biodata.formats.variant.vcf4.VcfUtils;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField;
import org.opencb.biodata.tools.variant.VariantNormalizer;
import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter;
import org.opencb.biodata.tools.variant.merge.VariantMerger;

import java.io.ByteArrayOutputStream;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.*;

/**
* Created on 29/11/17.
Expand All @@ -26,6 +30,51 @@
*/
public class VariantContextConverterTest {

@Test
public void testDuplicatedAllele() throws NonStandardCompliantSampleField {
String studyId = "s";
Variant variant = Variant.newBuilder("1", 1000, null, "AGTATATTGT", "A")
.setStudyId(studyId)
.setSampleDataKeys("GT", "AD")
.addSample("s1", "1/1", "10,10")
.addSample("s2", "0/1", "0,10")
.build();
Variant variant2 = Variant.newBuilder("1", 1002, null, "TATATTGTGT", "TT,T")
.setStudyId(studyId)
.setSampleDataKeys("GT", "AD")
.addSample("s3", "0/2", "1,1,10")
.addSample("s4", "1/1", "1,10,1")
.build();


Variant normalized = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0);
Variant normalized2 = new VariantNormalizer().normalize(Collections.singletonList(variant2), false).get(0);

Variant merged = new VariantMerger().merge(normalized, normalized2);

// System.out.println("merged = " + merged.toJson());

// Convert to VariantContext
List<String> sampleNames = merged.getSampleNames(studyId);
VariantAvroToVariantContextConverter converter = new VariantAvroToVariantContextConverter(studyId, sampleNames, Collections.emptyList());
VariantContext context = converter.convert(merged);

// System.out.println("context = " + context);

// Print as VCF
ByteArrayOutputStream os = new ByteArrayOutputStream();
VariantContextWriter writer = VcfUtils.createVariantContextWriter(os, null, Options.ALLOW_MISSING_FIELDS_IN_HEADER);
writer.setHeader(new VCFHeader(Collections.emptySet(), sampleNames));
writer.add(context);
writer.close();
System.out.println(os);
assertArrayEquals(new String[]{"1", "1000", ".", "AGTATATTGTG", "AG,AGT", ".", ".", ".", "GT:AD",
"1/1:10,10,0",
"0/1:0,10,0",
"0/1:1,10,1",
"2/2:1,1,10"}, os.toString().trim().split("\t"));
}

@Test
public void adjustedVariantStart() throws Exception {
testBuildAllele("1:824337:TGC:TC,TG");
Expand Down

0 comments on commit a42ad75

Please sign in to comment.