Skip to content

Commit

Permalink
Update EstimatePoolingFractions to be able to use the per-sample/geno…
Browse files Browse the repository at this point in the history
…type AF instead of 0/0.5/1 when available.
  • Loading branch information
tfenne committed Dec 12, 2020
1 parent 4ca76a8 commit 467dc5d
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,9 @@ import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util.{Io, Metric, Sequences}
import com.fulcrumgenomics.vcf.ByIntervalListVariantContextIterator
import com.fulcrumgenomics.vcf.api.{Variant, VcfSource}
import htsjdk.samtools.util.SamLocusIterator.LocusInfo
import htsjdk.samtools.util._
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression

@clp(group=ClpGroups.SamOrBam, description=
Expand All @@ -49,6 +46,11 @@ import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression
|for the alternative allele fractions at each SNP locus, using as inputs the individual sample's genotypes.
|Only SNPs that are bi-allelic within the pooled samples are used.
|
|Each sample's contribution of REF vs. ALT alleles at each site is derived in one of two ways. If
|the sample's genotype in the VCF has the `AF` attribute then the value from that field will be used. If the
|genotype has no AF attribute then the contribution is estimated based on the genotype (e.g. 0/0 will be 100%
|ref, 0/1 will be 50% ref and 50% alt, etc.).
|
|Various filtering parameters can be used to control which loci are used:
|
|- _--intervals_ will restrict analysis to variants within the described intervals
Expand Down Expand Up @@ -90,7 +92,10 @@ class EstimatePoolingFractions
alt = v.alleles.alts.head.value.charAt(0),
expectedSampleFractions = sampleNames.map { s =>
val gt = v.gt(s)
if (gt.isHomRef) 0 else if (gt.isHet) 0.5 else 1.0
gt.get[IndexedSeq[Float]]("AF") match {
case None => if (gt.isHomRef) 0 else if (gt.isHet) 0.5 else 1.0
case Some(afs) => afs(0)
}
}
)}.toArray

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
package com.fulcrumgenomics.bam

import java.nio.file.Paths

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.{SamRecord, SamSource, SamWriter}
import com.fulcrumgenomics.testing.UnitSpec
import com.fulcrumgenomics.util.Metric
import com.fulcrumgenomics.vcf.api
import com.fulcrumgenomics.vcf.api.{Genotype, VcfCount, VcfFieldType, VcfFormatHeader, VcfSource, VcfWriter}
import htsjdk.samtools.SAMFileHeader.SortOrder
import htsjdk.samtools.{MergingSamRecordIterator, SamFileHeaderMerger}
import org.scalatest.ParallelTestExecution
Expand Down Expand Up @@ -106,4 +107,63 @@ class EstimatePoolingFractionsTest extends UnitSpec with ParallelTestExecution {
expected should (be >= m.ci99_low and be <= m.ci99_high)
}
}

it should "accurately estimate a three sample mixture using the AF genotype field" in {
val samples = Samples.take(3)
val Seq(s1, s2, s3) = samples
val bams = Bams.take(3)
val bam = merge(bams)

val vcf = {
val vcf = makeTempFile("mixture.", ".vcf.gz")
val in = api.VcfSource(Vcf)
val hd = in.header.copy(
samples = IndexedSeq(s1, "two_sample_mixture"),
formats = VcfFormatHeader("AF", VcfCount.OnePerAltAllele, kind=VcfFieldType.Float, description="Allele Frequency") +: in.header.formats
)
val out = VcfWriter(vcf, hd)

in.filter(_.alleles.size == 2).foreach { v =>
val gts = samples.map(v.gt)

// Only bother with sites where all samples have called genotypes and there is variation
if (gts.forall((_.isFullyCalled)) && gts.flatMap(_.calls).toSet.size > 1) {
// Make a mixture of the 2nd and 3rd samples
val (mixCalls, mixAf) = {
val input = gts.drop(1)
if (input.forall(_.isHomRef)) (IndexedSeq(v.alleles.ref, v.alleles.ref), 0.0)
else if (input.forall(_.isHomVar)) (IndexedSeq(v.alleles.alts.head, v.alleles.alts.head), 1.0)
else {
val calls = input.flatMap(_.calls)
(IndexedSeq(v.alleles.ref, v.alleles.alts.head), calls.count(_ != v.alleles.ref) / calls.size.toDouble)
}
}

val mixtureGt = Genotype(
alleles = v.alleles,
sample = "two_sample_mixture",
calls = mixCalls,
attrs = Map("AF" -> IndexedSeq[Float](mixAf.toFloat))
)

out += v.copy(genotypes=Map(s1 -> gts.head, mixtureGt.sample -> mixtureGt))
}
}

in.safelyClose()
out.close()
vcf
}

// Run the estimator and test the outputs
val out = makeTempFile("pooling_metrics.", ".txt")
new EstimatePoolingFractions(vcf=vcf, bam=bam, output=out, minGenotypeQuality = -1).execute()
val metrics = Metric.read[PoolingFractionMetric](out)

metrics should have size 2
metrics.foreach {m =>
val expected = if (m.sample == samples.head) 1/3.0 else 2/3.0
expected should (be >= m.ci99_low and be <= m.ci99_high)
}
}
}

0 comments on commit 467dc5d

Please sign in to comment.