Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to skip the mate cigar in EstimateRnaSeqInsertSize #871

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,8 @@ import com.fulcrumgenomics.util.GeneAnnotations._
import com.fulcrumgenomics.util._
import htsjdk.samtools.SamPairUtil.PairOrientation
import htsjdk.samtools._
import htsjdk.samtools.filter._
import htsjdk.samtools.util.{CoordMath, Interval, OverlapDetector}

import scala.jdk.CollectionConverters._

@clp(description =
"""Computes the insert size for RNA-Seq experiments.
|
Expand All @@ -54,6 +51,7 @@ import scala.jdk.CollectionConverters._
|chromosomes. Finally, skips transcripts where too few mapped read bases overlap exonic sequence.
|
|This tool requires each mapped pair to have the mate cigar (`MC`) tag. Use `SetMateInformation` to add the mate cigar.
|Use the `--skip-missing-mate-cigar` to skip reads missing the mate cigar tag.
|
|The output metric file will have the extension `.rna_seq_insert_size.txt` and the output histogram file will have
|the extension `.rna_seq_insert_size_histogram.txt`. The histogram file gives for each orientation (`FR`, `RF`, `tandem`),
Expand All @@ -74,7 +72,8 @@ class EstimateRnaSeqInsertSize
""""
) val deviations: Double = 10.0,
@arg(flag='q', doc="Ignore reads with mapping quality less than this value.") val minimumMappingQuality: Int = 30,
@arg(flag='m', doc="The minimum fraction of read bases that must overlap exonic sequence in a transcript") val minimumOverlap: Double = 0.95
@arg(flag='m', doc="The minimum fraction of read bases that must overlap exonic sequence in a transcript") val minimumOverlap: Double = 0.95,
@arg(doc="Skip reads who are missing their mate cigar") val skipMissingMateCigar: Boolean = false
) extends FgBioTool with LazyLogging {
import EstimateRnaSeqInsertSize._

Expand All @@ -90,12 +89,20 @@ class EstimateRnaSeqInsertSize
val in = SamSource(input)
val refFlatSource = RefFlatSource(refFlat, Some(in.dict))
val counters = pairOrientations.map { pairOrientation => (pairOrientation, new NumericCounter[Long]()) }.toMap
val filter = new AggregateFilter(EstimateRnaSeqInsertSize.filters(minimumMappingQuality=minimumMappingQuality, includeDuplicates=includeDuplicates).asJava)
var numReadPairs = 0L
val recordIterator = in.iterator.filter { rec =>
progress.record(rec)
if (rec.paired && rec.firstOfPair) numReadPairs += 1
!filter.filterOut(rec.asSam)

rec.firstOfPair && // so templates are considered only once
rec.pf && // don't want low quality reads
rec.paired && // only want paired reads
!rec.secondary && !rec.supplementary && // no secondary or supplementary
rec.mapq >= minimumMappingQuality && // decent mapping quality
(includeDuplicates || !rec.duplicate) && // no duplicates, unless specified
rec.mapped && rec.mateMapped && // both ends of a pair are mapped
rec.refIndex == rec.mateRefIndex && // both ends of a pair map to the same contig
(!skipMissingMateCigar || rec.mateCigar.isDefined) // skip records with missing mate cigars if specified
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved filters out of the old "htsjdk" filter style, so this could use a double-check

}

for (gene <- refFlatSource; locus <- gene.loci) {
Expand Down Expand Up @@ -181,30 +188,6 @@ object EstimateRnaSeqInsertSize {
val RnaSeqInsertSizeMetricExtension: String = ".rnaseq_insert_size.txt"
val RnaSeqInsertSizeMetricHistogramExtension: String = ".rnaseq_insert_size_histogram.txt"

private trait SingleEndSamRecordFilter extends SamRecordFilter {
override def filterOut(r1: SAMRecord, r2: SAMRecord): Boolean = filterOut(r1) || filterOut(r2)
}

def filter(f: SamRecord => Boolean): SamRecordFilter = new SingleEndSamRecordFilter {
override def filterOut(r: SAMRecord): Boolean = f(r.asInstanceOf[SamRecord])
}

private def filters(minimumMappingQuality: Int, includeDuplicates: Boolean) = List(
MateMappedFilter, // also filters out unpaired reads
new FailsVendorReadQualityFilter,
new AlignedFilter(true),
new SecondaryOrSupplementaryFilter,
new MappingQualityFilter(minimumMappingQuality),
DuplicatesFilter(includeDuplicates=includeDuplicates),
FirstOfPairOnlyFilter,
DifferentReferenceIndexFilter
)

private def MateMappedFilter = filter(r => !r.paired || r.mateUnmapped)
private def DuplicatesFilter(includeDuplicates: Boolean) = filter(r => !includeDuplicates && r.duplicate)
private def FirstOfPairOnlyFilter = filter(_.firstOfPair)
private def DifferentReferenceIndexFilter = filter(r => r.refIndex != r.mateRefIndex)

private[rnaseq] def getAndRequireMateCigar(rec: SamRecord): Cigar = {
rec.mateCigar.getOrElse {
throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,4 +347,49 @@ class EstimateRnaSeqInsertSizeTest extends UnitSpec with OptionValues {
}
}
}

it should "run end-to-end and ignore reads missing a mate cigar" in {
val builder = new SamBuilder()
// FR = (133804075 + 100) - 133801671 = 2504
builder.addPair(contig=2, start1=133801671, start2=133804075, strand1=Plus, strand2=Minus) // overlaps ACKR4 by 100%
builder.addPair(contig=2, start1=133801671, start2=133804075, strand1=Plus, strand2=Minus).foreach { rec =>
rec.remove("MC") // remove the mate cigar
}

// fails, since the default is to require the mate cigar
assertThrows[Exception] {
val bam = builder.toTempFile()
new EstimateRnaSeqInsertSize(input=bam, refFlat=RefFlatFile).execute()
}

val bam = builder.toTempFile()
val out = PathUtil.pathTo(PathUtil.removeExtension(bam).toString + EstimateRnaSeqInsertSize.RnaSeqInsertSizeMetricExtension)
new EstimateRnaSeqInsertSize(input=bam, refFlat=RefFlatFile, skipMissingMateCigar=true).execute()
val metrics = Metric.read[InsertSizeMetric](path=out)
metrics.length shouldBe PairOrientation.values().length

val expectedMetrics = Seq(
InsertSizeMetric(
pair_orientation = PairOrientation.FR,
read_pairs = 1,
standard_deviation = 0,
mean = 2504,
min = 1,
max = 1,
median = 2504,
median_absolute_deviation = 0
),
)

metrics.zip(expectedMetrics).foreach { case (actual, expected) =>
actual shouldBe expected
}

val histogramPath = PathUtil.pathTo(PathUtil.removeExtension(bam).toString + EstimateRnaSeqInsertSize.RnaSeqInsertSizeMetricHistogramExtension)
Io.readLines(path=histogramPath).mkString("\n") shouldBe
"""
|insert_size fr rf tandem
|2504 1 0 0
""".stripMargin.trim
}
}