diff --git a/docs/_static/images/func+export_coverage.svg b/docs/_static/images/func+export_coverage.svg new file mode 100644 index 000000000..c8a0f9e0c --- /dev/null +++ b/docs/_static/images/func+export_coverage.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9ef42c69435a42c8f70ffbef8acab75f67ee830037a3fdee19a9fa397351ec +size 5492 diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs index ba4a7e9b5..358549cd2 100644 --- a/snapatac2-core/src/export.rs +++ b/snapatac2-core/src/export.rs @@ -294,12 +294,12 @@ where .map(|(k, v)| GenomicRange::new(k, 0, *v)) .collect(); let mut counter = SparseBinnedCoverage::new(&genome, bin_size); - let mut total_count = 0.0; + let mut total_reads = 0.0; fragments.for_each(|frag| { let not_in_blacklist = blacklist_regions.map_or(true, |bl| !bl.is_overlapped(&frag)); if not_in_blacklist { if ignore_for_norm.map_or(true, |x| !x.contains(frag.chrom())) { - total_count += 1.0; + total_reads += 1.0; } counter.insert(&frag, 1.0); } @@ -307,9 +307,9 @@ where let norm_factor = match normalization { None => 1.0, - Some(Normalization::RPKM) => total_count * bin_size as f32 / 1e9, - Some(Normalization::CPM) => total_count / 1e6, - Some(Normalization::BPM) => todo!(), + Some(Normalization::RPKM) => total_reads * bin_size as f32 / 1e9, + Some(Normalization::CPM) => total_reads / 1e6, + Some(Normalization::BPM) => counter.get_coverage().values().sum::() / 1e6, Some(Normalization::RPGC) => todo!(), }; @@ -480,6 +480,82 @@ fn create_bigwig_from_bedgraph>( mod tests { use super::*; + #[test] + fn test_bedgraph1() { + let fragments: Vec = vec![ + Fragment::new("chr1", 0, 10), + Fragment::new("chr1", 3, 13), + Fragment::new("chr1", 5, 41), + Fragment::new("chr1", 8, 18), + Fragment::new("chr1", 15, 25), + Fragment::new("chr1", 22, 24), + Fragment::new("chr1", 23, 33), + Fragment::new("chr1", 29, 40), + ]; + let genome: ChromSizes = [("chr1", 50)].into_iter().collect(); + + let output: Vec<_> = create_bedgraph_from_fragments( + fragments.clone().into_iter(), + &genome, + 3, + None, + None, + None, + None, + ).into_iter().map(|x| x.value).collect(); + let expected = vec![1.0, 3.0, 4.0, 3.0, 2.0, 4.0, 3.0, 2.0]; + assert_eq!(output, expected); + + let output: Vec<_> = create_bedgraph_from_fragments( + fragments.clone().into_iter(), + &genome, + 5, + None, + None, + None, + None, + ).into_iter().map(|x| x.value).collect(); + let expected = vec![2.0, 4.0, 3.0, 4.0, 3.0, 2.0, 1.0]; + assert_eq!(output, expected); + } + + #[test] + fn test_bedgraph2() { + let reader = crate::utils::open_file_for_read("test/fragments.tsv.gz"); + let mut reader = bed_utils::bed::io::Reader::new(reader, None); + let fragments: Vec = reader.records().map(|x| x.unwrap()).collect(); + + let reader = crate::utils::open_file_for_read("test/coverage.bdg.gz"); + let mut reader = bed_utils::bed::io::Reader::new(reader, None); + let expected: Vec> = reader.records().map(|x| x.unwrap()).collect(); + + let output = create_bedgraph_from_fragments( + fragments.clone().into_iter(), + &[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(), + 1, + None, + None, + None, + None, + ); + assert_eq!(output, expected); + + let output = create_bedgraph_from_fragments( + fragments.into_iter(), + &[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(), + 1, + None, + None, + Some(Normalization::BPM), + None, + ); + let scale_factor: f32 = expected.iter().map(|x| x.value * x.len() as f32).sum::() / 1e6; + assert_eq!(output, expected.into_iter().map(|mut x| { + x.value = x.value / scale_factor; + x + }).collect::>()); + } + #[test] fn test_smoothing() { let input = vec![ diff --git a/snapatac2-core/src/preprocessing/qc.rs b/snapatac2-core/src/preprocessing/qc.rs index 7e2d0f035..afc0a64a9 100644 --- a/snapatac2-core/src/preprocessing/qc.rs +++ b/snapatac2-core/src/preprocessing/qc.rs @@ -9,7 +9,7 @@ pub type CellBarcode = String; /// Fragments from single-cell ATAC-seq experiment. Each fragment is represented /// by a genomic coordinate, cell barcode and a integer value. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct Fragment { pub chrom: String, pub start: u64, @@ -20,6 +20,17 @@ pub struct Fragment { } impl Fragment { + pub fn new(chrom: impl Into, start: u64, end: u64) -> Self { + Self { + chrom: chrom.into(), + start, + end, + barcode: None, + count: 1, + strand: None, + } + } + pub fn to_insertions(&self) -> SmallVec<[GenomicRange; 2]> { match self.strand { None => smallvec![ diff --git a/snapatac2-python/python/snapatac2/export/__init__.py b/snapatac2-python/python/snapatac2/export/__init__.py index fea3328a7..aeaac1c0c 100644 --- a/snapatac2-python/python/snapatac2/export/__init__.py +++ b/snapatac2-python/python/snapatac2/export/__init__.py @@ -83,7 +83,7 @@ def export_coverage( selections: list[str] | None = None, bin_size: int = 10, blacklist: Path | None = None, - normalization: Literal["RPKM", "CPM"] | None = "RPKM", + normalization: Literal["RPKM", "CPM", "BPM"] | None = "RPKM", ignore_for_norm: list[str] | None = None, min_frag_length: int | None = None, max_frag_length: int | None = 2000, @@ -108,6 +108,9 @@ def export_coverage( normalizes by the total number of reads and the length of the region. The normalization can be disabled by setting `normalization=None`. + .. image:: /_static/images/func+export_coverage.svg + :align: center + Parameters ---------- adata @@ -123,7 +126,10 @@ def export_coverage( blacklist A BED file containing the blacklisted regions. normalization - Normalization method. If `None`, no normalization is performed. + Normalization method. If `None`, no normalization is performed. Options: + - RPKM (per bin) = #reads per bin / (#mapped_reads (in millions) * bin length (kb)). + - CPM (per bin) = #reads per bin / #mapped_reads (in millions). + - BPM (per bin) = #reads per bin / sum of all reads per bin (in millions). ignore_for_norm A list of chromosomes to ignore for normalization. min_frag_length