-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfilter_matrix.R
73 lines (60 loc) · 3.04 KB
/
filter_matrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Usage:
# Rscript filter_matrix.R --input_matrix=MGH-1904_100000.matrix --bins=MGH-1904_100000_abs.bed --blacklist=blacklist.bed --output_matrix=MGH-1904_100000.blacklist.matrix
library(readr)
library(optparse)
suppressPackageStartupMessages(library(GenomicRanges))
parser <- OptionParser()
parser <- add_option(parser, c("-i", "--input_matrix"), type="character",
help="Input matrix file in TSV sparse triplet format (i j count)")
parser <- add_option(parser, c("-o", "--output_matrix"), type="character",
help="Output matrix file in TSV sparse triplet format (i j count)")
parser <- add_option(parser, c("-b", "--bins"), type="character",
help="Bins (chr start end)")
parser <- add_option(parser, c("--blacklist"), type="character",
default=NULL, help="Regions to filter out")
args <- parse_args(parser, args = c(
"--input_matrix=MGH-1904_100000.matrix",
"--output_matrix=MGH-1904_100000.blacklist.matrix",
"--bins=MGH-1904_100000_abs.bed",
"--blacklist=blacklist.bed"))
args <- parse_args(parser)
args
# Read bins
message ("Reading bins: ", args$bins)
bins <- read_tsv(args$bins,
col_names = c("chrom", "start", "end", "id"),
col_types = cols_only(chrom = col_character(),
start = col_integer(),
end = col_integer()))
bin_gr <- GRanges(seqnames=bins$chrom, IRanges(start=bins$start, end=bins$end))
# Read matrix
message ("Reading input matrix: ", args$input_matrix)
tab <- read_tsv(args$input_matrix,
col_names = c("i", "j", "count"),
col_types = cols_only(i = col_integer(),
j = col_integer(),
count = col_double()))
message (nrow(bins), " non-zero cells")
# Read blacklist
if (!is.null(args$blacklist)) {
message ("Reading blacklist: ", args$blacklist)
blacklist <- read_tsv(args$blacklist,
col_names = c("chrom", "start", "end"),
col_types = cols_only(chrom = col_character(),
start = col_integer(),
end = col_integer()))
blacklist_gr <- GRanges(seqnames=blacklist$chrom, IRanges(start=blacklist$start+1, end=blacklist$end))
bin_drop_idx <- which(countOverlaps(bin_gr, blacklist_gr) > 0)
message ("Excluding ", length(bin_drop_idx), " bins out of ", nrow(bins), " because of overlap with the blacklist")
}
# Drop matrix cells
message ("Dropping cells")
drop_idx <- tab$i %in% bin_drop_idx | tab$j %in% bin_drop_idx
tab <- tab[!drop_idx,]
# Sort matrix
message ("Sorting by i,j")
o <- order(tab$i, tab$j)
tab <- tab[o,]
# Save matrix in sparse triplet format
message ("Writing output matrix: ", args$output_matrix)
write.table(tab, file=args$output_matrix, sep="\t", quote=FALSE, row.names = FALSE, col.names = FALSE)