-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdense_zones.R
181 lines (154 loc) · 7.54 KB
/
dense_zones.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
##Finds transcription factor dense DNA zones for different accumulation
##threshold values. The function works only if the "accumulation" in input is
##calculated for a single chromosome.
##input:
## accumulation: a list of four elements, as the output of the accumulation
## function.
## threshold_step: an integer, the step used to calculate the threshold
## values.
## chr: optional argument, a string with a cromosome name. It is needed to
## apply the function only to a single cromosome present in the accumulation
## input found with chr = "all". If chr = "all" (default value) the function
## operates for all the chromosome present in the input.
## writeBed: When set to TRUE, for each threshold value (and for each
## chromosome) a ".bed" file with the chromosome and genomic coordinates of
## the dense zones found is created.
##output: a list of eight elements:
## zones: a list with "GRanges" objects with the dense zones found for each
## chromosome and threshold value considered.
## zones_count: a list with a data frame, for each chromosome considered,
## containing the considered threshold values and the number of dense zones
## obtained with each of the threshold values.
## bases_count: a list with a data frame, for each chromosome considered,
## containing the considered threshold values and the total number of bases
## belonging to the dense zones obtained with each of the threshold values.
## lengths: a list with a data frame, for each chromosome considered,
## containing the considered threshold values and min, max, mean, median and
## standard deviation of the dense zone lengths obtained with each of the
## considered threshold values.
## distances: a list with a data frame, for each chromosome considered,
## containing the considered threshold values and min, max, mean, median and
## standard deviation of the distances between adjacent dense zones obtained
## with each of the threshold values.
## acctype: a string with the accumulation type used.
## chr: a string with the chromosome name associated with the accumulation
## vector used.
## w: an integer with half-width of the window used to calculate the
## accumulation vector.
## When writeBed is set to TRUE, for each threshold value (and for each
## chromosome) a ".bed" file with the chromosome and genomic coordinates of
## the dense zones found is created.
dense_zones <- function(accumulation, threshold_step, chr = NULL, writeBed =
FALSE) {
if (!is.list(accumulation))
stop("'accumulation' must be an object of type 'list'.")
if (class(accumulation$accvector) != "Rle" & class(accumulation$accvector)
!= "SimpleRleList")
stop("'accvector' element of the list 'accumulation' must be of class
'Rle' or 'SimpleRleList'.")
if (!is.numeric(threshold_step))
stop("'threshold_step' must be an object of type 'numeric'.")
if (threshold_step < 1)
stop("'threshold_step' must be > 0.")
## initializing vectors
d_zones <- list()
zones_count <- list()
bases_count <- list()
lengths <- list()
distances <- list()
acc_tf <- accumulation$accvector
if(accumulation$chr != "all") {
acc_regions <- GRanges(seqnames = rep(accumulation$chr,
length(ranges(acc_tf))), ranges = ranges(acc_tf),
score = runValue(acc_tf))
c <- accumulation$chr
} else {
acc_regions <- as(acc_tf,"GRanges")
if(!is.null(chr)) {
c <- chr
acc_regions <- acc_regions[seqnames(acc_regions) == chr]
} else {
c <- "all"
}
}
acc_regions <- acc_regions[score(acc_regions) > 0]
u=unique(seqnames(acc_regions))
for (k in seq_along(u)) {
acc_regions_chr <- acc_regions[seqnames(acc_regions) == u[k]]
threshold <- seq(1, max(score(acc_regions_chr)), threshold_step)
n_zones <- vector()
n_bases <- vector()
length_zone_min <- vector()
length_zone_max <- vector()
length_zone_mean <- vector()
length_zone_median <- vector()
length_zone_sd <- vector()
dist_zone_min <- vector()
dist_zone_max <- vector()
dist_zone_mean <- vector()
dist_zone_median <- vector()
dist_zone_sd <- vector()
z=list()
for (i in seq_along(threshold)) {
zones <- reduce(acc_regions_chr[score(acc_regions_chr) >=
threshold[i]])
z[[i]]=zones
## counting the number of dense zones that are formed by contiguos bases
## given a threshold, and the number of bases belonging to the zones.
n_bases[i] <- sum(width(zones))
n_zones[i] <- length(zones)
if (writeBed == TRUE) {
## writing on files chromosome and positions of starting and ending
## points
write.table(data.frame(rep(u[k], length(zones)),
start(zones) - 1, end(zones)),
file = paste(accumulation$acctype, "_acc_w_",
accumulation$w, "_", u[k], "_dense_zones_th_",
threshold[i], ".bed", sep = ""), row.names = FALSE,
col.names = FALSE, quote = FALSE, sep = "\t")
## finding elements of length dataframe
}
length_zone_min[i] <- min(width(zones))
length_zone_max[i] <- max(width(zones))
length_zone_mean[i] <- mean(width(zones))
length_zone_median[i] <- median(width(zones))
length_zone_sd[i] <- sd(width(zones))
## calculating distances of dense zones
if (length(zones) > 1) {
dist_zone <- start(zones)[-1] - end(zones[-length(end(zones))])
dist_zone_min[i] <- min(dist_zone)
dist_zone_max[i] <- max(dist_zone)
dist_zone_mean[i] <- mean(dist_zone)
dist_zone_median[i] <- median(dist_zone)
dist_zone_sd[i] <- sd(dist_zone)
} else {
dist_zone <- NA
dist_zone_min[i] <- NA
dist_zone_max[i] <- NA
dist_zone_mean[i] <- NA
dist_zone_median[i] <- NA
dist_zone_sd[i] <- NA
}
}
zones_count[[k]] <- data.frame(threshold, n_zones)
bases_count[[k]] <- data.frame(threshold, n_bases)
lengths[[k]] <- data.frame(threshold, n_zones, length_zone_min,
length_zone_max, length_zone_mean,
length_zone_median, length_zone_sd)
distances[[k]] <- data.frame(threshold, n_zones, dist_zone_min,
dist_zone_max, dist_zone_mean,
dist_zone_median, dist_zone_sd)
names(z) <- paste("th_", threshold, sep="")
d_zones[[k]] <- z
}
## result dataframes creation
names(zones_count) <- u
names(bases_count) <- u
names(lengths) <- u
names(distances) <- u
names(d_zones) <- u
return(list(zones = d_zones, zones_count = zones_count, bases_count =
bases_count, lengths = lengths, distances = distances,
chr = c, w = accumulation$w, acctype =
accumulation$acctype))
}