-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspace.py
100 lines (82 loc) · 3.83 KB
/
space.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import spacepile
print(spacepile)
import numpy as np
import pysam
# make matricies of at most this width. usually this will be ~ 10% longer than read-length
max_width = 9
# NOTE, these would likely be drawn from a bam file that is sorted by fragment (an option in samtools sort) or grouped by UMI
# these are available as aln.cigartuples in pysam.
# here we show 3 reads for an example.
cigs = [
[(pysam.CHARD_CLIP, 100), (pysam.CMATCH, 4)], # read 1. 4M ACTG
[(pysam.CMATCH, 2), (pysam.CDEL, 2), (pysam.CMATCH, 1)], # read 2. 2M2D1M CTC
[(pysam.CMATCH, 2), (pysam.CINS, 3), (pysam.CMATCH, 2)], # read 2. 2M2D1M ACGGGTG
]
posns = [0, 1, 0] # the 2nd read starts 1 base after the other reads.
# idxs will be filled by `space` call.
idxs = np.zeros((len(cigs), max_width), dtype=np.uint32)
spacepile.space(idxs, cigs, posns)
# now we have the indexes into the original sequence or base-qualities or IPDs. etc.
END = np.iinfo(np.uint32).max
SPACE = END - 1
assert np.array_equal(idxs,
[[ 0, 1,SPACE,SPACE,SPACE, 2, 3,END,END],
[END, 0,SPACE,SPACE,SPACE, 1,SPACE,SPACE, 2],
[ 0, 1, 2, 3, 4, 5, 6,END,END]]), idxs
# sequences likely retrieved from pysam's aln.query_sequence or aln.query_alingment_sequence
raw_sequences = ["ACTG", "CTC", "ACGGGTG"]
base_qs = [[ 40, 50, 60, 70], [60, 60, 60], [45, 45, 45, 55, 55, 55, 60]]
sequences = np.zeros((len(cigs), max_width), dtype=np.int16)
bqs = np.zeros_like(sequences)
for i, s in enumerate(raw_sequences):
sequences[i, :len(s)] = np.array(list(s), dtype='U1').view(np.int32) # Q: is there a better way to do this in numpy?
bqs[i, :len(s)] = np.array(base_qs[i], dtype=np.int32)
# NOTE we will put sequences and bqs in the same output matrix.
mat = np.zeros((2 * len(cigs), max_width), dtype=np.int16)
spacepile.translate(idxs, sequences, mat[:len(cigs)])
# NOTE how the columns generally match
assert np.array_equal(mat[:len(cigs)],
[[65,67,-2,-2,-2,84,71,-1,-1],
[-1,67,-2,-2,-2,84,-2,-2,67],
[65,67,71,71,71,84,71,-1,-1]])
spacepile.translate(idxs, bqs, mat[len(cigs):])
assert np.array_equal(mat[len(cigs):],
[[40,50,-2,-2,-2,60,70,-1,-1],
[-1,60,-2,-2,-2,60,-2,-2,60],
[45,45,45,55,55,55,60,-1,-1]])
#print(mat)
# now repeat for many sequences and send many mats to a learner.
# MIDNSHP=XB
cigs = [
#[(0, 172-126), (4, 930)], # 1040S 172M 930S
#[(0, 136-126), (2, 2), (0, 34), (5, 1767)],
[(4, 1040), (0, 172), (4, 930)], # 1040S 172M 930S
[(5, 205), (0, 136), (2, 2), (0, 34), (5, 1767)],
[(5, 628), (0, 39), (2, 1), (0, 38), (1, 1), (0, 82), (5, 1354)],
[(5, 1899), (0, 36), (2, 1), (0, 72), (1, 2), (0, 27), (5, 106)],
[(5, 1487), (0, 40), (1, 1), (0, 99), (5, 515)]
]
posns = [x + 1040 for x in [0, 0, 8, 21, 33]][:len(cigs)]
print(posns)
max_width = 177
idxs = np.zeros((len(cigs), max_width), dtype=np.uint32)
spacepile.space(idxs, cigs, posns)
print(idxs)
cigs = [
[(4, 799), (0, 16), (1, 2), (0, 23), (2, 1), (0, 90), (4, 12)], # 799S 16M 2I 23M 1D 90M 12 S
[(5, 384), (0, 57), (2, 2), (0, 56), (5, 445)], # 384H 57M 2D 56M 445H
]
posns = [21271006, 21271021]
max_width = 177
idxs = np.zeros((len(cigs), max_width), dtype=np.uint32)
spacepile.space(idxs, cigs, posns)
print(idxs)
cigs = [
[(5, 239), (0, 20), (2, 2), (0, 35), (5, 648)],
[(5, 672), (0, 54), (2, 1), (0, 15), (1, 1), (0, 12), (5, 188)],
[(5, 298), (0, 23), (1, 1), (0, 3), (2, 2), (0, 58), (1, 1), (0, 5), (1, 1), (0, 29), (5, 523)]]
posns = [ 0, 12, 27]
for max_width in range(10, 100, 10):
idxs = np.zeros((len(cigs), max_width), dtype=np.uint32)
spacepile.space(idxs, cigs, posns)
print(idxs)