-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest_simple.sql
222 lines (137 loc) · 5.36 KB
/
test_simple.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
drop table t;
drop table t2;
drop table t3;
drop TABLE test_typmod;
drop table km;
DROP TABLE kmers;
DROP TABLE qkmers;
DROP INDEX idxxxmm;
drop extension if exists postdna;
CREATE EXTENSION postdna;
-- Data Types
CREATE TABLE t (id integer, dna dna, kmer kmer, qkmer qkmer);
INSERT INTO t VALUES
(0, 'CAT', 'taccc', 'CGTAN'),
(1, 'CATG', 'ttt', 'MNTA'),
(2, 'CATGG', 'cca', 'CCCGAAAA'),
(3, 'TACAGATA', 'TTAcca', 'RYBNCCCGGT'),
(5, 'TACAGATAA', 'ccaGG', 'VTA'),
(6, 'CAAAATAAGCGAAAT', 'CCcca', 'CG'),
(7, 'CATATATATAGGGGGAAAATTTTTTT', 'CCAcca', 'TTTGCD');
SELECT * from t;
INSERT INTO t VALUES
(8, 'CATATATATAGGGGGAAAATTTTTTT', 'CCAccATATATATAGGGGGAAAATTTTTTATATATATAGGGGGAAAATTTTTTa', 'TTTGCD');
INSERT INTO t VALUES
(8, 'CATATATATAGGGGGAAAATTTTTTT', 'CCAccATATATATA', 'TTTGGGGGGAAAATTTTTTATATATATAGGGGGAAAATTTTTTaGGGGGACD');
---------------------------------------------------------------------------------------------------------------------------
-- Type Modifier
CREATE TABLE test_typmod (dna dna(10), kmer kmer(5));
INSERT INTO test_typmod (dna, kmer) VALUES ('ACGT', 'CGCCC');
SELECT * FROM test_typmod;
INSERT INTO test_typmod (dna) VALUES ('ACGTACGTACGTCC');
INSERT INTO test_typmod (kmer) VALUES ('ACGTACGTACGTCC');
CREATE TABLE test_typmod2 (kmer kmer(34));
---------------------------------------------------------------------------------------------------------------------------
--Length
SELECT kmer, length(kmer) AS kmer_length ,
dna, length(dna) AS dna_length,
qkmer, length(qkmer) AS qkmer_length
FROM t;
---------------------------------------------------------------------------------------------------------------------------
-- generate kmers
SELECT * FROM generate_kmers('CTGAACCCGGATT', 8);
SELECT * FROM generate_kmers('CTGAAATCGCTTAT', 20);
SELECT * FROM generate_kmers('CTGAAATCGCTTATCTGAAATCGCTTATCTGAAATCGCTTATCTGAAATCGCTTATCTGAAATCGCTTAT', 33);
---------------------------------------------------------------------------------------------------------------------------
-- Contains
INSERT INTO t VALUES
(8, 'CATATATTT', 'TGCGCC', 'HGNNCC'),
(9, 'CATATTCGT', 'TGCTTCGTC', 'WSMBDCGNN'),
(10, 'CATATTCGT', 'CGCGCGCTT', 'NNNNNNNNN')
(11, 'CATATTCGT', 'CGCGCGCTT', 'GCTGG');
select qkmer, kmer from t where qkmer @> kmer;
select qkmer, kmer from t where qkmer_contains(qkmer , kmer);
select qkmer_contains('NNGCC' , 'CCTTC');
---------------------------------------------------------------------------------------------------------------------------
-- starts-with test
CREATE TABLE kmers (kmer kmer);
INSERT INTO kmers VALUES ('TACGAAACT'), ('ATACGATCCT'), ('TACGACTA'), ('TACGCTC');
SELECT * FROM kmers WHERE kmer_starts_with(kmer, 'TACGA');
SELECT * FROM kmers WHERE kmer ^@ 'TACG';
CREATE TABLE qkmers (qkmer qkmer);
INSERT INTO qkmers VALUES ('TAYCSGAAACT'), ('ATACGATCCT'), ('TAVCGACTA'), ('TAYCGCTC'), ('TAYCSGCTC');
SELECT * FROM qkmers WHERE qkmer_starts_with(qkmer, 'TAYCS'::qkmer);
SELECT * FROM qkmers WHERE qkmer ^@ 'TAV';
---------------------------------------------------------------------------------------------------------------------------
-- Counting
SELECT k.kmer, count(k.kmer) FROM generate_kmers(
'ACTCGATGCAGTCAGTCGATGCATCATCAT'
, 3) AS k(kmer) GROUP BY k.kmer;
SELECT k.kmer, count(k.kmer) FROM generate_kmers(
'ACTACTACT'
, 3) AS k(kmer) GROUP BY k.kmer;
WITH kmers AS (
SELECT k.kmer, count(*)
FROM generate_kmers('CGTCGAAAAA', 4) AS k(kmer) GROUP BY k.kmer
)
SELECT sum(count) AS total_count,
count(*) AS distinct_count,
count(*) FILTER (WHERE count = 1) AS unique_count FROM kmers;
---------------------------------------------------------------------------------------------------------------------------
-- Canonicalize
CREATE TABLE t2 (kmer kmer);
INSERT INTO t2 (kmer)
SELECT 'ACTGCC'
FROM generate_series(1, 5000);
INSERT INTO t2 (kmer)
SELECT 'CTACTA'
FROM generate_series(1, 5000);
INSERT INTO t2 (kmer)
SELECT 'TAGTAG'
FROM generate_series(1, 5000);
INSERT INTO t2 (kmer)
SELECT 'TTTT'
FROM generate_series(1, 5000);
select kmer,kmer_canonicalize(kmer), count(*)
from t2
group by kmer;
with kmers as (select distinct kmer from t2)
select distinct kmer_canonicalize(kmer), count(*)
from t2
where kmer_canonicalize(kmer) in (select kmer from kmers)
group by kmer_canonicalize(kmer)
UNION
select distinct kmer, count(*)
from t2
where kmer_canonicalize(kmer) not in (select kmer from kmers)
group by kmer
;
---------------------------------------------------------------------------------------------------------------------------
--index
CREATE TABLE t3 (kmer kmer);
select pg_relation_size('t3');
-- insert > 20 Milion values
INSERT INTO t3 (kmer)
SELECT 'ACTGCC'
FROM generate_series(1, 5000);
INSERT INTO t3 (kmer)
SELECT 'ACGGCT'
FROM generate_series(1, 5000000);
INSERT INTO t3 (kmer)
SELECT 'ACTCGT'
FROM generate_series(1, 4500000);
INSERT INTO t3 (kmer)
SELECT 'GGGGGG'
FROM generate_series(1, 10000000);
INSERT INTO t3 (kmer)
SELECT 'CTACTA'
FROM generate_series(1, 5000);
Explain analyze select * from t3 where kmer='CTACTA';
Explain analyze select * from t3 where kmer ^@ 'CTAC';
Explain analyze select * from t3 where 'NNNNNN' @> kmer;
CREATE INDEX idxxxmm ON t3 USING SPGIST(kmer);
SET enable_seqscan TO off;
Explain analyze select * from t3 where kmer='CTACTA';
Explain analyze select * from t3 where kmer ^@ 'CTAC';
Explain analyze select * from t3 where 'NNNNNN' @> kmer;
SET enable_seqscan TO on;