-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTestingQueries.SQL
182 lines (145 loc) · 3.49 KB
/
TestingQueries.SQL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
-- Enable the postdna extension
drop extension if exists postdna cascade;
CREATE EXTENSION IF NOT EXISTS postdna;
-- Drop tables if they exist
DROP TABLE IF EXISTS kmers;
DROP TABLE IF EXISTS dna_sequences;
-- Create dna_sequences table
CREATE TABLE dna_sequences (
seq_id TEXT PRIMARY KEY,
seq DNA NOT NULL
);
-- Verify the table creation
SELECT * FROM dna_sequences;
-- Create kmers table
CREATE TABLE kmers (
id SERIAL PRIMARY KEY,
kmer kmer NOT NULL,
seq_id TEXT REFERENCES dna_sequences(seq_id)
);
-- Generate and insert k-mers
INSERT INTO kmers (kmer, seq_id)
SELECT k.kmer, s.seq_id
FROM dna_sequences s,
generate_kmers(s.seq, 5) AS k(kmer);
--Verify the table creation
SELECT * FROM kmers;
-- Test length functions
SELECT seq_id, length(seq) AS dna_length
FROM dna_sequences;
--
SELECT DISTINCT kmer, length(kmer) AS kmer_length
FROM kmers
LIMIT 5;
-- Test k-mer counting
SELECT kmer, COUNT(*) AS frequency
FROM kmers
GROUP BY kmer
ORDER BY frequency DESC
LIMIT 10;
-- Most frequent k-mers
SELECT kmer, COUNT(*) AS frequency
FROM kmers
GROUP BY kmer
ORDER BY frequency DESC
LIMIT 10;
-- K-mer frequency per sequence
SELECT s.seq_id, k.kmer, COUNT(*) AS frequency
FROM kmers k
JOIN dna_sequences s ON k.seq_id = s.seq_id
GROUP BY s.seq_id, k.kmer
ORDER BY s.seq_id, frequency DESC
LIMIT 10;
-- Count all 5-mers in a DNA sequence
SELECT k.kmer, count(*)
FROM kmers k GROUP BY k.kmer
ORDER BY count(*) DESC;
-- Return the total, distinct and unique count of 5-mers in a DNA sequence
WITH kmers AS (
SELECT k.kmer, count(*)
FROM kmers k GROUP BY k.kmer
)
SELECT sum(count) AS total_count,
count(*) AS distinct_count,
count(*) FILTER (WHERE count = 1) AS unique_count FROM kmers;
CREATE INDEX kmer_spgist_idx ON kmers USING spgist (kmer);
set enable_seqscan = off;
-- Canonicalize
SELECT kmer, count(*)
FROM kmers
WHERE kmer = 'CTACT'
group by kmer;
SELECT kmer
FROM kmers
WHERE kmer ='TAGTA';
select kmer, count(*)
from kmers
WHERE kmer ='AAAAA' or kmer ='TTTTT'
group by kmer;
EXPLAIN ANALYZE
select kmer_canonicalize(kmer), count(*)
from kmers
group by kmer_canonicalize(kmer);
-- Create SP-GiST index
-- CREATE INDEX kushari ON kmers USING SPGIST(kmer);
-- select * from kmers where kmer='ACTCA';
-------------------------------
-- No index, hash and spgist comparison
-- Test index performance
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE kmer = 'ACGTA'::kmer;
-- Test starts_with with kmer
--EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE kmer ^@ 'ACGT';
-- Test contains
--EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE 'ACG'::qkmer @> kmer;
-- Test contains with qkmer
EXPLAIN ANALYZE
SELECT DISTINCT kmer
FROM kmers
WHERE 'ANGTA'::qkmer @> kmer;
-- hash index
CREATE INDEX kmer_hash_idx ON kmers USING hash (kmer);
set enable_seqscan = off;
-- Test index performance
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE kmer = 'ACGTA'::kmer;
drop index kmer_hash_idx;
-- spgist index
CREATE INDEX kmer_spgist_idx ON kmers USING spgist (kmer);
-- Test index performance
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE kmer = 'ACGTA'::kmer;
-- Test starts_with with kmer
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE kmer ^@ 'ACGT';
-- Test contains
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE 'ACG'::qkmer @> kmer;
-- Test contains with qkmer
EXPLAIN ANALYZE
SELECT kmer
FROM kmers
WHERE 'ANGTA'::qkmer @> kmer;
set enable_seqscan = on;
drop index kmer_spgist_idx;
-------------
-- -- Clean up
-- DROP TABLE IF EXISTS kmers;
-- DROP TABLE IF EXISTS dna_sequences;
-- DROP INDEX IF EXISTS kmer_spgist_idx;