-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patherrortests.c
112 lines (106 loc) · 3.45 KB
/
errortests.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "errortests.h"
#include "fastq.h"
#include "histogram.h"
#include "histogramtests.h"
#include "bloomfilter.h"
#include "minsketch.h"
#include "tests.h"
#include "hash.h"
#include "error.h"
// Global parameters for all error tests
#define FASTQ_FILE "test.fastq"
#define KMER_SIZE 10
#define MINSKETCH_WIDTH 50
#define MINSKETCH_HEIGHT 3
// Macros that wrap common histogram uses for tests
#define USING(type, counter, expressions) histogram *h = histogram_new(type, counter); ASSERT(h != NULL); expressions; histogram_free(h);
#define READ(...) histogram_read(h, FASTQ_FILE, KMER_SIZE)
#define COUNT(str) histogram_count(h, str)
#define FOR_EACH_KMER(kmer, expressions) fastq_for_each_kmer(FASTQ_FILE, KMER_SIZE, kmer, expressions);
#define LOAD_FACTOR(...) histogram_load_factor(h)
#define ERROR_CORRECT(sequence, cutoff) error_correct(h, sequence, KMER_SIZE, cutoff)
// Reads in a file comprising reads of all As with one errant T.
// Calculates the mean, variance, standard deviation, and outliers
// These are flags for the first test.
// Setting these to true/false will result in more output for human verification
#define PRINT_STATS false
#define PRINT_OUTLIERS false
TEST(error_test1(), {
USING(MINSKETCH, minsketch_new(MINSKETCH_WIDTH, MINSKETCH_HEIGHT), {
// Populate the histogram
READ();
// Calculate the mean
int n = 0;
float mean = 0.0;
FOR_EACH_KMER(kmer, {
n++;
mean += COUNT(kmer);
});
mean /= (float)n;
// Calculate the variance and standard deviation
float variance = 0.0f;
FOR_EACH_KMER(kmer, {
variance += powf(COUNT(kmer) - mean, 2);
});
variance /= n - 1;
float stddev = sqrtf(variance);
// Print out the outliers
int outliers = 0;
FOR_EACH_KMER(kmer, {
int count = COUNT(kmer);
if (count <= ceil(mean - 2 * stddev)) {
#if PRINT_OUTLIERS
PRINT("%s: %u", kmer, count);
#endif
outliers++;
}
});
// Print stats about #kmers, mean, standard deviation, outliers, and load factor
#if PRINT_STATS
PRINT("len: %d", n);
PRINT("mean: %f", mean);
PRINT("stddev: %f", stddev);
PRINT("outliers: %d (%f%%)", outliers, 100*outliers/(float)(n));
PRINT("load factor: %f%%", LOAD_FACTOR());
#endif
// Do other fake sequences appear frequently?
// Ideally, these would all have 0 frequency hits.
// Failing here means someone either changed the hash functions or drew unlucky seeds
char temp[KMER_SIZE * 2 + 1];
memset(temp, 0, KMER_SIZE * 2 + 1);
for (int i = 0; i < KMER_SIZE * 2; i++) {
temp[i] = 'A' + i;
ASSERT(COUNT(temp) == 0);
}
});
})
// Reads in a file comprising reads of all As with one errant T.
// The test will try to correct the sequence.
// These are flags for the second test.
// Setting these to true/false will result in more output for human verification
#define CUTOFF 1
#define CORRECT_SEQUENCE "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
TEST(error_test2(), {
USING(MINSKETCH, minsketch_new(MINSKETCH_WIDTH, MINSKETCH_HEIGHT), {
// Populate the histogram
READ();
// Correct the *reads* of the file
fastq *f = fastq_new(FASTQ_FILE);
ASSERT(f != NULL);
while(fastq_read(f)) {
if (ERROR_CORRECT(f->sequence, CUTOFF)) {
ASSERT(strcmp(f->sequence, CORRECT_SEQUENCE) == 0);
}
}
fastq_free(f);
});
});
// Driver for error tests
TEST(error_test(), {
ASSERT(error_test1() == true);
ASSERT(error_test2() == true);
})