forked from saralpulit/UveitisJIA_MHC-fineMapping
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_qc.sh
126 lines (105 loc) · 4.44 KB
/
run_qc.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/bin/sh
#$ -cwd
#$ -pe threaded 1
#$ -S /bin/bash
## Set Plink (https://www.cog-genomics.org/plink2) Stable Beta 4.2 (3 November 2017)
plink2=/hpc/local/software/plink-1.9/plink
#########################
##### SAMPLE QC
##########################
## Here's the prefix for the data
data=jia.uviitis.hg19.merge
## Calculate missingness across the samples; drop samples with missingness > 5%
$plink2 --bfile $data \
--missing \
--out $data
## Prepare a set of high-quality SNPs for the rest of sample QC
## Identify SNPs in regions that can be highly stratified by population
awk ' $1==2 && $4 > 129883530 && $4 < 140283530 { print $2 } ' $data.bim > $data.pca.exclude.snps ## SNPs in the lactase locus
awk ' $1==6 && $4 > 24092021 && $4 < 38892022 { print $2 } ' $data.bim >> $data.pca.exclude.snps ## SNPs in the MHC
awk ' $1==8 && $4 > 6612592 && $4 < 13455629 { print $2 } ' $data.bim >> $data.pca.exclude.snps ## Inversion on chr8
awk ' $1==17 && $4 > 40546474 && $4 < 44644684 { print $2 } ' $data.bim >> $data.pca.exclude.snps ## Inversion on chr17
## Identify non-autosomal SNPs
awk ' $1 > 22 { print $2 } ' $data.bim >> $data.pca.exclude.snps ## Non-autosomal SNPs
## Identify A/T and C/G SNPs (if frequency is near 50%, strandedness can be tricky here)
cat $data.bim | awk ' ($5=="A" && $6=="T") || ($5=="T" && $6=="A") || ($5=="C" && $6=="G") || ($5=="G" && $6=="C") { print $2 } ' >> $data.pca.exclude.snps
## Drop the SNPs we just identified, and keep the remaining data for sample QC
## Also keep MAF > 5% and missingness < 0.1% SNPs, and LD prune (r2 = 0.2)
$plink2 --bfile $data \
--maf 0.05 \
--geno 0.001 \
--exclude ./$data.pca.exclude.snps \
--indep-pairwise 50 5 0.2 \
--out $data.pca
## PREPARATION FOR PCA
## Extract overlap with HapMap3 and merge everything together
## hapmap.jia.overlap.snps contains a list of SNPs that overlap between the JIA data and HapMap 3
$plink2 --bfile $data \
--extract ./hapmap.jia.overlap.snps \
--make-bed \
--out $data.pca
## Extract the same set of SNPs from the HapMap 3 data (lifted over to hg19)
$plink2 --bfile /hpc/cog_gonl/sara/hapmap3/hg19/hapmap3_r2.hg19.dbsnp138.fwd.founders \
--extract hapmap.jia.overlap.snps \
--make-bed \
--out hapmap3.pca
## Merge together the study data and the HapMap 3 data
$plink2 --bfile hapmap3.pca \
--bmerge $data.pca.bed $data.pca.bim $data.pca.fam \
--make-bed \
--out jia.hapmap3.merge.pca
## Merging may throw an error at SNPs that may need their strand flipped
## Note: if strand flipping doesn't resolve the error, just drop the SNPs with --exclude
$plink2 --bfile hapmap3.pca \
--flip jia.hapmap3.merge.pca-merge.missnp \
--make-bed \
--out hapmap3.pca
## Recode the data to ped/map file format
$plink2 --bfile jia.hapmap3.merge.pca \
--recode \
--out jia.hapmap3.merge.pca
## Run principal component analysis
## See run_smartPCA.sh in the Afib-Stroke-Overlap repository
## Drop PCA outliers
./run_smartPCA.sh jia.hapmap3.merge.pca
## Check for sex mismatches between phenotype and genotype data; drop any mismatches
$plink2 --bfile $data \
--check-sex \
--out $data
## Calculate heterozygozity/inbreeding of samples
## Drop all samples > 3sd from the distribution
$plink2 --bfile $data \
--extract $data.pca.prune.in \
--het \
--out $data
## Calculate relatedness; drop samples with pi-hat > 0.125 (cousin relationship or higher)
$plink2 --bfile $data \
--extract $data.pca.prune.in \
--genome \
--out $data
## Assemble all sample QC failures in the following file: jia.uviitis.hg19.merge.sample.failures (format: FID IID)
#################
##### SNP QC
#################
## Calculate missingness across SNPs; drop SNPs with missingness > 5%
$plink2 --bfile $data \
--remove ./jia.uviitis.hg19.merge.sample.failures \
--missing \
--out $data
## Calculate Hardy-Weinberg; drop SNPs with HWE p < 1e-6 in controls only
$plink2 --bfile $data \
--remove ./jia.uviitis.hg19.merge.sample.failures \
--hardy \
--pheno ./jia.uviitis.hg19.merge.pheno.jia_vs_healthy.txt \
--out $data
## Differential missingness between cases and controls; drop SNPs with p < 1e-6
$plink2 --bfile $data \
--remove ./jia.uviitis.hg19.merge.sample.failures \
--test-missing \
--pheno ./jia.uviitis.hg19.merge.pheno.jia_vs_healthy.txt \
--out $data
## Check frequency distribution
$plink2 --bfile $data \
--remove ./jia.uviitis.hg19.merge.sample.failures \
--freq \
--out $data