-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathINTERACTION_SBC.R
203 lines (163 loc) · 6.32 KB
/
INTERACTION_SBC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#load libraries
library(tidyverse)
library(data.table)
library(pheatmap)
#get user line arguments; change this
#args = commandArgs(trailingOnly = TRUE)
#cat('1:', args[1], '\n')
#set up input arguments after connecting with everyone else on the team to final.
#i.e. GWAS input file, also another argument for potentially focusing on or skipping the analysis stratified by annotation types.
#set up the plink files/dataset prefix. Should have .bed, .bim, amd .fam files
plinkFilePrefix='GWAS_input'
#read variants of interest
user_variants <- read.table("plinkFilePrefix", sep = "\t", header = F)
#reformat user variants of interest to avinput format (using BIM file)
colnames(user_variants) <- c("chr", "ID", "cM", "position", "A1", "A2")
avinput <- user_variants %>%
select(hg38_chr = chr,
chr_start = position,
chr_end = position,
A1,
A2)
write.table(avinput, "user_variants.avinput", sep = "\t", quote = F, row.names=F, col.names=F)
#annotate using ANNOVAR
input_file <- "user_variants.avinput"
annovar_folder <- "/Users/annovar/"
build <- "-buildver hg38"
output_file <- "-out user_variants.annoavinput"
arguments <- "-dot2underline -remove -protocol refGene,knownGene,ensGene,gnomad211_exome,clinvar_20160302,dbnsfp30a -arg '-splicing_threshold=6','-splicing_threshold=6','-splicing_threshold=6',,, -operation g,g,g,f,f,f -otherinfo"
cmd <- paste("perl", paste0(annovar_folder,"table_annovar.pl"), input_file, paste0(annovar_folder,"humandb"), build, output_file, arguments)
system(cmd)
#OPTIONAL: subset by class of variants and MAF, these can be input later if you just want to use them as snpExtractFile options
intergenic_SNPs <- subset(output_file, Func_refGene == "intergenic" & AF_nfe >= 0.005 & AF_nfe <= 0.995)
intergenic_SNPs$chr_pos <- paste(intergenic_SNPs$Chr, ":", intergenic_SNPs$Start, sep="")
write.table(intergenic_SNPs, "intergenic_SNPs.csv", quote = F, sep = ",", row.names = F)
intronic_SNPs <- subset(output_file, Func_refGene == "intronic" & AF_nfe >= 0.005 & AF_nfe <= 0.995)
intronic_SNPs$chr_pos <- paste(intronic_SNPs$Chr, ":", intronic_SNPs$Start, sep="")
write.table(intronic_SNPs, "intronic_SNPs.csv", quote = F, sep = ",", row.names = F)
exonic_SNPs <- subset(output_file, Func_refGene == "exonic" & ExonicFunc_refGene = 'nonsynonymous SNV' & AF_nfe >= 0.005 & AF_nfe <= 0.995)
exonic_SNPs$chr_pos <- paste(exonic_SNPs$Chr, ":", exonic_SNPs$Start, sep="")
write.table(exonic_nonsynonymous_SNPs, "exonic_nonsynonymous_SNPs.csv", quote = F, sep = ",", row.names = F)
#name of file storing the SNPs to be extracted
#for loop here that runs through different classes of variants.
class_in =c("intergenic_SNPs","intronic_SNPs","exonic_SNPs")
#start loop.
for(i in class_in){
snpExtractFile=class_in
if(file.exists(snpExtractFile)==F)
{
cat('snpExtractFile file not found\n')
q()
}
#prefix of output file
outFilePrefix='Test2'
#path to plink cmd
plinkCmd='/Users/plink'
if(file.exists(plinkCmd)==F)
{
cat('plink cmd not found\n')
q()
}
#read the covariate file
covariateFile='covariates.txt'
covar=fread(covariateFile)
head(covar)
#set up the command
cmd=str_glue('{plinkCmd} --bfile {plinkFilePrefix} --extract {snpExtractFile} --recode A --out {outFilePrefix}')
cat('cmd:', cmd, '\n')
system(cmd)
#read the raw file
genotypeRawFile=str_glue('{outFilePrefix}.raw')
geno=fread(genotypeRawFile)
head(geno)
ncols=ncol(geno)
#number of varaints
numVarsInput=ncols-6
numVarsInput
varNameVec=colnames(geno)[7:ncol(geno)]
varNameVec
#merge the geno and covariate file
final_covar=merge(covar, geno, by.x = "FID", by.y = "FID")
head(final_covar)
#scale to deal with zero values
final_covar[,20:(19+numVarsInput)] = final_covar[,20:(19+numVarsInput)] +1
colnames(final_covar)
#get the 2 pair combinations
varPairList=combn(colnames(final_covar)[20:(20+numVarsInput-1)], 2, simplify = F)
varPairList
#function to call regression
callRegress = function(inVarList,covarDf, inPhenoColumn)
{
#inVarList='4_17968811_A 4_77110365_G'
#covarDf=final_covar
cat('\nvars are:', inVarList, '\n')
s=unlist(strsplit(inVarList, ' '))
firstVar=s[1]
scndVar=s[2]
cat('firstVar:', firstVar, ' scndVar:', scndVar, '\n')
#cat('nrow df:', nrow(covarDf), '\n')
#change column names
colVec=c()
for(colName in colnames(covarDf))
{
if(colName==firstVar)
{
colName='firstVar'
}else if(colName==scndVar)
{
colName='scndVar'
}
colVec=c(colVec, colName)
}
colnames(covarDf)=colVec
#create inetarction term for regression model
covarDf$interactionTerm = covarDf$firstVar*covarDf$scndVar
#assign the index of the two varaints for re assigning the actual names
firstVarIndex=9
scndVarIndex=10
#call the model
fmla=str_glue('{inPhenoColumn} ~ AGE + SEX + PC1 + PC2 + PC3 + PC4 + PC5 + firstVar + scndVar + interactionTerm')
fmla
fmla=as.formula(fmla)
interactModel = glm(fmla, data = covarDf, family="binomial")
summaryStats = summary(interactModel)
statDf=data.frame('Variable'=row.names(coef(summaryStats)), coef(summaryStats))
statDf
colnames(statDf)=c('Variable', 'Estimate', 'StdError', 'Zvalue', 'Pvalue')
#assign back the variant names
statDf$Variable[firstVarIndex]=firstVar
statDf$Variable[scndVarIndex]=scndVar
return(statDf)
}
#call the regression model per each two variants
resList=lapply(varPairList, callRegress, covarDf=final_covar, inPhenoColumn='PHENO')
length(resList)
resList
#loop thru the list and write the data frames ; one file per df
#name each output with the two varaints names
#loop thru the list and write the data frames ; one file per df
#name each output with the two varaints names
matDf = data.frame(matrix(0,ncol=numVarsInput,nrow=numVarsInput))
matDf
colnames(matDf)=varNameVec
rownames(matDf)=varNameVec
matDf
for (num in 1:length(resList))
{
output_tab <- resList[[num]]
outTabName <- as.character(paste("chr",sub(":","_",output_tab[9,1]),"_chr",sub(":","_",output_tab[10,1]),".tab", sep=""))
write.table(output_tab, file=outTabName, sep='\t', row.names = F, quote = F)
#fill the df
matDf[output_tab[9,1],output_tab[10,1]]=output_tab[11,5]
matDf[output_tab[10,1], output_tab[9,1]]=output_tab[11,5]
}
#matDf
#plot
matInput=as.matrix(matDf)
fileName=str_glue('{outFilePrefix}_heatmap.png')
fileName
png(fileName)
corrplot(matInput, is.corr = FALSE, type="upper")
dev.off()
# remember to flag outputs with suffix from class_in list.
}