-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvariant_interaction.R
167 lines (138 loc) · 4.62 KB
/
variant_interaction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#load libraries
suppressMessages( library(tidyverse) )
suppressMessages( library(data.table) )
suppressMessages( library(corrplot) )
cat('\nFinsihed loading libraries\n')
#call script from command line as: Rscript variant_interaction.R [plinkFilePrefix] [snpFile] [outFilePrefix] [covarFile]
#ex:Rscript variant_interaction.R GWAS_input chr1_4SNPs.txt covariates.txt Test3
args = commandArgs(trailingOnly = T)
cat('\nSetting varaibles\n')
#set up the plink files/dataset prefix. Should have .bed, .bim, amd .fam files
plinkFilePrefix=args[1]
#name of file storing the SNPs to be extracted
#snpExtractFile='chr1_2SNPs.txt'
snpExtractFile=args[2]
if(file.exists(snpExtractFile)==F)
{
cat('snpExtractFile file not found\n')
q()
}
#get the covariate file name
covariateFile=args[3]
#prefix of output file
outFilePrefix=args[4]
cat('\nUser arguments are:\n')
cat('plink file prefix:', plinkFilePrefix, '\n')
cat('snpExtract file:', snpExtractFile, '\n')
cat('covariate file:', covariateFile, '\n')
cat('output file prefix:', outFilePrefix, '\n')
#########################
########Methods##########
#function to call regression
callRegress = function(inVarList,covarDf, inPhenoColumn)
{
#inVarList='4_17968811_A 4_77110365_G'
#covarDf=final_covar
cat('\nvars are:', inVarList, '\n')
s=unlist(strsplit(inVarList, ' '))
firstVar=s[1]
scndVar=s[2]
cat('firstVar:', firstVar, ' scndVar:', scndVar, '\n')
#cat('nrow df:', nrow(covarDf), '\n')
#change column names
colVec=c()
for(colName in colnames(covarDf))
{
if(colName==firstVar)
{
colName='firstVar'
}else if(colName==scndVar)
{
colName='scndVar'
}
colVec=c(colVec, colName)
}
colnames(covarDf)=colVec
#create inetarction term for regression model
covarDf$interactionTerm = covarDf$firstVar*covarDf$scndVar
#assign the index of the two varaints for re assigning the actual names
firstVarIndex=9
scndVarIndex=10
#call the model
fmla=str_glue('{inPhenoColumn} ~ AGE + SEX + PC1 + PC2 + PC3 + PC4 + PC5 + firstVar + scndVar + interactionTerm')
fmla
fmla=as.formula(fmla)
interactModel = glm(fmla, data = covarDf, family="binomial")
summaryStats = summary(interactModel)
statDf=data.frame('Variable'=row.names(coef(summaryStats)), coef(summaryStats))
statDf
colnames(statDf)=c('Variable', 'Estimate', 'StdError', 'Zvalue', 'Pvalue')
#assign back the variant names
statDf$Variable[firstVarIndex]=firstVar
statDf$Variable[scndVarIndex]=scndVar
return(statDf)
}
########End of Methods##########
#########################
#read the covar file
covar=fread(covariateFile)
head(covar)
#path to plink cmd
plinkCmd='/Users/rami/Documents/tools/plink/plink_mac_20210416/plink'
if(file.exists(plinkCmd)==F)
{
cat('plink cmd not found\n')
q()
}
#set up the plink command
cmd=str_glue('{plinkCmd} --bfile {plinkFilePrefix} --extract {snpExtractFile} --recode A --out {outFilePrefix}')
cat('cmd:', cmd, '\n')
system(cmd)
#read the raw file
genotypeRawFile=str_glue('{outFilePrefix}.raw')
geno=fread(genotypeRawFile)
head(geno)
ncols=ncol(geno)
#number of varaints
numVarsInput=ncols-6
numVarsInput
cat('Number of input variants:', numVarsInput, '\n')
#make a vector of the input variants
varNameVec=colnames(geno)[7:ncol(geno)]
varNameVec
#merge the geno and covariate file
final_covar=merge(covar, geno, by.x = "FID", by.y = "FID")
head(final_covar)
#scale to deal with zero values
final_covar[,20:(19+numVarsInput)] = final_covar[,20:(19+numVarsInput)] +1
#colnames(final_covar)
#get the 2 pair combinations of variants
varPairList=combn(colnames(final_covar)[20:(20+numVarsInput-1)], 2, simplify = F)
varPairList
#call the regression model per each two variants
resList=lapply(varPairList, callRegress, covarDf=final_covar, inPhenoColumn='PHENO')
#length(resList)
#resList
#initialize matrix to fill in p-values for plotting
matDf = data.frame(matrix(0,ncol=numVarsInput,nrow=numVarsInput))
#matDf
colnames(matDf)=varNameVec
rownames(matDf)=varNameVec
#matDf
#loop thru the list and write the data frames ; one file per df name each output with the two varaints used
for (num in 1:length(resList))
{
output_tab = resList[[num]]
outTabName = as.character(paste(outFilePrefix,"_chr",sub(":","_",output_tab[9,1]),"_chr",sub(":","_",output_tab[10,1]),".tab", sep=""))
write.table(output_tab, file=outTabName, sep='\t', row.names = F, quote = F)
#fill the df
matDf[output_tab[9,1],output_tab[10,1]]=output_tab[11,5]
matDf[output_tab[10,1], output_tab[9,1]]=output_tab[11,5]
}
#plot
matInput=as.matrix(matDf)
fileName=str_glue('{outFilePrefix}_heatmap.png')
fileName
png(fileName)
corrplot(matInput, is.corr = FALSE, type="upper")
dev.off()