-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmergeVtFiles.pl
executable file
·240 lines (212 loc) · 7.37 KB
/
mergeVtFiles.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/perl
#Rajan Banerjee reb2143
#This script will take the encrypted files generated from Alice and Bob and will merge the files.
#This script will be run by Trevor
my $num_dirs=30;
my $src =""; #The directory of one person's numbered gene directories
my $output=""; #A trusted third party's directory that will contain the merged numbered gene directories
my @array=@ARGV;
my $length = scalar(@array);
$! = "Inputted file will not open";
for($i=0; $i<=($length-1); $i++){
if($array[$i] eq -src) {
$i++;
$src ="$array[$i]";
$src=~s/\/$//; #removes the last / of the directory path, if its given.
}
if($array[$i] eq -out){
$i++;
$output="$array[$i]";
$output=~s/\/$//;
}
}
@dir_list = `find $src/* -maxdepth 0 -type d`;
chomp @dir_list;
$num = scalar(@dir_list);
my %h; #The hash of gene names found in the first directory
for (my $it=0; $it < $num; $it++) {
open(info, "$dir_list[$it]"."/info.txt");
while(1) {
binmode info; my $gene_name = "";
if(($n = read info, $gene_name, 8)!=8) {
if($n == 0) {last;}
else { die "Gene name not in required format\n";}
}
my $sep = "";
if(($n = read info, $sep, 1)!=1) {
die "No proper separator after $gene_name. Cannot map\n"; }
chomp $sep;
if(!($sep eq " ")) {
die "No proper tab separator after $gene_name. Cannot map\n"; }
my $s = "";my $i = 0;my @gene = "";
while(($n = read info, $s, 1)!=0) {
if($s ne "\n") {
$gene[$i++] = $s;
} else { last;}
}
if($n == 0) {
die "No proper separator after @gene. Cannot map\n"; }
my $gene_loc = join('', @gene);
if(!exists($h{$gene_name}{$it})){
$h{$gene_name}{$it}=$gene_loc; #a gene name maps to its numbered directory in dir1
} else {
print "Why are there duplicate genes in your info file?!\n"; #there should not be duplicates
}
}
close info;
}
my @geneCount = ((0) x $num_dirs);
open infoOut, ">", "$output"."/info.txt"; #creates a new info file mapping the merged list of genes to a numbered directory
for(my $rdir=0;$rdir<$num_dirs;$rdir++) {
system("mkdir -p $output/DIR_$rdir");
}
#print scalar(keys %h);
my $complete = 0;
my $thresh = 0;
foreach $Gene (sort keys %h){
if(scalar(keys(%{$h{$Gene}}))==1){
formatFiles($h{$Gene});
} else {
mergeFiles($h{$Gene}); #merge the files
}
$complete ++;
if($complete > $thresh) {
print "Merging $complete complete\n";
$thresh+= 1000;
}
}
close infoOut;
sub formatFiles{
my $par = shift;
foreach $ind (keys %{$par}) {
my $index = $ind;
my $dir = $dir_list[$index];
my $filenum = $par->{$index};
my %position; #hash of the encrypted positions
my %person; #hash of the encrypted ids
my $snvCount=0;
my $indCount =0;
my $random_number = int(rand($num_dirs));
$geneCount[$random_number]++;
my $output_dir = $output."/DIR_$random_number/gene".$geneCount[$random_number];
print infoOut "$Gene\t$output_dir\n";
#if (! -d $output_dir) { system("mkdir -p $output_dir"); }#, 0777; chmod 0777, $output_dir;
#else { `rm -rf $output_dir/*`;}
open genotypeOut, ">", "$output_dir.data.geno" or die "help!3";
open phenotypeOut, ">", "$output_dir.data.pheno" or die "help!6";
open weightsOut, ">", "$output_dir.data.wt" or die "help!9";
open genotypeIn, "$dir/$filenum.data.geno";
open phenotypeIn, "$dir/$filenum.data.pheno";
open weightsIn, "$dir/$filenum.data.wt";
while(<weightsIn>){ #this is the polyphen weight file which is: encryptedposition\tscore
my $line = $_;
chomp $line;
my @a = split(/\t/,$line);
if(!exists($position{$a[0]})){ #creates a hash of the encrypted position
$snvCount++;
$position{$a[0]}="$snvCount"."\t"."$a[1]"; #$a[1] is the polyphen weight
print weightsOut "$position{$a[0]}\n"; #rewrites it into an acceptable format
} else {
print "There are duplicate SNPs in gene $filenum! Is there a problem with these files?\n"; #This shouldn't print, otherwise there is a problem!
}
}
while(<phenotypeIn>){
my $line =$_;
chomp $line;
my @a = split(/\t/, $line);
if(!exists($person{$a[0]})){ #creates a hash of encrypted id's
$indCount++;
$person{$a[0]}="$indCount"."\t"."$a[1]";
print phenotypeOut "$person{$a[0]}\n"; #rewrites to an accepted format
} else {
print "There are duplicate ID's in gene $filenum! Is there a problem with these files?\n";
}
}
while(<genotypeIn>){
my $line = $_;
chomp $line;
my @a = split(/\t/,$line);
if(!exists($person{$a[0]})||!exists($position{$a[1]})){ #checks that the position and id in the genotype file do exist in the other two files
print "There are ID's or SNV's in the genotype file of gene $filenum not found in other files of $dir! Is there a problem with these files?\n";
} else {
my @b= split(/\t/, $person{$a[0]});
my @c = split(/\t/, $position{$a[1]});
print genotypeOut "$b[0]\t$c[0]\t$a[2]\n"; #prints it in an accepted format, keeping the same numeric substitution for the encrypted positions and ids
}
}
close genotypeIn;
close genotypeOut;
close phenotypeIn;
close phenotypeOut;
close weightsIn;
close weightsOut;
}
}
sub mergeFiles{
#print "Merge\n";
my $par = shift;
my $random_number = int(rand($num_dirs));
$geneCount[$random_number]++;
my $output_dir = $output."/DIR_$random_number/gene".$geneCount[$random_number];
print infoOut "$Gene\t$output_dir\n";
#if (! -d $output_dir) { system("mkdir -p $output_dir"); }#, 0777; chmod 0777, $output_dir;
#else { `rm -rf $output_dir/*`;}
open genotypeOut, ">", "$output_dir.data.geno" or die "help!3";
open phenotypeOut, ">", "$output_dir.data.pheno" or die "help!6";
open weightsOut, ">", "$output_dir.data.wt" or die "help!9";
my %position;
my $snvCount=0;
my $indCount =0;
foreach $ind (keys %{$par}) {
my $index = $ind;
my $dir = $dir_list[$index];
my $filenum = $par->{$index};
my %person;
open genotypeIn, "<", "$dir/$filenum.data.geno" or die "help!1";
open phenotypeIn, "<", "$dir/$filenum.data.pheno" or die "help!4";
open weightsIn, "<", "$dir/$filenum.data.wt" or die "help!7";
#This is the same in the formatFiles subfunction
while(<weightsIn>){
my $line = $_;
chomp $line;
my @a = split(/\t/,$line);
if(!exists($position{$a[0]})){
$snvCount++;
$position{$a[0]}="$snvCount"."\t"."$a[1]"; #$a[1] is the polyphen weight
print weightsOut "$position{$a[0]}\n";
}
}
#The next two are similar to the two above, but with the Id's instead of the positions.
while(<phenotypeIn>){
my $line =$_;
chomp $line;
my @a = split(/\t/, $line);
if(!exists($person{$a[0]})){
$indCount++;
$person{$a[0]}="$indCount"."\t"."$a[1]";
print phenotypeOut "$person{$a[0]}\n";
} else {
print "There are duplicate ID's in gene $filenum! Is there a problem with these files?\n";
}
}
#These next two while loops are similar to the loop in the formatFiles subfunction
while(<genotypeIn>){
my $line = $_;
chomp $line;
my @a = split(/\t/,$line);
if(!exists($person{$a[0]})||!exists($position{$a[1]})){
print "There are ID's or SNV's in the genotype file of gene $filenum not found in other files! Is there a problem with these files?\n";
} else {
my @b= split(/\t/, $person{$a[0]});
my @c = split(/\t/, $position{$a[1]});
print genotypeOut "$b[0]\t$c[0]\t$a[2]\n";
}
}
close genotypeIn;
close phenotypeIn;
close weightsIn;
}
close genotypeOut;
close phenotypeOut;
close weightsOut;
}