-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathFile_splitter.py
executable file
·155 lines (122 loc) · 5.58 KB
/
File_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
@authors: Juan L. Trincado
@email: [email protected]
File_splitter.py: Split an input file into n pieces and generate a command for Format_genotype.py for
each piece
"""
# V2: Identical to V1, but adapted to run Format_genotype_v4.py
import pandas as pd
import logging
# Module-level logger; records below INFO are dropped by the logger itself.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler. Its own threshold is DEBUG, so the logger's INFO level
# above is the one that effectively filters what gets printed.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

# Attach the configured handler to the logger.
logger.addHandler(ch)
from argparse import ArgumentParser, RawTextHelpFormatter
# Command-line interface of the splitter.
# argparse only enforces -i, -n, -s and -e. main() additionally needs -o, -g
# and -m (and -p when -e is J) because it builds paths and commands from them.
description = (
    "Description:\n\n"
    "This script splits an input file into n pieces"
)
parser = ArgumentParser(description=description, formatter_class=RawTextHelpFormatter,
                        add_help=True)
parser.add_argument("-i", "--input", required=True,
                    help="Input file")
parser.add_argument("-n", "--n", required=True,
                    help="Number of pieces")
parser.add_argument("-p", "--phenotype", required=False,
                    help="Psi file")
parser.add_argument("-g", "--genotype", required=False,
                    help="Genotype file")
parser.add_argument("-o", "--output", required=False,
                    help="Output folder")
parser.add_argument("-s", "--script", required=True,
                    help="Location of the script to run")
parser.add_argument("-m", "--mode", required=False,
                    help="Mode of execution (SCLC or TCGA)")
parser.add_argument("-e", "--execution", required=True,
                    help="Execution options (J for Junctions, IR for Intron Retention)")
def _piece_bounds(total_rows, n):
    """Return the ordered list of (start, end) row ranges, one per piece.

    The first ``n`` ranges each span ``total_rows // n`` rows (possibly zero
    rows when ``n`` exceeds ``total_rows``). If rows are left over, one extra
    trailing range (piece number ``n``) holds the remainder.
    """
    size = total_rows // n
    bounds = [(i * size, (i + 1) * size) for i in range(n)]
    if n * size < total_rows:
        bounds.append((n * size, total_rows))
    return bounds


def _build_command(execution, script, mode, phenotype, genotype, output,
                   piece_path, index):
    """Return the newline-terminated command that processes one piece."""
    vcf_path = output + "/formatted_genotype.vcf.part" + str(index)
    if execution == "J":
        return ("python3.4 " + script + " -" + mode + " -p " + phenotype +
                " -g " + genotype + " -c " + piece_path +
                " -o " + vcf_path +
                " -i " + output + "/ids_not_found.txt.part" + str(index) + "\n")
    # execution == "IR": the piece itself is passed as the -p argument.
    return ("python " + script + " -" + mode + " -p " + piece_path +
            " -g " + genotype + " -o " + vcf_path + "\n")


def main():
    """Split the input table into pieces and write one command per piece.

    Reads the options from the module-level ``parser``, loads the
    tab-separated input file, writes each piece to
    ``<output>/<input file name>.part<i>`` and writes
    ``<output>/commands_splitter_<n>.txt`` with one command line per piece.

    All options are validated before anything is written, so a bad invocation
    leaves the output folder untouched.

    Raises:
        SystemExit: always. Code 0 on success; code 1 after logging the error
            when anything fails.
    """
    args = parser.parse_args()
    try:
        input_path = args.input
        output = args.output
        n = int(args.n)
        phenotype = args.phenotype
        genotype = args.genotype
        script = args.script
        mode = args.mode
        execution = args.execution

        # Validate up front: fail with a clear message instead of an opaque
        # TypeError/ZeroDivisionError half-way through writing the output.
        if execution not in ("J", "IR"):
            raise ValueError("Set one of the execution options (-e J or -e IR)")
        if n < 1:
            raise ValueError("The number of pieces (-n) must be a positive integer")
        needed = [("-o/--output", output), ("-g/--genotype", genotype),
                  ("-m/--mode", mode)]
        if execution == "J":
            needed.append(("-p/--phenotype", phenotype))
        missing = [flag for flag, value in needed if value is None]
        if missing:
            raise ValueError("Missing required option(s): " + ", ".join(missing))

        logger.info("File "+input_path+" will be splitted in "+str(n)+" pieces")

        # 1. Load the input file
        logger.info("Loading "+input_path+"...")
        input_file = pd.read_table(input_path, sep="\t")
        file_name = input_path.split("/")[-1]

        # 2. Split the file, generating also a file with the commands to be
        #    executed in the cluster
        logger.info("Splitting the file...")
        output_commands_path = output + "/commands_splitter_"+str(n)+".txt"
        # IR pieces keep the DataFrame index as a leading column; J pieces do not.
        keep_index = (execution == "IR")
        with open(output_commands_path, 'w') as commands_file:
            for index, (start, end) in enumerate(_piece_bounds(len(input_file), n)):
                if index == n:
                    # Leftover rows that did not fit in the n equal pieces.
                    logger.info("Extra piece number "+str(index))
                piece_path = output + "/" + file_name + ".part" + str(index)
                input_file.iloc[start:end].to_csv(piece_path, sep="\t",
                                                  index=keep_index)
                commands_file.write(_build_command(execution, script, mode,
                                                   phenotype, genotype, output,
                                                   piece_path, index))

        logger.info("Saved " + output_commands_path)
        logger.info("Done. Exiting program. ")

    except Exception as error:
        logger.error(repr(error))
        logger.error("Aborting execution")
        raise SystemExit(1)

    raise SystemExit(0)
# Run the splitter only when this file is executed as a script.
if __name__ == "__main__":
    main()