# pool_results.py — executable script (~150 lines).
# Pools per-sample SJ.out.geneAnnotated.bed splice-junction counts into
# combined raw-count and normalized-count tables.
import os, sys, glob, math
# TODO The rpkm.tab generation was removed because of an error converting to
# int; restore it (totalMappedReads.tab / averageLength.tab inputs and the
# rpkm_final.tab output) once that is fixed. `math` is kept for that code.
#
# Usage: pool_results.py <star_root_dir> <out_dir> <total_junction_reads_tab>
#   Reads every <star_root_dir>/<sample>/SJ.out.geneAnnotated.bed, pools the
#   per-junction read counts across samples, and writes:
#     <out_dir>/readCounts.tab      raw counts, one column per sample
#     <out_dir>/readNormCounts.tab  counts per million junction reads
try:
    inFilePattern = sys.argv[1] + "/*/SJ.out.geneAnnotated.bed"
    outCountsFilePath = sys.argv[2] + "/readCounts.tab"
    outCountsNormFilePath = sys.argv[2] + "/readNormCounts.tab"
    totalJunctionReadsPath = sys.argv[3]

    # Load total mapped junction reads per sample. The first column may be a
    # path; the sample ID is its first path component. If a sample appears
    # more than once, the first entry wins (matches original behavior).
    totalJunctionReads = {}
    with open(totalJunctionReadsPath) as f:
        for line in f:
            fields = line.rstrip().split("\t")
            id_sample = fields[0].split("/")[0]
            spliced_reads = int(fields[1])
            totalJunctionReads.setdefault(id_sample, spliced_reads)

    inFilePaths = glob.glob(inFilePattern)
    print("\tpool_results.py: Pooling data...")

    metaDict = {}                       # rowID -> shared annotation columns
    countDict, countNormDict = {}, {}   # rowID -> {sampleID: value}
    headerItems = None
    samples = []
    for fileIndex, inFilePath in enumerate(inFilePaths):
        # Context manager so the file handle is closed (the original leaked it).
        with open(inFilePath) as inFile:
            inFileData = [line.rstrip().split("\t") for line in inFile]
        sampleID = os.path.basename(os.path.dirname(inFilePath))
        print("\t\tpool_results.py: Processing " + sampleID + "...")
        # Take the header from the first file only; column index 4 (the
        # per-sample count) is dropped because sample columns are appended
        # at the end of the header later.
        if fileIndex == 0:
            headerItems = inFileData.pop(0)
            headerItems = headerItems[:4] + headerItems[5:]
        else:
            inFileData.pop(0)
        samples.append(sampleID)
        if sampleID in totalJunctionReads:
            totalJunctionReads_aux = totalJunctionReads[sampleID]
        else:
            print("\t\tpool_results.py: " + sampleID + " not in STAR mapped reads")
            totalJunctionReads_aux = -1  # sentinel: normalization unavailable
        for rowItems in inFileData:
            metaItems = rowItems[:4] + rowItems[5:]
            rowID = metaItems[0]
            counts = rowItems[4]
            # Normalize counts to reads-per-million junction reads.
            # BUG FIX: `counts` is a string, so the original `counts != 0`
            # test was always True; compare the numeric value instead so a
            # zero count yields "0" rather than a normalized "0.0".
            if int(counts) != 0 and totalJunctionReads_aux != -1:
                normcounts = str((int(counts) / totalJunctionReads_aux) * 1000000)
            else:
                normcounts = "0"  # string, for consistent output typing
            if rowID not in metaDict:
                metaDict[rowID] = metaItems
            countDict.setdefault(rowID, {})[sampleID] = counts
            countNormDict.setdefault(rowID, {})[sampleID] = normcounts

    # Fail with a clear message instead of a TypeError on "\t".join(None).
    if headerItems is None:
        raise RuntimeError("No input files matched " + inFilePattern)
    headerItems.extend(sorted(samples))

    with open(outCountsFilePath, 'w') as outCountsFile, \
         open(outCountsNormFilePath, 'w') as outNormCountsFile:
        outCountsFile.write("\t".join(headerItems) + "\n")
        outNormCountsFile.write("\t".join(headerItems) + "\n")
        print("\tpool_results.py: There are %d genes to calculate" % len(metaDict))
        print("")
        i = 0
        for rowID in sorted(metaDict):
            i += 1
            sys.stdout.write("\rCurrently on: %d" % i)
            sys.stdout.flush()
            outCountsRow = list(metaDict[rowID])
            outNormCountsRow = list(metaDict[rowID])
            # Sample columns start at header index 8 (8 annotation columns
            # precede them — assumes the 9-column SJ.out.geneAnnotated.bed
            # layout minus the dropped count column; TODO confirm upstream).
            for sampleID in headerItems[8:]:
                outCountsRow.append(str(countDict[rowID].get(sampleID, "0")))
                outNormCountsRow.append(str(countNormDict[rowID].get(sampleID, "0")))
            outCountsFile.write("\t".join(outCountsRow) + "\n")
            outNormCountsFile.write("\t".join(outNormCountsRow) + "\n")
        sys.stdout.write("\t\rCurrently on: %d\n" % i)

    print("\tpool_results.py: Generated file " + outCountsFilePath)
    print("\tpool_results.py: Generated file " + outCountsNormFilePath)
except Exception as error:
    # Top-level boundary: report and abort with a non-zero exit code.
    print('\nERROR: ' + repr(error))
    print("\tpool_results.py: Aborting execution")
    sys.exit(1)