-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistic.py
119 lines (105 loc) · 4.19 KB
/
statistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
import re
import logging
import argparse
import sys
LOG = logging.getLogger(__name__)
__version__ = "1.0.1" #设置版本信息
__author__ = ("Boya Xu",) #输入作者信息
__email__ = "[email protected]"
__all__ = []
def add_help_args(parser): #帮助函数
parser.add_argument('-f', default="", type=str, help='fasta file')
parser.add_argument('-a', default="", type=str, help='Annotation file')
parser.add_argument('--out', '-o', type=str, default="", help="out put file")
return parser
def process_file(reader):
'''Open, read,and print a file'''
names=[]
index=0
dict={}
for line in reader:
if line.startswith('>'):
if index >=1:
names.append(line)
index =index+1
name=line[:-1]
seq = ''
else:
seq +=line[:-1]
dict[name]=seq
return dict
def statistic(gff, genomic, outfile):
#统计Gene CDS rRNA tRNA信息的主函数
sum_g, num_g, sum_r, sum_c, num_r, sum_t, num_t, num_c = 0, 0, 0, 0, 0, 0, 0, 0
length_g, length_r, length_t, length_c, length = 0, 0, 0, 0, 0
m = 1
input_file = open(genomic, "r")
reader = input_file.readlines()
items = process_file(reader)
for key in items:
length = int(len(items[key]))
with open(gff) as file_object:
for line in file_object:
if line.startswith('#'):
if m == 0:
num_g -= 1
sum_g -= length_g
else:
m = 0
continue
else:
w_1 = line
p1 = re.compile(r'[a-zA-Z]+')
w_1_1 = p1.findall(w_1)
p2 = re.compile(r'\d+')
w_1_2 = p2.findall(w_1)
if w_1_1[2] == 'CDS':
length_c = int(w_1_2[2]) - int(w_1_2[1]) + 1
sum_c += length_c
num_c += 1
m += 1
elif w_1_1[2] == 'gene':
length_g = int(w_1_2[2]) - int(w_1_2[1]) + 1
sum_g += length_g
num_g += 1
elif w_1_1[2] == 'rRNA':
length_r = int(w_1_2[2]) - int(w_1_2[1]) + 1
sum_r += length_r
num_r += 1
elif w_1_1[2] == 'tRNA':
length_t = int(w_1_2[2]) - int(w_1_2[1]) + 1
sum_t += length_t
num_t += 1
else:
continue
len_g_p = round((sum_g * 1.00) / num_g, 2)
len_r_p = round((sum_r * 1.00) / num_r, 2)
len_t_p = round((sum_t * 1.00) / num_t, 2)
len_c_p = round((sum_c * 1.00) / num_c, 2)
percentage_g = round((sum_g * 100.00)/length, 2)
percentage_t = round((sum_t * 100.00)/length, 2)
percentage_r = round((sum_r * 100.00)/length, 2)
percentage_c = round((sum_c * 100.00)/length, 2)
site_g = {"gene": "gene\t{}\t{}\t{}\t{}%".format(sum_g, num_g, len_g_p, percentage_g)}
site_c = {"CDS": "CDS\t{}\t{}\t{}\t{}%".format(sum_c, num_c, len_c_p, percentage_c)}
site_r = {"rRNA": "rRNA\t{}\t{}\t{}\t{}%".format(sum_r, num_r, len_r_p, percentage_r)}
site_t = {"tRNA": "tRNA\t{}\t{}\t{}\t{}%".format(sum_t, num_t, len_t_p, percentage_t)}
f = open(outfile, 'w+')
output = 'class\ttotal\tnumber\taverage\tpercentage\n{}\n{}\n{}\n{}\n'
output = output.format(site_g['gene'], site_c['CDS'], site_r['rRNA'], site_t['tRNA'])
f.write(output)
f.close()
return
def main(): #主函数,执行函数
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(message)s")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description='''
name:statistic.py -- 统计分析GeSeq软件注释结果
attention: python statistic.py -f genomic.fasta -a genomic.gff3 -o out
version: %s
contact: %s <%s>\
''' % (__version__, ' '.join(__author__), __email__))
args = add_help_args(parser).parse_args()
statistic(args.a, args.f, args.out)
if __name__ == "__main__": #固定格式,使 import 到其他的 python 脚本中被调用(模块重用)执行
main()