-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcatConsenseGeneflow.py
86 lines (67 loc) · 2.71 KB
/
catConsenseGeneflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python
import sys
import os.path
import argparse
import re
import logging
import warnings
import csv
import glob
## Function: Test for readable directory
def readable_dir(prospective_dir):
    """Validate a directory argument for argparse and normalise its form.

    Intended for use as an argparse ``type=`` callable.

    Args:
        prospective_dir: Path string supplied on the command line.

    Returns:
        The same path, guaranteed to end with a single trailing "/" so a
        file pattern can be appended to it directly.

    Raises:
        argparse.ArgumentTypeError: If the path is not an existing
            directory, or the current user cannot list and enter it.
    """
    if not os.path.isdir(prospective_dir):
        raise argparse.ArgumentTypeError("readable_dir:{0} is not a valid path".format(prospective_dir))
    # R_OK alone only permits listing the directory. Opening the files inside
    # it also needs search (execute) permission, so test both up front rather
    # than failing later when the first file is opened.
    if not os.access(prospective_dir, os.R_OK | os.X_OK):
        raise argparse.ArgumentTypeError("readable_dir:{0} is not a readable dir".format(prospective_dir))
    if not prospective_dir.endswith("/"):
        prospective_dir = prospective_dir + "/"
    return prospective_dir
## Function: Filename extractor from filepath
def getIsolateID(filePathString):
    """Derive an isolate identifier from a "/"-separated file path.

    The identifier is the file name up to its first ".". If that stem is
    shorter than 10 characters, the first path component (again up to its
    first ".") is used instead, provided it is non-empty.

    Args:
        filePathString: Path to a FASTA file, using "/" as separator.

    Returns:
        The isolate identifier string.
    """
    components = filePathString.split('/')
    isolateID = components[-1].split('.')[0]
    if len(isolateID) < 10:
        # Short file stem: fall back to the leading path component.
        # For absolute ("/x/y.fa") or "./"-prefixed paths that component is
        # empty, which would produce a FASTA header with no name, so keep the
        # file stem in that case.
        fallbackID = components[0].split('.')[0]
        if fallbackID:
            isolateID = fallbackID
    return isolateID
# Script-wide logging: messages at INFO and above go through a stream handler,
# each prefixed with timestamp, logger name and level.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

logger = logging.getLogger("catConsenseGeneflow.py")
logger.addHandler(ch)
logger.setLevel(logging.INFO)
## Default: reads a folder full of files with suffix *.consensus.fa from Geneflow output
parser = argparse.ArgumentParser(
    description='reads *consensus.fa or *recode.fasta files in input folder',
    usage="catConsenseGeneflow.py --inDir input_folder",
)
# --inDir is mandatory: without it args.inDir would be None and the script
# would fail later with a TypeError when the file pattern is appended to it.
# readable_dir validates the folder and guarantees a trailing "/".
parser.add_argument(
    '--inDir',
    type=readable_dir,
    action='store',
    required=True,
    help="folder containing the input FASTA files",
)
## for reading .fasta files output by recodeVCFtoConsensus_pipe.py
parser.add_argument(
    '--recode', '-re',
    default='N',
    choices=['Y', 'N'],
    help="read folder full of files with suffix, *.recode.fasta",
)
args = parser.parse_args()
from os import listdir
from os.path import isfile, join
## Concatenate every matching FASTA file in --inDir into a multi-FASTA on
## stdout: one ">isolateID" header plus one single-line sequence per file.

# --recode picks which file suffix to collect; argparse restricts the value
# to 'Y' or 'N', so anything that is not 'Y' is the default consensus mode.
if args.recode == 'Y':
    fileSuffix = "*recode.fasta"
else:
    fileSuffix = "*consensus.fa"

# glob returns matches in arbitrary order; sort so the record order of the
# output is reproducible between runs and machines.
onlyFiles = sorted(glob.glob(args.inDir + fileSuffix))
if not onlyFiles:
    logger.warning("no files matching %s found in %s", fileSuffix, args.inDir)

# A line counts as sequence when it starts with A, T, C, G or N (any case).
# This skips the original ">" header lines and blank lines.
# NOTE(review): lines that begin with any other character (e.g. other IUPAC
# ambiguity codes or "-") are dropped as well -- confirm this is intended.
sequenceLine = re.compile(r'^(A|T|C|G|N)+', re.IGNORECASE)

for fas in onlyFiles:
    with open(fas, 'rt') as my_file_handle:
        seqChunks = [
            fasLine.rstrip()
            for fasLine in my_file_handle
            if sequenceLine.match(fasLine)
        ]
    ## special print, cat multifasta without internal '>'
    print(">" + getIsolateID(fas))
    # Join once instead of growing a string line by line.
    print("".join(seqChunks))