-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
54fb18c
commit 925b650
Showing
1 changed file
with
82 additions
and
163 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,194 +1,113 @@ | ||
|
||
from Bio import SeqIO | ||
from Bio.Seq import Seq | ||
import sys | ||
from os import listdir | ||
from os import listdir | ||
import os | ||
|
||
def read_fasta_files(input_folder_faa,format_input): | ||
files = listdir(input_folder_faa) | ||
fa_all = [] | ||
species_name_all = [] # which are the file name of fasta files | ||
|
||
|
||
|
||
def read_fasta_files(input_folder, format_input="fna"): | ||
|
||
files = listdir(input_folder) | ||
records_all = [] | ||
file_names = [] | ||
for file in files: | ||
sp_name = file.split(".")[:-1] | ||
if file.split(".")[-1] == format_input: | ||
species_name_all.append(".".join(sp_name)) | ||
records_prot = list(SeqIO.parse(input_folder_faa+file, "fasta")) | ||
fa_all.append(records_prot) | ||
print("there are ",len(species_name_all),format_input, " files, and the first file has ",len(fa_all[0]),"sequences in it.") # , sum([len(i) for i in og_all] | ||
|
||
return (species_name_all, fa_all) | ||
|
||
def read_fiveLetter_species_file(input_five_letter_csv): | ||
fiveLetter_species_dic ={} | ||
file1 = open(input_five_letter_csv,"r") | ||
for line in file1: | ||
species_name, fiveLetter_species = line.strip().split("\t") | ||
fiveLetter_species_dic[species_name] = fiveLetter_species | ||
file1.close() | ||
return fiveLetter_species_dic | ||
if file.split(".")[-1] == format_input: | ||
file_names.append(file) | ||
records = list(SeqIO.parse(input_folder + file, "fasta")) | ||
records_all.append(records) | ||
else: | ||
print("we are not reading the file "+str(input_folder+file)+" since extension is not faa.") | ||
if records_all: | ||
print("there are ", len(file_names), format_input, " files, and the first file has ", len(records_all[0]), "sequences in it.") | ||
else: | ||
print("there is no " +format_input, " files in ",input_folder) | ||
return file_names, records_all | ||
|
||
|
||
def write_fiveLetter_species_file(species_name_all, output_five_letter_tsv): | ||
def create_five_letter(file_names, output_five_letter_tsv = "clean_five_letter_species.tsv"): | ||
|
||
fiveLetter_species_dic = {} | ||
try: | ||
for species_name in species_name_all: # let's try to extract a code which unique from file name | ||
fiveLetter_species= species_name.split(".")[0].split("_")[1][-5:] # GCA_000849305.1_ViralProj14697_translated_cds.faa # JN032115.1_cds_from_genomic.fna | ||
fiveLetter_species_dic[species_name] = fiveLetter_species | ||
except: | ||
fiveLetter_species_dic = {} | ||
|
||
if len(set(fiveLetter_species_dic.values())) != len(set(species_name_all)): | ||
#"we assume the last five letter of NCBI is unique, please provide five letter code for species name as input as Read2tree works with five letter code specicies name." | ||
fiveLetter_species_dic = {} | ||
countr=0 | ||
for species_name in species_name_all: | ||
fiveLetter_species= "s"+str(countr).zfill(4) #species_name.split(".")[0].split("_")[1][-5:] | ||
fiveLetter_species_dic[species_name] = fiveLetter_species | ||
countr+=1 | ||
|
||
|
||
file1 = open(output_five_letter_tsv,"w") | ||
countr = 0 | ||
for file_name in file_names: | ||
fiveLetter_species = "s" + str(countr).zfill(4) | ||
fiveLetter_species_dic[file_name] = fiveLetter_species | ||
countr += 1 | ||
file_out = open(output_five_letter_tsv, "w") | ||
for species_name, fiveLetter in fiveLetter_species_dic.items(): | ||
file1.write(species_name+"\t"+fiveLetter+"\n") | ||
file1.close() | ||
|
||
return fiveLetter_species_dic | ||
|
||
file_out.write(species_name + "\t" + fiveLetter + "\n") | ||
file_out.close() | ||
print("the five letter codes for each faa files are written in "+output_five_letter_tsv) | ||
|
||
return fiveLetter_species_dic | ||
|
||
|
||
def edit_record_write_fna(species_name_all_fna, fna_all, output_file_fna): | ||
# Add the five letter species code to each record in fasta file | ||
all_prot_fna = [] | ||
for species_idx_fna, species_record in enumerate(fna_all): | ||
species_name_fna = species_name_all_fna[species_idx_fna] | ||
species_name_faa = species_name_fna[:-17]+"_translated_cds" | ||
fiveLetter_species = fiveLetter_species_dic[species_name_faa] | ||
#print(species_name_fna, fiveLetter_species) | ||
|
||
for prot in species_record: | ||
|
||
prot_id_old = prot.id.split(" ")[0][3:] | ||
# lcl|KY249672.1_prot_APW78783.1_1 [protein=NS1] [protein_id=APW78783.1] [location=99..518] [gbkey=CDS] | ||
# >lcl|AF092942.1_cds_AAC96311.1_11 | ||
# >lcl|AF092942.1_prot_AAC96311.1_11 | ||
prot_id_old_split= prot_id_old.split("_") | ||
try: | ||
prot_id_old_split.remove("prot") | ||
except: | ||
try: | ||
prot_id_old_split.remove("cds") | ||
except: | ||
print("Error: prot/cds is not inside the record id of ", prot.id.split(" ")[0]) | ||
print("we expect such format >lcl|AF092942.1_cds_AAC96311.1_11. Contact the developers.") | ||
exit | ||
|
||
prot_id_edit = ".".join(prot_id_old_split) | ||
|
||
prot_id_new = fiveLetter_species+ prot_id_edit | ||
prot.id = prot_id_new | ||
prot.name = prot_id_new | ||
prot.description = prot_id_new | ||
|
||
|
||
all_prot_fna.append(prot) | ||
|
||
SeqIO.write(all_prot_fna, output_file_fna, "fasta") | ||
def clean_translate(records ,species_fivelet): | ||
|
||
return all_prot_fna | ||
|
||
|
||
def edit_record_write_faa(species_name_all_faa, faa_all, fiveLetter_species_dic, output_folder_faa,all_prot_fna_id_set): | ||
|
||
# Add the five letter species code to each record in fasta file | ||
for species_idx, species_record in enumerate(faa_all): | ||
species_name = species_name_all_faa[species_idx] | ||
fiveLetter_species = fiveLetter_species_dic[species_name] | ||
for prot in species_record: | ||
prot_id_old = prot.id.split(" ")[0][3:] | ||
# lcl|KY249672.1_prot_APW78783.1_1 [protein=NS1] [protein_id=APW78783.1] [location=99..518] [gbkey=CDS] | ||
# >lcl|AF092942.1_cds_AAC96311.1_11 | ||
# >lcl|AF092942.1_prot_AAC96311.1_11 | ||
prot_id_old_split= prot_id_old.split("_") | ||
try: | ||
prot_id_old_split.remove("prot") | ||
except: | ||
try: | ||
prot_id_old_split.remove("cds") | ||
except: | ||
print("Error: prot/cds is not inside the record id of ", prot.id.split(" ")[0]) | ||
print("we expect such format >lcl|AF092942.1_cds_AAC96311.1_11. Contact the developers. ") | ||
exit | ||
prot_id_edit = ".".join(prot_id_old_split) | ||
|
||
prot_id_new = fiveLetter_species+ prot_id_edit | ||
|
||
assert prot_id_new in all_prot_fna_id_set,prot_id_old+"is not in fna file (exact match after removing _cds_ or _prot_)" | ||
prot.id = prot_id_new | ||
prot.name = prot_id_new | ||
prot.description = prot_id_new | ||
|
||
|
||
SeqIO.write(species_record, output_folder_faa+fiveLetter_species+".fa", "fasta") | ||
records_nuc = [] | ||
records_aa = [] | ||
for record in records: | ||
sequence = record.seq | ||
remainder = len(sequence) % 3 | ||
if remainder != 0: | ||
sequence +=Seq('N' * (3 - remainder)) | ||
record.seq= sequence | ||
|
||
id_old = str(record.id).replace("_","").replace(".","") | ||
id_new= species_fivelet + id_old | ||
|
||
return faa_all | ||
nuc_seq= SeqIO.SeqRecord(sequence, id=id_new, description="cleaned for r2t", name = id_new) | ||
|
||
protein_seq = sequence.translate() | ||
protein_seq = SeqIO.SeqRecord(protein_seq, id=id_new, description="cleaned for r2t", name = id_new) | ||
|
||
|
||
records_nuc.append(nuc_seq) | ||
records_aa.append(protein_seq) | ||
|
||
print("the clean aa and nuc for "+species_fivelet+" is ready") | ||
|
||
return records_nuc, records_aa | ||
|
||
|
||
|
||
|
||
if __name__ == '__main__': | ||
|
||
input_folder_faa = sys.argv[1]+"/" # "data/" | ||
output_folder_faa = sys.argv[2]+"/" # "DB/" | ||
if __name__ == '__main__': | ||
|
||
output_file_fna = sys.argv[3] # "all_cdna.fa" | ||
input_folder_fna = sys.argv[1] + "/" # "myfolder/input_fna/" # | ||
|
||
file_names, records_all = read_fasta_files(input_folder_fna, "fna") | ||
fiveLetter_species_dic = create_five_letter(file_names) | ||
|
||
|
||
folder_aa= "clean_aa" | ||
|
||
if len(sys.argv)>4: | ||
input_five_letter_tsv = sys.argv[4] | ||
|
||
if not os.path.exists(folder_aa): | ||
os.makedirs(folder_aa) | ||
else: | ||
input_five_letter_tsv = "" | ||
print("ERROR the folder exists "+folder_aa +" better to remove it ") | ||
|
||
records_nuc_all_clean=[] | ||
for idx in range(len(file_names)): | ||
file_name = file_names[idx] | ||
records = records_all[idx] | ||
species_fivelet = fiveLetter_species_dic[file_name] | ||
|
||
''' | ||
$ cat five_letter_species.tsv | ||
GCA_003266525.1_ASM326652v1_translated_cds 66525 | ||
GCA_000857565.1_ViralProj15251_translated_cds 57565 | ||
GCA_000849305.1_ViralProj14697_translated_cds 49305 | ||
records_nuc, records_aa = clean_translate(records ,species_fivelet) | ||
SeqIO.write(records_aa, folder_aa+"/"+species_fivelet+".fa", "fasta") | ||
records_nuc_all_clean += records_nuc # one big list | ||
|
||
''' | ||
|
||
output_five_letter_tsv = input_folder_faa+"five_letter_species.tsv" # argv[2] | ||
|
||
|
||
input_folder_fna = input_folder_faa | ||
|
||
(species_name_all_faa, faa_all) = read_fasta_files(input_folder_faa,"faa") | ||
(species_name_all_fna, fna_all) = read_fasta_files(input_folder_fna,"fna") | ||
|
||
assert len(species_name_all_faa) ==len(species_name_all_fna), "the number of faa and fna files should be the same in the folder." | ||
|
||
assert len(faa_all) ==len(fna_all), "the number of faa and fna records should be the same." | ||
|
||
if input_five_letter_tsv: | ||
fiveLetter_species_dic = read_fiveLetter_species_file(input_five_letter_tsv) | ||
|
||
else: | ||
fiveLetter_species_dic= write_fiveLetter_species_file(species_name_all_faa, output_five_letter_tsv) | ||
|
||
|
||
|
||
all_prot_fna = edit_record_write_fna(species_name_all_fna, fna_all, output_file_fna) | ||
print("Edited cdna records are written to the file",output_file_fna) | ||
|
||
all_prot_fna_recordid = [i.id for i in all_prot_fna] | ||
all_prot_fna_id_set = set(all_prot_fna_recordid) | ||
assert len(all_prot_fna_recordid) == len(all_prot_fna_id_set), "all record id in fna files should be unique. we consider this format when we checl" +all_prot_fna_recordid[0] | ||
|
||
|
||
faa_all = edit_record_write_faa(species_name_all_faa, faa_all, fiveLetter_species_dic, output_folder_faa, all_prot_fna_id_set) | ||
|
||
print("Edited protien records are written to the folder",output_folder_faa) | ||
|
||
|
||
SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta") | ||
|
||
print("we wrote "+str(len(file_names))+" faa fiels in the folder "+folder_aa+" and the nucluetide sequences all together in dna_ref.fa" ) | ||
|
||
print("Now you can use the folder with OMA standalone" ) |