Skip to content

Commit

Permalink
rewrite clean_fasta_cdna_cds.py
Browse files Browse the repository at this point in the history
  • Loading branch information
sinamajidian authored Nov 26, 2024
1 parent 54fb18c commit 925b650
Showing 1 changed file with 82 additions and 163 deletions.
245 changes: 82 additions & 163 deletions archive/scripts/clean_fasta_cdna_cds.py
Original file line number Diff line number Diff line change
@@ -1,194 +1,113 @@

from Bio import SeqIO
from Bio.Seq import Seq
import sys
from os import listdir
from os import listdir
import os

def read_fasta_files(input_folder_faa,format_input):
files = listdir(input_folder_faa)
fa_all = []
species_name_all = [] # which are the file name of fasta files



def read_fasta_files(input_folder, format_input="fna"):

files = listdir(input_folder)
records_all = []
file_names = []
for file in files:
sp_name = file.split(".")[:-1]
if file.split(".")[-1] == format_input:
species_name_all.append(".".join(sp_name))
records_prot = list(SeqIO.parse(input_folder_faa+file, "fasta"))
fa_all.append(records_prot)
print("there are ",len(species_name_all),format_input, " files, and the first file has ",len(fa_all[0]),"sequences in it.") # , sum([len(i) for i in og_all]

return (species_name_all, fa_all)

def read_fiveLetter_species_file(input_five_letter_csv):
fiveLetter_species_dic ={}
file1 = open(input_five_letter_csv,"r")
for line in file1:
species_name, fiveLetter_species = line.strip().split("\t")
fiveLetter_species_dic[species_name] = fiveLetter_species
file1.close()
return fiveLetter_species_dic
if file.split(".")[-1] == format_input:
file_names.append(file)
records = list(SeqIO.parse(input_folder + file, "fasta"))
records_all.append(records)
else:
print("we are not reading the file "+str(input_folder+file)+" since extension is not faa.")
if records_all:
print("there are ", len(file_names), format_input, " files, and the first file has ", len(records_all[0]), "sequences in it.")
else:
print("there is no " +format_input, " files in ",input_folder)
return file_names, records_all


def write_fiveLetter_species_file(species_name_all, output_five_letter_tsv):
def create_five_letter(file_names, output_five_letter_tsv = "clean_five_letter_species.tsv"):

fiveLetter_species_dic = {}
try:
for species_name in species_name_all: # let's try to extract a code which unique from file name
fiveLetter_species= species_name.split(".")[0].split("_")[1][-5:] # GCA_000849305.1_ViralProj14697_translated_cds.faa # JN032115.1_cds_from_genomic.fna
fiveLetter_species_dic[species_name] = fiveLetter_species
except:
fiveLetter_species_dic = {}

if len(set(fiveLetter_species_dic.values())) != len(set(species_name_all)):
#"we assume the last five letter of NCBI is unique, please provide five letter code for species name as input as Read2tree works with five letter code specicies name."
fiveLetter_species_dic = {}
countr=0
for species_name in species_name_all:
fiveLetter_species= "s"+str(countr).zfill(4) #species_name.split(".")[0].split("_")[1][-5:]
fiveLetter_species_dic[species_name] = fiveLetter_species
countr+=1


file1 = open(output_five_letter_tsv,"w")
countr = 0
for file_name in file_names:
fiveLetter_species = "s" + str(countr).zfill(4)
fiveLetter_species_dic[file_name] = fiveLetter_species
countr += 1
file_out = open(output_five_letter_tsv, "w")
for species_name, fiveLetter in fiveLetter_species_dic.items():
file1.write(species_name+"\t"+fiveLetter+"\n")
file1.close()

return fiveLetter_species_dic

file_out.write(species_name + "\t" + fiveLetter + "\n")
file_out.close()
print("the five letter codes for each faa files are written in "+output_five_letter_tsv)

return fiveLetter_species_dic


def edit_record_write_fna(species_name_all_fna, fna_all, output_file_fna):
# Add the five letter species code to each record in fasta file
all_prot_fna = []
for species_idx_fna, species_record in enumerate(fna_all):
species_name_fna = species_name_all_fna[species_idx_fna]
species_name_faa = species_name_fna[:-17]+"_translated_cds"
fiveLetter_species = fiveLetter_species_dic[species_name_faa]
#print(species_name_fna, fiveLetter_species)

for prot in species_record:

prot_id_old = prot.id.split(" ")[0][3:]
# lcl|KY249672.1_prot_APW78783.1_1 [protein=NS1] [protein_id=APW78783.1] [location=99..518] [gbkey=CDS]
# >lcl|AF092942.1_cds_AAC96311.1_11
# >lcl|AF092942.1_prot_AAC96311.1_11
prot_id_old_split= prot_id_old.split("_")
try:
prot_id_old_split.remove("prot")
except:
try:
prot_id_old_split.remove("cds")
except:
print("Error: prot/cds is not inside the record id of ", prot.id.split(" ")[0])
print("we expect such format >lcl|AF092942.1_cds_AAC96311.1_11. Contact the developers.")
exit

prot_id_edit = ".".join(prot_id_old_split)

prot_id_new = fiveLetter_species+ prot_id_edit
prot.id = prot_id_new
prot.name = prot_id_new
prot.description = prot_id_new


all_prot_fna.append(prot)

SeqIO.write(all_prot_fna, output_file_fna, "fasta")
def clean_translate(records ,species_fivelet):

return all_prot_fna


def edit_record_write_faa(species_name_all_faa, faa_all, fiveLetter_species_dic, output_folder_faa,all_prot_fna_id_set):

# Add the five letter species code to each record in fasta file
for species_idx, species_record in enumerate(faa_all):
species_name = species_name_all_faa[species_idx]
fiveLetter_species = fiveLetter_species_dic[species_name]
for prot in species_record:
prot_id_old = prot.id.split(" ")[0][3:]
# lcl|KY249672.1_prot_APW78783.1_1 [protein=NS1] [protein_id=APW78783.1] [location=99..518] [gbkey=CDS]
# >lcl|AF092942.1_cds_AAC96311.1_11
# >lcl|AF092942.1_prot_AAC96311.1_11
prot_id_old_split= prot_id_old.split("_")
try:
prot_id_old_split.remove("prot")
except:
try:
prot_id_old_split.remove("cds")
except:
print("Error: prot/cds is not inside the record id of ", prot.id.split(" ")[0])
print("we expect such format >lcl|AF092942.1_cds_AAC96311.1_11. Contact the developers. ")
exit
prot_id_edit = ".".join(prot_id_old_split)

prot_id_new = fiveLetter_species+ prot_id_edit

assert prot_id_new in all_prot_fna_id_set,prot_id_old+"is not in fna file (exact match after removing _cds_ or _prot_)"
prot.id = prot_id_new
prot.name = prot_id_new
prot.description = prot_id_new


SeqIO.write(species_record, output_folder_faa+fiveLetter_species+".fa", "fasta")
records_nuc = []
records_aa = []
for record in records:
sequence = record.seq
remainder = len(sequence) % 3
if remainder != 0:
sequence +=Seq('N' * (3 - remainder))
record.seq= sequence

id_old = str(record.id).replace("_","").replace(".","")
id_new= species_fivelet + id_old

return faa_all
nuc_seq= SeqIO.SeqRecord(sequence, id=id_new, description="cleaned for r2t", name = id_new)

protein_seq = sequence.translate()
protein_seq = SeqIO.SeqRecord(protein_seq, id=id_new, description="cleaned for r2t", name = id_new)


records_nuc.append(nuc_seq)
records_aa.append(protein_seq)

print("the clean aa and nuc for "+species_fivelet+" is ready")

return records_nuc, records_aa




if __name__ == '__main__':

input_folder_faa = sys.argv[1]+"/" # "data/"
output_folder_faa = sys.argv[2]+"/" # "DB/"
if __name__ == '__main__':

output_file_fna = sys.argv[3] # "all_cdna.fa"
input_folder_fna = sys.argv[1] + "/" # "myfolder/input_fna/" #

file_names, records_all = read_fasta_files(input_folder_fna, "fna")
fiveLetter_species_dic = create_five_letter(file_names)


folder_aa= "clean_aa"

if len(sys.argv)>4:
input_five_letter_tsv = sys.argv[4]

if not os.path.exists(folder_aa):
os.makedirs(folder_aa)
else:
input_five_letter_tsv = ""
print("ERROR the folder exists "+folder_aa +" better to remove it ")

records_nuc_all_clean=[]
for idx in range(len(file_names)):
file_name = file_names[idx]
records = records_all[idx]
species_fivelet = fiveLetter_species_dic[file_name]

'''
$ cat five_letter_species.tsv
GCA_003266525.1_ASM326652v1_translated_cds 66525
GCA_000857565.1_ViralProj15251_translated_cds 57565
GCA_000849305.1_ViralProj14697_translated_cds 49305
records_nuc, records_aa = clean_translate(records ,species_fivelet)
SeqIO.write(records_aa, folder_aa+"/"+species_fivelet+".fa", "fasta")
records_nuc_all_clean += records_nuc # one big list

'''

output_five_letter_tsv = input_folder_faa+"five_letter_species.tsv" # argv[2]


input_folder_fna = input_folder_faa

(species_name_all_faa, faa_all) = read_fasta_files(input_folder_faa,"faa")
(species_name_all_fna, fna_all) = read_fasta_files(input_folder_fna,"fna")

assert len(species_name_all_faa) ==len(species_name_all_fna), "the number of faa and fna files should be the same in the folder."

assert len(faa_all) ==len(fna_all), "the number of faa and fna records should be the same."

if input_five_letter_tsv:
fiveLetter_species_dic = read_fiveLetter_species_file(input_five_letter_tsv)

else:
fiveLetter_species_dic= write_fiveLetter_species_file(species_name_all_faa, output_five_letter_tsv)



all_prot_fna = edit_record_write_fna(species_name_all_fna, fna_all, output_file_fna)
print("Edited cdna records are written to the file",output_file_fna)

all_prot_fna_recordid = [i.id for i in all_prot_fna]
all_prot_fna_id_set = set(all_prot_fna_recordid)
assert len(all_prot_fna_recordid) == len(all_prot_fna_id_set), "all record id in fna files should be unique. we consider this format when we checl" +all_prot_fna_recordid[0]


faa_all = edit_record_write_faa(species_name_all_faa, faa_all, fiveLetter_species_dic, output_folder_faa, all_prot_fna_id_set)

print("Edited protien records are written to the folder",output_folder_faa)


SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta")

print("we wrote "+str(len(file_names))+" faa fiels in the folder "+folder_aa+" and the nucluetide sequences all together in dna_ref.fa" )

print("Now you can use the folder with OMA standalone" )

0 comments on commit 925b650

Please sign in to comment.