Skip to content

Commit

Permalink
adding sequence checks
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Mar 7, 2018
1 parent 0107994 commit 090f5d4
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 148 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ adrsm -d ./data/genomes ./data/short_genome_list.csv
```
maxime@gph:~$ adrsm --help
usage: ADRSM [-h] [-d DIRECTORY] [-r READLENGTH] [-l LENSTDEV] [-fwd FWDADAPT]
[-rev REVADAPT] [-e ERROR] [-o OUTPUT] [-s STATS]
[-rev REVADAPT] [-e ERROR] [-o OUTPUT] [-q QUALITY] [-s STATS]
confFile
Ancient DNA Read Simulator for Metagenomics
Expand All @@ -50,6 +50,7 @@ optional arguments:
AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
-e ERROR Illumina sequencing error. Default = 0.01
-o OUTPUT Output file basename. Default = ./metagenome.*
-q QUALITY Base quality encoding. Default = d (PHRED+64)
-s STATS Statistic file. Default = stats.csv
```
Expand Down
13 changes: 10 additions & 3 deletions adrsm
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ def _get_args():
default="metagenome",
help="Output file basename. Default = ./metagenome.*")
parser.add_argument(
'-q',
dest="quality",
default="d",
help="Base quality encoding. Default = d (PHRED+64)")
parser.add_argument(
'-s',
dest="stats",
default="stats.csv",
Expand All @@ -62,9 +67,10 @@ def _get_args():
a2 = args.revAdapt
err = float(args.error)
outfile= args.output
quality = args.quality
stats = args.stats

return(infile, gendir, readlen, lendev, a1, a2, err, outfile, stats)
return(infile, gendir, readlen, lendev, a1, a2, err, outfile, quality, stats)

def read_config(infile, gendir):
"""
Expand All @@ -83,7 +89,7 @@ def read_config(infile, gendir):
return(genomes)

if __name__ == "__main__":
INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, STATS = _get_args()
INFILE, GENDIR, READLEN, LENDEV, A1, A2, ERR, OUTFILE, QUALITY, STATS = _get_args()
MINLENGTH = 20

genome_dict = {}
Expand All @@ -100,7 +106,8 @@ if __name__ == "__main__":
A2 = A2,
MINLENGTH = MINLENGTH,
ERR = ERR,
fastq_dict = genome_dict)
fastq_dict = genome_dict,
QUALITY=QUALITY)
stat_dict[ad.get_basename(agenome)] = stat_and_run

ad.write_fastq_multi(fastq_dict=genome_dict, outputfile=OUTFILE)
Expand Down
80 changes: 0 additions & 80 deletions adrsm_single

This file was deleted.

90 changes: 26 additions & 64 deletions lib/adrsmlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from numpy import random as npr

QUALITY = "G"


def get_basename(file_name):
if ("/") in file_name:
Expand All @@ -18,7 +18,7 @@ def reverse_complement(dna) :
'''
dna = dna[::-1]
revcom = []
complement = {"A" : "T", "T" : "A" , "G" : "C" , "C" : "G"}
complement = {"A" : "T", "T" : "A" , "G" : "C" , "C" : "G", "N": "N"}
for letter in dna :
for key in complement.keys() :
if letter == key :
Expand Down Expand Up @@ -66,25 +66,37 @@ def complement_read(all_inserts, adaptor, read_length):
read = insert
elif inlen > read_length:
read = insert[0:read_length]
result.append(read)
if len(read) == read_length:
read = read.upper()
read = list(read)
for j in range(0, len(read)):
if read[j] not in ["A","T","G","C","N"]:
read[j] = "N"
result.append("".join(read))
return(result)

def add_error(all_reads, error_rate):
    """Sanitize reads and inject random substitution errors, in place.

    Every base whose upper-cased form is not one of A, T, G, C or N is
    replaced by "N". Then, independently for each position, the base is
    overwritten with a letter drawn uniformly from A/T/G/C with probability
    ``error_rate``.

    Args:
        all_reads: list of read strings; the list is modified in place.
        error_rate: per-base probability of drawing a replacement base.

    Returns:
        The same ``all_reads`` list object, with every element rewritten.
    """
    valid_bases = frozenset("ATGCN")
    substitutes = ["A", "T", "G", "C"]
    for i, sequence in enumerate(all_reads):
        bases = list(sequence)
        for j, base in enumerate(bases):
            if base.upper() not in valid_bases:
                bases[j] = "N"
            # The draw can return the base that is already there, so the
            # observed mismatch rate is lower than error_rate.
            if npr.random() < error_rate:
                bases[j] = npr.choice(substitutes)
        all_reads[i] = "".join(bases)
    return all_reads

def prepare_fastq(fastq_dict, fwd_reads, rev_reads, basename, read_length, quality):
    """Format paired reads as FASTQ records and store them under ``basename``.

    ``fastq_dict[basename]`` is (re)initialised to a two-element list:
    index 0 collects the forward (``/1``) records and index 1 the reverse
    (``/2``) records. Reads are paired with ``zip``, so surplus reads in the
    longer of the two inputs are ignored. Records are numbered from 1.

    Args:
        fastq_dict: dict updated in place with the formatted records.
        fwd_reads: iterable of forward read sequences.
        rev_reads: iterable of reverse read sequences.
        basename: identifier used as dict key and as read-name prefix.
        read_length: nominal read length; kept for interface compatibility
            but not used, because each quality string is sized from its own
            read.
        quality: quality symbol repeated once per base; should be a single
            character so that sequence and quality lengths agree.

    Returns:
        The same ``fastq_dict`` object.
    """
    fwd_records = []
    rev_records = []
    fastq_dict[basename] = [fwd_records, rev_records]
    for cnt, (read1, read2) in enumerate(zip(fwd_reads, rev_reads), start=1):
        # Strip trailing whitespace first so the quality line is exactly as
        # long as the sequence line that gets written.
        read1 = read1.rstrip()
        read2 = read2.rstrip()
        name = "@" + basename + "_" + str(cnt)
        fwd_records.append(
            name + "/1" + "\n" + read1 + "\n+\n" + quality * len(read1) + "\n")
        rev_records.append(
            name + "/2" + "\n" + read2 + "\n+\n" + quality * len(read2) + "\n")
    return fastq_dict
Expand All @@ -98,64 +110,8 @@ def write_fastq_multi(fastq_dict, outputfile):
for reads2 in fastq_dict[akey][1]:
f2.write(reads2)

def write_fastq(all_reads, basename, orientation, read_length, outfile):
    """Write reads as FASTQ records to ``<prefix>.<orientation>.fastq``.

    The file prefix is ``outfile`` when it is truthy, otherwise ``basename``.
    Every record uses ``@<basename>`` as its header line and a quality line
    made of the module-level ``QUALITY`` symbol.

    Args:
        all_reads: iterable of read sequences to write.
        basename: record header name, and file prefix when ``outfile`` is
            empty or None.
        orientation: value inserted in the file name (e.g. 1 or 2).
        read_length: nominal read length; kept for interface compatibility
            but not used, because the quality line is sized from each read
            so that records stay valid when a read is shorter.
        outfile: optional output file prefix.
    """
    prefix = outfile if outfile else basename
    with open(prefix + "." + str(orientation) + ".fastq", "w") as fw:
        for read in all_reads:
            fw.write("@" + basename + "\n")
            fw.write(read + "\n")
            fw.write("+\n")
            # FASTQ requires as many quality symbols as bases.
            fw.write(QUALITY * len(read) + "\n")



def run_read_simulation(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, OUTFILE, MINLENGTH, ERR):
    """Simulate paired-end reads from one FASTA file and write two FASTQ files.

    Insert lengths are drawn from a normal distribution with mean
    ``INSERLEN`` and standard deviation ``LENDEV``. Inserts are sampled with
    ``random_insert``, the reverse mates are their reverse complements, both
    mates are completed with their adaptor via ``complement_read``, and
    errors are added with ``add_error`` before writing.

    Args:
        INFILE: path of the input FASTA file.
        NREAD: number of read pairs to simulate when ``COV`` is falsy.
        COV: when truthy, the number of read pairs is derived from the
            FASTA instead of ``NREAD``.
        READLEN: read length.
        INSERLEN: mean insert length.
        LENDEV: standard deviation of the insert length.
        A1: adaptor passed to ``complement_read`` for forward reads.
        A2: adaptor passed to ``complement_read`` for reverse reads.
        OUTFILE: output prefix handed to ``write_fastq``.
        MINLENGTH: minimum length handed to ``random_insert``.
        ERR: per-base error rate handed to ``add_error``.
    """
    print("INFILE: ", INFILE)
    if COV:
        print("COV: ", COV)
    else:
        print("NREAD: ", NREAD)
    print("READLEN: ", READLEN)
    print("INSERLEN: ", INSERLEN)
    print("LENDEV: ", LENDEV)
    print("A1: ", A1)
    print("A2: ", A2)
    print("OUTFILE: ", OUTFILE)

    basename = get_basename(INFILE)
    fasta = read_fasta(INFILE)

    if COV:
        # NOTE(review): COV only acts as a switch here; its value is not
        # multiplied into the read count - confirm this is intended.
        # fasta[1] is presumably the sequence length - verify in read_fasta.
        nread = int(fasta[1]/INSERLEN)
        print("nread: ", nread)
    else:
        # Without this fallback nread stayed None, npr.normal returned a
        # scalar, and iterating over it raised TypeError.
        nread = int(NREAD)

    insert_lengths = [int(n) for n in npr.normal(INSERLEN, LENDEV, nread)]

    all_inserts = random_insert(fasta, insert_lengths, READLEN, MINLENGTH)
    fwd_inserts = all_inserts
    rev_inserts = [reverse_complement(i) for i in all_inserts]
    fwd_reads = complement_read(fwd_inserts, A1, READLEN)
    fwd_reads = add_error(fwd_reads, ERR)
    rev_reads = complement_read(rev_inserts, A2, READLEN)
    rev_reads = add_error(rev_reads, ERR)

    write_fastq(fwd_reads, basename, 1, READLEN, OUTFILE)
    write_fastq(rev_reads, basename, 2, READLEN, OUTFILE)

def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict):
def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1, A2, MINLENGTH, ERR, fastq_dict, QUALITY):
print("INFILE: ", INFILE)
if COV:
print("COV: ", COV)
Expand All @@ -166,6 +122,7 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
print("LENDEV: ", LENDEV)
print("A1: ", A1)
print("A2: ", A2)
print("QUALITY", QUALITY)
nread = None


Expand All @@ -189,7 +146,12 @@ def run_read_simulation_multi(INFILE, NREAD, COV, READLEN, INSERLEN, LENDEV, A1,
rev_reads = complement_read(rev_inserts, A2, READLEN)
rev_reads = add_error(rev_reads, ERR)

prepare_fastq(fastq_dict = fastq_dict, fwd_reads = fwd_reads, rev_reads = rev_reads, basename = basename, read_length = READLEN)
prepare_fastq(fastq_dict = fastq_dict,
fwd_reads = fwd_reads,
rev_reads = rev_reads,
basename = basename,
read_length = READLEN,
quality = QUALITY)
return(nread * INSERLEN)

def write_stat(stat_dict, stat_out):
Expand Down

0 comments on commit 090f5d4

Please sign in to comment.