From c1971333d9aeeed7efe898e166616fb572bacece Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Fri, 13 Oct 2023 19:25:57 +0200 Subject: [PATCH 01/14] refactor: refactor script + static code analysis --- scripts/oligomapOutputToSam_nhfiltered.py | 238 ---------- scripts/oligomap_output_to_sam_nh_filtered.py | 447 ++++++++++++++++++ 2 files changed, 447 insertions(+), 238 deletions(-) delete mode 100644 scripts/oligomapOutputToSam_nhfiltered.py create mode 100755 scripts/oligomap_output_to_sam_nh_filtered.py diff --git a/scripts/oligomapOutputToSam_nhfiltered.py b/scripts/oligomapOutputToSam_nhfiltered.py deleted file mode 100644 index 3810db5..0000000 --- a/scripts/oligomapOutputToSam_nhfiltered.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python - -# type: ignore -# pylint: skip-file - -################################################# -# Transforms oligomap output to SAM format and -# keeps only best alignments. -# -# input: -# - the output of Oligomap -# - nh filter value -# output: -# - stdout: a SAM file with the best -# alignments for each read *(e.g. all 3 -# alignments with 0 errors, or all alignments -# with 1 error for each read -# -# Paula Iborra. Zavolan Lab. -# Adated version of Alessandro Crippa script. -################################################# - - -import sys -import re -from argparse import ArgumentParser, RawTextHelpFormatter - - -### ARGUMENTS ### PI, modified June 2019. - -parser = ArgumentParser( - description="Oligomap output to SAM. NH filter applicable." - ) -parser.add_argument( - '-v','--version', - action='version', - version='%(prog)s 1.0', - help="Show program's version number and exit" - ) -parser.add_argument( - '-n','--nhfilter', - help="Add NH tag to output, remove reads that contain more aligments than given NH value (with min error).", - type=int, - ) -parser.add_argument( - '-i', '--input', - help="Input File. The output of oligomap mapping.", - required=True, - ) - -args = parser.parse_args() - - -readSeqs = {} # dictionary of reads -filt = {} # dict with all filtered reads (heavy multimappers NH > 100) by error. 
{'seqName':error} (error with which it has been discarted) -seqToMinError = {} -nh = {} # nh dictionary per read evaluated - - -def addReadToList(d,nh,minerr,seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString): - - if len(d) == 0: # if empty dictionary - if seqName not in list(minerr.keys()): # means: seqName NOT found previously - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # means: seqName found previously and filtered by NH ---- empty dicty due to d.clear() - if errors < minerr[seqName]: # Aligment found with lower error than the ones stored in minerr dict - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - else: # if dictionary not empty - if seqName == list(d.keys())[0]: #same seqName as the one stored in dict found - # check the errors fo this new seqName to include it or not - if errors == minerr[seqName]: # if seqName errors is equal to the one stored in minerr -> keep - nh[seqName] += 1 # increase nh +1 - if args.nhfilter: # if NH filter - if nh[seqName] <= (args.nhfilter): - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # if after adding +1 to nh, the total is > than the filter value. seqName discarded. - # Clear d dictonary - d.clear() - sys.stderr.write("Filtered by NH | Read %s | Errors = %s \n"%(seqName,errors)) - - else: # no NH filtering, keep all reads including heavy multimappers - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - elif errors < minerr[seqName]: # error minor to the one previously stored. - sys.stderr.write("Filtered by ERROR | Read %s | Errors = %s \n"%(seqName,minerr[seqName])) - minerr[seqName] = min(errors, minerr[seqName]) # Update minor error. - d[seqName] = [] # Create empy list for seqName (removed aligmants previously stored with higher error). - nh[seqName] = 1 # NH starts at 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # if seqName errors is > than the error stored, discard - pass - - elif seqName != list(d.keys())[0]: #new seqName found - # PRINT to the output file the previous seqName stored in dict - for i in d.keys(): - sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, minerr[i], nh[i])) - for al in d[i]: - nhtag= ('NH:i:'+str(nh[i])) - al.append(nhtag) - print('\t'.join([str(x) for x in al])) - - # Clean dictonaries before saving the current seqName being evaluated - d.clear() - nh.clear() - minerr.clear() - - # Restart the dictionaries with the new seqName - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - - -def readInput(fi): - i = 0 - #read oligomap results - while True: - line1 = fi.readline() #e.g. seq_61 (21 nc) 21..1 chr1 1..21 - if not line1: break #[check if file is empty] - line2 = fi.readline() #e.g. chr1 - if not line2: break #[pointless check] - line3 = fi.readline() #e.g. 
errors: 1 orientation: - - if not line3: break #[pointless check] - line4 = fi.readline() #e.g. CAAGCAGAAGACGGCATACGC ->input sequence - if not line4: break #[pointless check] - line5 = fi.readline() #e.g. |||||||||||||||||||| - if not line5: break #[pointless check] - line6 = fi.readline() #e.g. CAAGCAGAAGACGGCATACGA ->target sequence - if not line6: break #[pointless check] - line7 = fi.readline() #e.g. "" - i += 1 - #we got a result - if line1[0].isdigit(): #out readNames start with a number - #parse lines - line1 = re.sub( '\s+', ' ', line1).strip().split() - line3 = re.sub( '\s+', ' ', line3).strip().split() - - seqName = line1[0].strip() - target = line2.strip() - positionInTarget = line1[5].split('.')[0].strip() - #position2InTarget= line1[5].split('.')[2].strip() - errors = line3[1].strip() - strand = line3[3].strip() - sequence = line4.strip() - referenceSeq = line6.strip() - mapq='255' - rnext = '*' - pnext= '0' - tlen='0' - qual='*' - - #flag - if strand == "+": - flag = "0" - else: - flag = "16" - #define cigar string - if errors == '0': - #perfect match, cigar string is just the number of nucleotides + "M" - cigarStr = str(len(sequence)) + "M" - matchingString = str(len(sequence)) - editDistance = "NM:i:0" - else: - #cigar and mismatch strings are built here - if line5[0] == " ": #if the first nucleotide is mutated - cigarStr= "1M" - elif line5[-2] == " ": #if the last nucleotide is mutated - cigarStr= str(len(line5)-2) +"M" - else: - cigarStr= str(line5.strip().index(' ')) + "M" #if any other nucleotide is mutated - editDistance = "NM:i:1" - #depending on where in the read a deletion,insertion or mutation occurs, we create ad hoc mismatch strings. - #case 1: deletion in the seq - if '-' in sequence: - indelerr = "1D" - if line5[0] == " ": #the 1st nt is deleted - cigarStr = indelerr + str(len(sequence)-1) + "M" - matchingString = "^" + referenceSeq[0] + str(len(sequence)-1) - elif line5[-2] == " ": #the last nt is deleted ([-2] because [-1] is "\n") - cigarStr = str(len(sequence)-1) + "M" + indelerr - matchingString = str(len(sequence)-1) + "^" + referenceSeq[len(sequence)-1] + "0" #"0" is required if the last entry is ^[something] - else: #deletion occurs "in" the read - tmp = cigarStr - cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" - matchingString = str(line5.strip().index(' ')) + "^" + referenceSeq[int(tmp[:-1])] + str(len(sequence) - line5.strip().index(' ') -1) - #case 2: insertion in the seq - elif '-' in referenceSeq: - indelerr = "1I" - matchingString = str(len(sequence)) - if line5[0] == " ": #the 1st nt is inserted - cigarStr = indelerr + str(len(sequence)-1) + "M" - elif line5[-2] == " ": #the last nt is inserted - cigarStr = str(len(sequence)-1) + "M" + indelerr - else: #addition occurs "in" the read - tmp = cigarStr - cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" - #case 3: single point mutation - else: - cigarStr = str(len(sequence)) + "M" - if line5[0] == " ": #the 1st nt is inserted - matchingString = referenceSeq[0] + str(len(sequence)-1) - elif line5[-2] == " ": #the last nt is inserted - matchingString = str(len(sequence)-1) + referenceSeq[len(sequence)-1] - else: #mutation occurs "in" the read - matchingString = str(line5.strip().index(' ')) + referenceSeq[line5.strip().index(' ')] + str(len(sequence) - line5.strip().index(' ') -1) - #indelerr = referenceSeq[int(cigarStr)] #this info is not used in the cigar string - sequence = re.sub( '-', '', sequence).strip() - - matchingString = 
('MD:Z:'+matchingString) - #addReadToList(readSeqs, readFilt, readNh, seqName,flag,target,positionInTarget,errors,cigarStr,sequence,editDistance,matchingString) - sys.stderr.write("Record: %i | Sequence: %s \n"%(i,seqName)) - addReadToList(readSeqs, nh, seqToMinError, seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString) - - -fi = open(args.input, "r") #read file -sys.stderr.write("###########################\nSTART READING...\n###########################\n") -readInput(fi) #process -fi.close() - -#print last aligments stored in dict -if len(readSeqs) != 0: - for i in readSeqs.keys(): - sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, seqToMinError[i], nh[i])) - for al in readSeqs[i]: - nhtag= ('NH:i:'+str(nh[i])) - al.append(nhtag) - print('\t'.join([str(x) for x in al])) - -sys.stderr.write("SUCCESSFULLY FINISHED.") -sys.exit() diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py new file mode 100755 index 0000000..2aaf585 --- /dev/null +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -0,0 +1,447 @@ +#!/usr/bin/env python + +"""Transform oligomap output FASTA file to SAM keeping the best alignments. + +Read the input FASTA file, pick the best alignment(s) for each read (i.e. all +of the alignments have either 0 or 1 error). In addition, if the `--nh-filter` +CLI argument is set, filter the reads with more hits than the number provided. +The following sections will cover what the FASTA file must be like and how the +output can look like. + +EXPECTED INPUT +The FASTA file must be the output of mapping your library with the tool +oligomap: "https://github.com/zavolanlab/oligomap". The alignment +representation has to have 6 lines with the following content: +- The first line must contain the read's name, its length, the start and end +mapped positions in the read's sequence, the reference sequence name and, the +start and end mapped positions in the reference sequence in this order. +- The second line, must contain the reference sequence name. +- The third line, must contain the fields "errors:" and "orientation:" with +the alignment number of errors and reference sequence strand. +- The fourth and sixth lines must be the read and reference sequence +respectively. +- The fifth line must contain a vertical bar for each match between the +sequences base pairs and a space otherwise. + +Alignments must be separated with a blank line. +In addition, alignments have to be sorted by the read name and separated +with a blank line. An example of how an entry would look like in the EXAMPLES +section. + +OUTPUT FORMAT +The output is a SAM file named 'oligomap_REF_mappings.sam', where REF is +origin of the reference sequence(s) set in the CLI argument `--ref`. Regarding +the mandatory fields in the SAM file, the bitwise flag, `flag`, is set to 0 if +the reference sequence is found in the poisitive strand, and to 16 otherwise. +The mapping quality, `map_q`, is set to 255. Both, the position of the next +read, `p_next`. and the observed template length,`t_len`, are set to 0. +Both, the reference name of the next read, `r_next`, and the ASCII of +Phred-scaled base quality, `qual`, are set to '*'. In addition, the +optional fields, edit distance, MD string, and number of hits are also +included. Only the best alignments per read (i.e. either all the alignments +have 0 or 1 error) are written to the output file. 
Moreover, if the +`--nh-filter` CLI argument is given, reads with more htis than the provided +value are discarded. + +EXAMPLES + input format: + 82-1 (23 nc) 1..23 19 44377..44398 + 19 + errors: 1 orientation: + + CTACAAAGGGAAGCACTTGTCTC + |||||||||||||||||| |||| + CTACAAAGGGAAGCACTT-TCTC + + + 97-1 (22 nc) 1..22 19 5338..5359 + 19 + errors: 1 orientation: + + TCAAAACTGAGGTGCATTTTCT + |||||||||||| ||||||||| + TCAAAACTGAGGGGCATTTTCT + + output format: + 82-1 0 19 44377 255 18M1I4M * 0 0 CTACAAAGGGAAGCACTTGTCTC * NM:i:1 MD:Z:23 NH:i:1 + 97-1 0 19 5338 255 22M * 0 0 TCAAAACTGAGGTGCATTTTCT * NM:i:1 MD:Z:12G9 NH:i:1 + + +Paula Iborra. Zavolan Lab. +Adapted version of Alessandro Crippa script. +Refactored and documented by Iris Mestres. +""" # noqa: E501 +# pylint: enable=line-too-long + +from argparse import ArgumentParser, RawDescriptionHelpFormatter +from pathlib import Path +import re +import sys +from typing import Dict, NamedTuple + + +class Fields(NamedTuple): + """Class to store an alignment in its different SAM fields.""" + + read_name: str + flag: str + ref_seq_name: str + position_in_ref: str + map_q: str + cigar_str: str + r_next: str + p_next: str + t_len: str + sequence: str + qual: str + edit_dist: str + md_str: str + + +def parse_arguments(): + """Command-line arguments parser.""" + parser = ArgumentParser( + description=__doc__, + formatter_class=RawDescriptionHelpFormatter + ) + parser.add_argument( + '-v', '--version', + action="version", + version="%(prog)s 1.1.0", + help="Show program's version number and exit" + ) + parser.add_argument( + 'fasta_file', + help="Path to the FASTA file resulting from the oligomap mapping.", + type=Path, + ) + parser.add_argument( + '--outdir', + help="Path to the output directory. Default: %(default)s.", + default=Path.cwd(), + type=Path + ) + parser.add_argument( + '--ref', + help=( + "Molecular collection where the reference sequence(s) belong to. " + "Default: %(default)s." + ), + default="genome", + type=str, + ) + parser.add_argument( + '-n', '--nh-filter', + help=( + "Add NH tag to output and remove reads that contain more " + "aligments than the provided NH value (with min error)." + ), + type=int, + ) + + return parser + + +def get_cigar_md(errors: str, sequence: str, bars_line: str, + ref_seq: str) -> tuple[str, str]: + """Get the CIGAR and MD strings. + + Given the read and target sequences, the number of errors and the + alignment's bar representation, this function first checks if there is any + error in the mapping. If there is no errors, the CIGAR and MD strings are + the read's length followed by and "M" and the read's length respectively. + If there is an error, the function checks if the read has a deletion (the + read sequence contains a dash), an insertion (the reference sequence + contains a dash), or a single-point mutation. Finally, it checks the + error's position in the read sequence and builds the CIGAR and MD strings. 
+
+    Args:
+        errors:
+            number of mapping errors (either 0 or 1)
+        sequence:
+            read sequence
+        bars_line:
+            alignment's bar representation
+        ref_seq:
+            reference sequence
+
+    Returns:
+        cigarStr:
+            alignment's CIGAR string
+        matchingString:
+            alignment's MD string with its corresponding tag
+
+    Examples:
+        read sequence              -> GAAGGCGCTTCACCTTTGGAGT
+        alignment representation   -> ||||||||||||||||||||||
+        reference sequence         -> GAAGGCGCTTCACCTTTGGAGT
+        errors                     -> 0
+
+        CIGAR -> 22M
+        MD -> MD:Z:22
+
+        ------------------------------------------------------
+
+        read sequence              -> GAAGGCGCTTCACCTTTGGAGT
+        alignment representation   -> ||||||||||| ||||||||||
+        reference sequence         -> GAAGGCGCTTC-CCTTTGGAGT
+        errors                     -> 1
+
+        CIGAR -> 11M1I10M
+        MD -> MD:Z:22
+
+        ------------------------------------------------------
+
+        read sequence              -> GAAGGCGCTTC-CCTTTGGAGT
+        alignment representation   -> ||||||||||| ||||||||||
+        reference sequence         -> GAAGGCGCTTCCCCTTTGGAGT
+        errors                     -> 1
+
+        CIGAR -> 11M1D10M
+        MD -> MD:Z:11^C10
+
+        ------------------------------------------------------
+
+        read sequence              -> GAAGGCGCTTCACCTTTGGAGT
+        alignment representation   -> ||||||||||| ||||||||||
+        reference sequence         -> GAAGGCGCTTCCCCTTTGGAGT
+        errors                     -> 1
+
+        CIGAR -> 22M
+        MD -> MD:Z:11C10
+    """
+    seq_len = len(sequence)
+
+    if errors == '0':
+        return f"{seq_len}M", f"MD:Z:{seq_len}"
+
+    # CASE 1: deletion in the read
+    if '-' in sequence:
+        indelerr = "1D"
+
+        if bars_line[0] == ' ':
+            cigarStr = f"{indelerr}{seq_len - 1}M"
+            matchingString = f"MD:Z:^{ref_seq[0]}{seq_len - 1}"
+
+        elif bars_line[-1] == " ":
+            cigarStr = f"{seq_len - 1}M{indelerr}"
+            matchingString = f"MD:Z:{seq_len - 1}^{ref_seq[seq_len - 1]}0"
+
+        else:
+            idx = bars_line.index(' ')
+            cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx + 1}M"
+            matchingString = f"MD:Z:{idx}^{ref_seq[idx]}{seq_len - idx -1}"
+
+        return cigarStr, matchingString
+
+    # CASE 2: insertion in the read
+    if '-' in ref_seq:
+        indelerr = "1I"
+
+        if bars_line[0] == " ":
+            cigarStr = f"{indelerr}{seq_len - 1}M"
+
+        elif bars_line[-1] == " ":
+            cigarStr = f"{seq_len - 1}M{indelerr}"
+
+        else:
+            idx = bars_line.index(' ')
+            cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx + 1}M"
+
+        return cigarStr, f"MD:Z:{seq_len}"
+
+    # CASE 3: single point mutation
+    if bars_line[0] == " ":
+        matchingString = f"MD:Z:{ref_seq[0]}{seq_len - 1}"
+
+    elif bars_line[-1] == " ":
+        matchingString = f"MD:Z:{seq_len - 1}{ref_seq[seq_len - 1]}"
+
+    else:
+        idx = bars_line.index(' ')
+        matchingString = f"MD:Z:{idx}{ref_seq[idx]}{seq_len - idx - 1}"
+
+    return f"{seq_len}M", matchingString
+
+
+def get_sam_fields(aln: list[str]) -> Fields:
+    """Create the read's alignment in SAM format.
+
+    Given the set of lines oligomap provides as an alignment
+    representation, this function returns a Fields class object with the
+    mandatory fields in a SAM file and three optional ones.
+
+    The bitwise flag, `flag`, is set to 0 if the reference sequence is found
+    in the positive strand, and to 16 otherwise.
+    The mapping quality, `map_q`, is set to 255.
+    Both the position of the next read, `p_next`, and the observed template
+    length, `t_len`, are set to 0.
+    Both the reference name of the next read, `r_next`, and the ASCII of
+    Phred-scaled base quality, `qual`, are set to '*'.
+
+    The three optional fields are the edit distance, the MD string and the
+    number of hits.
+
+    Args:
+        aln:
+            list made up of the lines that form the output alignment after
+            mapping with oligomap.
+
+    Returns:
+        fields:
+            Fields class NamedTuple with the alignment's data as SAM fields.
+    """
+    seq_name_pos = aln[0].split()
+    errors = aln[2].split()[1]
+    seq = aln[3].strip()
+
+    cigar, md = get_cigar_md(errors, seq, aln[4][:-1], aln[5].strip())
+
+    fields = Fields(seq_name_pos[0],
+                    '0' if aln[2].split()[3] == '+' else "16",
+                    aln[1].strip(),
+                    seq_name_pos[5].split('.')[0],
+                    "255", cigar, '*', '0', '0',
+                    re.sub('-', '', seq), '*',
+                    "NM:i:0" if errors == '0' else "NM:i:1", md)
+
+    return fields
+
+
+def eval_aln(outfile: Path, nhfilter: int, d: Dict[str, list],
+             minErr_nh: Dict[str, list], fields: Fields) -> None:
+    """Evaluate an alignment to add, discard or write it to the output file.
+
+    Given a read's alignment, this function first checks if the dictionary
+    storing the read alignments is empty. If it is, the function checks if the
+    read has an entry in the dictionary storing the current minimum error and
+    the NH. If there is none, or the stored minimum error is higher than that
+    of the alignment under evaluation, the entry for that read in both
+    dictionaries is set to the data of the alignment under evaluation.
+
+    If the alignments dictionary is not empty, the function checks if the
+    read under evaluation is the first in the dictionary. If it is, and the
+    number of errors in the current alignment is the same as the minimum error
+    for that read, the number of hits is increased by 1. If the number of hits
+    does not exceed the maximum NH or the NH is not provided, the alignment
+    under evaluation is appended to the read's entry. If a maximum NH is set,
+    and the read exceeds it, it is removed. If the minimum number of errors is
+    higher than the number of errors of the alignment under evaluation, the
+    entry for that read in both dictionaries is set to the data of the
+    current alignment.
+
+    If the read is not the first in the dictionary, the function writes all
+    the alignments for the read in the first position to the output file,
+    and both dictionaries are reset to only include the data for the
+    alignments under evaluation.
+ + Args: + outfile: + Path to the ouput file + nhfilter: + maximum number of hits an read can have to be kept + d: + dictionary with read names as keys and a list with Fields class + NamedTuples as values + minErr_nh: + dictionary with read names as keys and a list with the current + minimum error and the number of hits in this order as values + fields: + read's alignment as a Fields class NamedTuple + """ + seq_name = fields.read_name + errors = fields.edit_dist[-1] + + if len(d) == 0: + if (seq_name not in list(minErr_nh.keys()) or + errors < minErr_nh[seq_name][0]): + + minErr_nh[seq_name] = [errors, 1] + d[seq_name] = [fields] + else: + if seq_name == list(d.keys())[0]: + if errors == minErr_nh[seq_name][0]: + minErr_nh[seq_name][1] += 1 + + if nhfilter: + + if minErr_nh[seq_name][1] <= nhfilter: + d[seq_name].append(fields) + + else: + d.clear() + sys.stderr.write(f"Filtered by NH | Read {seq_name}" + + f" | Errors = {errors}\n") + + else: + d[seq_name].append(fields) + + elif errors < minErr_nh[seq_name][0]: + sys.stderr.write(f"Filtered by ERROR | Read {seq_name}" + + f" | Errors = {minErr_nh[seq_name][0]}\n") + + minErr_nh[seq_name] = [min(errors, minErr_nh[seq_name][0]), 1] + d[seq_name] = [fields] + else: + with open(outfile, 'a', encoding="utf-8") as out_file: + for seq, aln in d.items(): + sys.stderr.write(f"Written read {seq} | " + + f"Errors = {minErr_nh[seq][0]} | " + + f"NH = {minErr_nh[seq][1]}\n") + for field in aln: + out_file.write('\t'.join(field) + + f"\tNH:i:{minErr_nh[seq][1]}" + '\n') + + d.clear() + minErr_nh.clear() + + d[seq_name] = [fields] + minErr_nh[seq_name] = [errors, 1] + + +def main(arguments) -> None: + """Convert oligomap alignments FASTA file to SAM file.""" + outdir = Path(arguments.outdir) + outdir.mkdir(parents=True, exist_ok=True) + outfile = outdir/f"oligomap_{arguments.ref}_mappings.sam" + + read_seqs: Dict[str, list] = {} + seq_minError_nh: Dict[str, list] = {} + + with open(arguments.fasta_file, 'r', encoding="utf-8") as in_file: + + sys.stderr.write("##############\nSTART READING...\n##############\n") + + lines = [in_file.readline() for _ in range(6)] + i = 1 + + while lines[0] != "": + + fields = get_sam_fields(lines) + + sys.stderr.write(f"Record:{i} | Sequence:{fields.read_name}\n") + + eval_aln(outfile, arguments.nh_filter, read_seqs, seq_minError_nh, + fields) + i += 1 + + in_file.readline() + lines = [in_file.readline().strip() for _ in range(6)] + + if len(read_seqs) > 0: + with open(outfile, 'a', encoding="utf-8") as out_file: + + for read_name, alignments in read_seqs.items(): + sys.stderr.write(f"Printed read {read_name} | Errors = " + + f"{seq_minError_nh[read_name][0]} | " + + f"NH = {seq_minError_nh[read_name][1]}\n") + + for aln in alignments: + out_file.write('\t'.join(aln) + + f"\tNH:i:{seq_minError_nh[read_name][1]}" + + '\n') + + sys.stderr.write("SUCCESSFULLY FINISHED.") + + +if __name__ == "__main__": + args = parse_arguments().parse_args() + main(args) From 8992b09c066c8c5f89923f8e31392c9a83b60e59 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Fri, 13 Oct 2023 21:56:42 +0200 Subject: [PATCH 02/14] refactor: add empty file creation --- scripts/oligomap_output_to_sam_nh_filtered.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py index 2aaf585..7ecc412 100755 --- a/scripts/oligomap_output_to_sam_nh_filtered.py +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -41,7 +41,8 @@ included. 
Only the best alignments per read (i.e. either all the alignments have 0 or 1 error) are written to the output file. Moreover, if the `--nh-filter` CLI argument is given, reads with more htis than the provided -value are discarded. +value are discarded. If none of the alignments meet the criteria, an empty +file is created. EXAMPLES input format: @@ -403,6 +404,9 @@ def main(arguments) -> None: outdir.mkdir(parents=True, exist_ok=True) outfile = outdir/f"oligomap_{arguments.ref}_mappings.sam" + with open(outfile, 'a', encoding="utf-8") as out_file: + out_file.write('') + read_seqs: Dict[str, list] = {} seq_minError_nh: Dict[str, list] = {} @@ -438,7 +442,6 @@ def main(arguments) -> None: out_file.write('\t'.join(aln) + f"\tNH:i:{seq_minError_nh[read_name][1]}" + '\n') - sys.stderr.write("SUCCESSFULLY FINISHED.") From c1305427c826df54d25f733dae5a39223a5c53b8 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Fri, 13 Oct 2023 21:59:35 +0200 Subject: [PATCH 03/14] refactr: refactor rules to turn oligomap FASTA to SAM --- workflow/rules/map.smk | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index 3983473..29bd20e 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -487,14 +487,11 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - report=os.path.join( - config["output_dir"], "{sample}", "oligomap_genome_report.txt" - ), sort=os.path.join( config["output_dir"], "{sample}", "oligomap_genome_sorted.fasta" ), script=os.path.join( - config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py" + config["scripts_dir"], "oligomap_output_to_sam_nh_filtered.py" ), output: gmap=os.path.join( @@ -505,6 +502,8 @@ rule convert_genome_to_sam_oligomap: config["cluster_log"], "oligomap_genome_to_sam_{sample}.log" ), nh=config["nh"], + out_dir=lambda wildcards, output: Path(output[0]).parent, + ref="genome", log: os.path.join( config["local_log"], "oligomap_genome_to_sam_{sample}.log" @@ -518,9 +517,11 @@ rule convert_genome_to_sam_oligomap: os.path.join(workflow.basedir, "envs", "python.yaml") shell: "(python {input.script} \ - -i {input.sort} \ + {input.sort} \ -n {params.nh} \ - > {output.gmap}) &> {log}" + --ref {params.ref} \ + --outdir {params.out_dir} \ + ) &> {log}" ############################################################################### @@ -628,18 +629,13 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - report=os.path.join( - config["output_dir"], - "{sample}", - "oligomap_transcriptome_report.txt", - ), sort=os.path.join( config["output_dir"], "{sample}", "oligomap_transcriptome_sorted.fasta", ), script=os.path.join( - config["scripts_dir"], "oligomapOutputToSam_nhfiltered.py" + config["scripts_dir"], "oligomap_output_to_sam_nh_filtered.py" ), output: tmap=os.path.join( @@ -653,6 +649,8 @@ rule convert_transcriptome_to_sam_oligomap: "oligomap_transcriptome_to_sam_{sample}.log", ), nh=config["nh"], + out_dir=lambda wildcards, output: Path(output[0]).parent, + ref="transcriptome", log: os.path.join( config["local_log"], "oligomap_transcriptome_to_sam_{sample}.log" @@ -663,12 +661,14 @@ rule convert_transcriptome_to_sam_oligomap: os.path.join(workflow.basedir, "envs", "python.yaml") shell: "(python {input.script} \ - -i {input.sort} \ + {input.sort} \ -n {params.nh} \ - > {output.tmap} \ + --ref {params.ref} \ + --outdir {params.out_dir} \ ) &> {log}" + 
############################################################################### ### Merge genome mappings ############################################################################### From f04325cd2cea7a8588f78d4cf913f0693d7830a3 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 18 Oct 2023 16:51:35 +0200 Subject: [PATCH 04/14] build: update gffutils version --- workflow/envs/gffutils.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/gffutils.yaml b/workflow/envs/gffutils.yaml index 2454b97..b375f37 100644 --- a/workflow/envs/gffutils.yaml +++ b/workflow/envs/gffutils.yaml @@ -3,5 +3,5 @@ channels: - conda-forge - bioconda dependencies: - - gffutils=0.11.1 -... \ No newline at end of file + - gffutils>=0.11.1 +... From 6b188e30478a1c3b09abb7c01a61901e036dc571 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Thu, 19 Oct 2023 17:46:27 +0200 Subject: [PATCH 05/14] refactor: fix input reading --- scripts/oligomap_output_to_sam_nh_filtered.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py index 7ecc412..70f5907 100755 --- a/scripts/oligomap_output_to_sam_nh_filtered.py +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -228,7 +228,7 @@ def get_cigar_md(errors: str, sequence: str, bars_line: str, else: idx = bars_line.index(' ') - cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx + 1}M" + cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx}M" matchingString = f"MD:Z:{idx}^{ref_seq[idx]}{seq_len - idx -1}" return cigarStr, matchingString @@ -245,7 +245,7 @@ def get_cigar_md(errors: str, sequence: str, bars_line: str, else: idx = bars_line.index(' ') - cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx + 1}M" + cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx}M" return cigarStr, f"MD:Z:{seq_len}" @@ -293,7 +293,7 @@ def get_sam_fields(aln: list[str]) -> Fields: seq_name_pos = aln[0].split() errors = aln[2].split()[1] seq = aln[3].strip() - + cigar, md = get_cigar_md(errors, seq, aln[4][:-1], aln[5].strip()) fields = Fields(seq_name_pos[0], @@ -428,7 +428,7 @@ def main(arguments) -> None: i += 1 in_file.readline() - lines = [in_file.readline().strip() for _ in range(6)] + lines = [in_file.readline() for _ in range(6)] if len(read_seqs) > 0: with open(outfile, 'a', encoding="utf-8") as out_file: From ae20014db675ba376a095d12e5aa9199e9ffb30f Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 30 Oct 2023 22:07:59 +0100 Subject: [PATCH 06/14] test: add unit testing --- ...test_oligomap_output_to_sam_nh_filtered.py | 479 ++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 scripts/tests/test_oligomap_output_to_sam_nh_filtered.py diff --git a/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py new file mode 100644 index 0000000..88deef9 --- /dev/null +++ b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py @@ -0,0 +1,479 @@ +"""Unit tests for module 'oligomap_output_to_sam_nh_filtered.py'.""" + +import argparse +from pathlib import Path +import sys +from typing import NamedTuple + +import pytest + +sys.path.append("../../") + + +from scripts.oligomap_output_to_sam_nh_filtered import( + eval_aln, + Fields, + get_cigar_md, + get_sam_fields, + main, + parse_arguments +) + + +@pytest.fixture +def empty_file(): + """Import path to empty file.""" + empty_file = Path("files/empty_file") + + return empty_file + + +@pytest.fixture +def 
genome_nh_2(): + """Import path to test files with maximum NH set to 2.""" + oligo_in = Path("files/in_oligomap_output.fa") + oligo_out = Path("files/oligomap_genome_2_nh.sam") + + return oligo_in, oligo_out + + +@pytest.fixture +def transcriptome_no_nh(): + """Import path to test files with reference set to transcriptome.""" + oligo_in = Path("files/in_oligomap_output.fa") + oligo_out = Path("files/oligomap_transcriptome_no_nh.sam") + + return oligo_in, oligo_out + +@pytest.fixture +def single_read(): + """Import path to test file with a single read.""" + oligo_out = Path("files/oligomap_single_read.sam") + + return oligo_out + +@pytest.fixture +def aln_fields(): + """Create sample alignment as a Fields class NamedTuple.""" + # Perfect alignment + field_1 = Fields("read_1", "0", "19", "44377", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:0", "MD:Z:19") + + # Alignment with a mismatch in the first position + field_2 = Fields("read_1", '0', "19", "53471", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:G18") + + # Alignment with a mismatch in the last position + field_3 = Fields("read_1", "0", "19", "44278", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:18C") + + # Alignment with a mismatch in the middle of the read sequence + field_4 = Fields("read_1", "0", "19", "50971", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:14C4") + + # Alignment with an insertion at read's first position + field_5 = Fields("read_2", "16", "19", "7627", "255", "1I22M", '*', '0', '0', + "AAAGCACCTCCAGAGCTTGAAGC", '*', "NM:i:1", "MD:Z:23") + + # Alignment with an insertion in the middle of the read sequence + field_6 = Fields("read_2", "16", "19", "7886", "255", "9M1I12M", '*', '0', '0', + "AAAGCACCTCCAGAGCTTGAAGC", '*', "NM:i:1", "MD:Z:23") + + return [field_1, field_2, field_3, field_4, field_5, field_6] + + +@pytest.fixture +def alns(): + """Create sample alignment lists to extract the CIGAR and MD strings. + + The list contain the number of errors, the read's sequence, the alignment + representation in bars and the reference sequence as strings in that + order. 
+ """ + # Perfect alignment + aln1 = [] + aln1.append("0") + aln1.append("CTACAAAGGGAAGCACTTT") + aln1.append("|||||||||||||||||||") + aln1.append("CTACAAAGGGAAGCACTTT") + + # Alignment with a mismatch in the first position + aln2 = [] + aln2.append("1") + aln2.append("CTACAAAGGGAAGCACTTT") + aln2.append(" ||||||||||||||||||") + aln2.append("GTACAAAGGGAAGCACTTT") + + # Alignment with a mismatch in the last position + aln3 = [] + aln3.append("1") + aln3.append("CTACAAAGGGAAGCACTTT") + aln3.append("|||||||||||||||||| ") + aln3.append("CTACAAAGGGAAGCACTTC") + + # Alignment with a mismatch in the middle of the sequence + aln4 = [] + aln4.append("1") + aln4.append("CTACAAAGGGAAGCACTTT") + aln4.append("|||||||||||||| ||||") + aln4.append("CTACAAAGGGAAGCCCTTT") + + # Alignment with an insertion at read's first position + aln5 = [] + aln5.append("1") + aln5.append("AAAGCACCTCCAGAGCTTGAAGC") + aln5.append(" ||||||||||||||||||||||") + aln5.append("-AAGCACCTCCAGAGCTTGAAGC") + + # Alignment with an insertion at read's last position + aln6 = [] + aln6.append("1") + aln6.append("AAAGCACCTCCAGAGCTTGAAGC") + aln6.append("|||||||||||||||||||||| ") + aln6.append("AAAGCACCTCCAGAGCTTGAAG-") + + # Alignment with an insertion in the middle of the read + aln7 = [] + aln7.append("1") + aln7.append("AAAGCACCTCCAGAGCTTGAAGC") + aln7.append("||||||||| |||||||||||||") + aln7.append("AAAGCACCT-CAGAGCTTGAAGC") + + # Alignment with a deletion at read's first position + aln8 = [] + aln8.append("1") + aln8.append("-GAAGGCGCTTCACCTTTGGAGT") + aln8.append(" ||||||||||||||||||||||") + aln8.append("TGAAGGCGCTTCACCTTTGGAGT") + + # Alignment with a deletion at read's last position + aln9 = [] + aln9.append("1") + aln9.append("GAAGGCGCTTCACCTTTGGAGT-") + aln9.append("|||||||||||||||||||||| ") + aln9.append("GAAGGCGCTTCACCTTTGGAGTA") + + # Alignment with a deletion in the middle of the read + aln10 = [] + aln10.append("1") + aln10.append("GAAGGCGCTTC-CCTTTGGAGT") + aln10.append("||||||||||| ||||||||||") + aln10.append("GAAGGCGCTTCACCTTTGGAGT") + + return [aln1, aln2, aln3, aln4, aln5, aln6, aln7, aln8, aln9, aln10] + + +class TestParseArguments: + """Test 'parse_arguments()' function.""" + + def test_no_files(self, monkeypatch): + """Call without input nor output files.""" + with pytest.raises(SystemExit) as sysex: + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered'] + ) + parse_arguments().parse_args() + assert sysex.value.code == 2 + + def test_in_files(self, monkeypatch, empty_file, tmp_path): + """Call with in and output files.""" + empty_in = empty_file + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(empty_in), + '--outdir', str(tmp_path), + ] + ) + + args = parse_arguments().parse_args() + assert isinstance(args, argparse.Namespace) + + def test_all_arguments(self, monkeypatch, genome_nh_2, tmp_path): + """Call with all the arguments.""" + fa_in, sam_out = genome_nh_2 + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(fa_in), + '--outdir', str(tmp_path), + '--ref', "genome", + '-n', '100', + ] + ) + + args = parse_arguments().parse_args() + assert isinstance(args, argparse.Namespace) + + +class TestGetCigarMd: + """Test 'get_cigar_md()' function.""" + + def test_perfect_aln(self, alns): + """Test perfect alignment.""" + result = ("19M", "MD:Z:19") + + assert get_cigar_md(alns[0][0], alns[0][1], + alns[0][2], alns[0][3]) == result + + def test_mm_first_pos_aln(self, alns): + """Test mismatch at read's first position.""" + 
result = ("19M", "MD:Z:G18") + + assert get_cigar_md(alns[1][0], alns[1][1], + alns[1][2], alns[1][3]) == result + + def test_mm_last_pos_aln(self, alns): + """Test mismatch at read's last position.""" + result = ("19M", "MD:Z:18C") + + assert get_cigar_md(alns[2][0], alns[2][1], + alns[2][2], alns[2][3]) == result + + def test_mm_middle_aln(self, alns): + """Test mismatch in the middle of the read.""" + result = ("19M", "MD:Z:14C4") + + assert get_cigar_md(alns[3][0], alns[3][1], + alns[3][2], alns[3][3]) == result + + def test_in_first_pos_aln(self, alns): + """Test insertion at read's first position.""" + result = ("1I22M", "MD:Z:23") + + assert get_cigar_md(alns[4][0], alns[4][1], + alns[4][2], alns[4][3]) == result + + def test_in_last_pos_aln(self, alns): + """Test insertion at read's last position.""" + result = ("22M1I", "MD:Z:23") + + assert get_cigar_md(alns[5][0], alns[5][1], + alns[5][2], alns[5][3]) == result + + def test_in_middle_aln(self, alns): + """Test insertion in the middle of the read.""" + result = ("9M1I13M", "MD:Z:23") + + assert get_cigar_md(alns[6][0], alns[6][1], + alns[6][2], alns[6][3]) == result + + def test_del_first_pos_aln(self, alns): + """Test deletion at read's first position.""" + result = ("1D22M", "MD:Z:^T22") + + assert get_cigar_md(alns[7][0], alns[7][1], + alns[7][2], alns[7][3]) == result + + def test_del_last_pos_aln(self, alns): + """Test deletion at read's last position.""" + result = ("22M1D", "MD:Z:22^A0") + + assert get_cigar_md(alns[8][0], alns[8][1], + alns[8][2], alns[8][3]) == result + + def test_del_middle_aln(self, alns): + """Test deletion in the middle of the read.""" + result = ("11M1D10M", "MD:Z:11^A10") + + assert get_cigar_md(alns[9][0], alns[9][1], + alns[9][2], alns[9][3]) == result + + +class TestGetSAMFields(): + """Test 'get_sam_fields()' function.""" + + def test_pos_strand_no_err(self,alns, aln_fields): + """Test perfect alignment in the positive strand.""" + line1 = "read_1 (19 nc) 1...19 19 44377...44395" + line2 = "19" + line3 = "errors: 0 orientation: +" + + assert get_sam_fields([line1, line2, line3, alns[0][1], + alns[0][2], alns[0][3]]) == aln_fields[0] + + def test_neg_strand_one_err(self,alns, aln_fields): + """Test alignment with an insertion in the negative strand.""" + line1 = "read_2 (23 nc) 1...23 19 7886...7908" + line2 = "19" + line3 = "errors: 1 orientation: -" + + assert get_sam_fields([line1, line2, line3, alns[6][1], + alns[6][2], alns[6][3]]) == aln_fields[5] + + +class TestEvalAln: + """Test ''eval_aln()' function.""" + + def test_eval_empty_dict_new_read(self, tmp_path, aln_fields): + """Test evaluation with a new read and an empty dictionary.""" + output = tmp_path/"oligomap_genome_mappings.sam" + + d = dict() + minerr_nh = {"read_0" : ['0', 1]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(output, nhfilter, d, minerr_nh, aln) + + assert list(d.keys())[0] == aln.read_name + assert minerr_nh[aln.read_name] == ['0', 1] + + def test_eval_empty_dict_smaller_error(self, tmp_path, aln_fields): + """Test evaluation with a smaller error and an empty dictionary.""" + output = tmp_path/"oligomap_genome_mappings.sam" + + d = dict() + minerr_nh = {"read_1" : ['1', 1]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(output, nhfilter, d, minerr_nh, aln) + + assert list(d.keys())[0] == aln.read_name + assert minerr_nh[aln.read_name] == ['0', 1] + + def test_increase_nh_no_filter(self, tmp_path, aln_fields): + """Test evaluation when increasing NH without a maximum value.""" + output = 
tmp_path/"oligomap_genome_mappings.sam" + + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[3] + nhfilter = None + + eval_aln(output, nhfilter, d, minerr_nh, aln) + + assert len(d[aln.read_name]) == 3 + assert minerr_nh[aln.read_name] == ['1', 3] + + def test_exceed_nh_filter_2(self, capsys, tmp_path, aln_fields): + """Test evaluation when exceeding the maximum NH set to 2.""" + output = tmp_path/"oligomap_genome_mappings.sam" + + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[3] + nhfilter = 2 + + eval_aln(output, nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert len(d) == 0 + assert minerr_nh[aln.read_name] == ['1', 3] + assert captured.err == "Filtered by NH | Read read_1 | Errors = 1\n" + + def test_no_exceed_nh_filter_2(self, tmp_path, aln_fields): + """Test evaluation when increasing NH with maximum value of 2.""" + output = tmp_path/"oligomap_genome_mappings.sam" + + d = {"read_1": [aln_fields[1]]} + minerr_nh = {"read_1" : ['1', 1]} + aln = aln_fields[2] + nhfilter = 2 + + eval_aln(output, nhfilter, d, minerr_nh, aln) + + assert len(d[aln.read_name]) == 2 + assert minerr_nh[aln.read_name] == ['1', 2] + + def test_smaller_min_error(self, capsys, tmp_path, aln_fields): + """Test evaluation when having a smaller minimumm error.""" + output = tmp_path/"oligomap_genome_mappings.sam" + + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(output, nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert len(d[aln.read_name]) == 1 + assert minerr_nh[aln.read_name] == ['0', 1] + assert captured.err == "Filtered by ERROR | Read read_1 | Errors = 1\n" + + def test_different_read(self, capsys, tmp_path, aln_fields, single_read): + """Test evaluation when having to write due to a different read.""" + output = tmp_path/"oligomap_genome_mappings.sam" + out_file = single_read + + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[4] + nhfilter = None + + eval_aln(output, nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert list(d.keys())[0] == aln.read_name + assert len(minerr_nh) == 1 + assert captured.err == "Written read read_1 | Errors = 1 | NH = 2\n" + + with open(out_file, 'r') as expected, open(output, 'r') as output: + assert output.read() == expected.read() + + +class TestMain: + """Test 'main()' function.""" + + def test_main_empty_file(self, monkeypatch, tmp_path, empty_file): + """Test main function with an empty FASTA file.""" + empty_in = empty_file + output = tmp_path/"oligomap_genome_mappings.sam" + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(empty_in), + '--outdir', str(tmp_path), + ] + ) + args = parse_arguments().parse_args() + main(args) + + with open(empty_in, 'r') as expected, open(output, 'r') as out_file: + assert out_file.read() == expected.read() + + def test_main_max_nh_2(self, monkeypatch, tmp_path, genome_nh_2): + """Test main function with NH set to 2.""" + in_file, out_file = genome_nh_2 + output = tmp_path/"oligomap_genome_mappings.sam" + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(in_file), + '-n', '2', + '--outdir', str(tmp_path), + ] + ) + args = parse_arguments().parse_args() + main(args) + + with open(out_file, 'r') as expected, open(out_file, 'r') as output: + assert output.read() 
== expected.read() + + def test_main_no_nh_transcriptome(self, monkeypatch, tmp_path, + transcriptome_no_nh): + """Test main function with no NH set for transcriptome mappings.""" + in_file, out_file = transcriptome_no_nh + output = tmp_path/"oligomap_transcriptome_mappings.sam" + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(in_file), + '--ref', "transcriptome", + '--outdir', str(tmp_path), + ] + ) + args = parse_arguments().parse_args() + main(args) + + with open(out_file, 'r') as expected, open(out_file, 'r') as output: + assert output.read() == expected.read() + From b3847af20d773831443e5f8b2a8ca47dc8048542 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 30 Oct 2023 22:08:17 +0100 Subject: [PATCH 07/14] test: add files for unit testing --- scripts/tests/files/in_oligomap_output.fa | 69 +++++++++++++++++++ scripts/tests/files/oligomap_genome_2_nh.sam | 1 + scripts/tests/files/oligomap_single_read.sam | 2 + .../files/oligomap_transcriptome_no_nh.sam | 7 ++ 4 files changed, 79 insertions(+) create mode 100644 scripts/tests/files/in_oligomap_output.fa create mode 100644 scripts/tests/files/oligomap_genome_2_nh.sam create mode 100644 scripts/tests/files/oligomap_single_read.sam create mode 100644 scripts/tests/files/oligomap_transcriptome_no_nh.sam diff --git a/scripts/tests/files/in_oligomap_output.fa b/scripts/tests/files/in_oligomap_output.fa new file mode 100644 index 0000000..69e6420 --- /dev/null +++ b/scripts/tests/files/in_oligomap_output.fa @@ -0,0 +1,69 @@ +read_1 (19 nc) 1..19 19 44278..44297 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT + |||||||||||||||||| +GTACAAAGGGAAGCCCTTT + +read_1 (19 nc) 1..19 19 44377..44395 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT +|||||||||||||| |||| +CTACAAAGGGAAGCCCTTT + +read_1 (19 nc) 1..19 19 50971..50989 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT +|||||||||||||||||| +CTACAAAGGGAAGCACTTC + +read_1 (19 nc) 1..19 19 53471..53489 +19 +errors: 0 orientation: + +CTACAAAGGGAAGCACTTT +||||||||||||||||||| +CTACAAAGGGAAGCACTTT + +read_2 (23 nc) 1..23 19 7627..7648 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC + |||||||||||||||||||||| +-AAGCACCTCCAGAGCTTGAAGC + +read_2 (23 nc) 1..23 19 7886..7908 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC +||||||||| ||||||||||||| +AAAGCACCT-CAGAGCTTGAAGC + +read_2 (23 nc) 1..23 19 9524..7545 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC +|||||||||||||||||||||| +AAAGCACCTCCAGAGCTTGAAG- + +read_3 (22 nc) 1..22 19 44414..44434 +19 +errors: 1 orientation: + +-GAAGGCGCTTCACCTTTGGAGT + |||||||||||||||||||||| +TGAAGGCGCTTCACCTTTGGAGT + +read_3 (22 nc) 1..22 19 54316..54336 +19 +errors: 1 orientation: + +GAAGGCGCTTC-CCTTTGGAGT +||||||||||| |||||||||| +GAAGGCGCTTCGCCTTTGGAGT + +read_3 (22 nc) 1..22 19 55718..55738 +19 +errors: 1 orientation: + +GAAGGCGCTTCACCTTTGGAGT- +|||||||||||||||||||||| +GAAGGCGCTTCACCTTTGGAGTA diff --git a/scripts/tests/files/oligomap_genome_2_nh.sam b/scripts/tests/files/oligomap_genome_2_nh.sam new file mode 100644 index 0000000..9816087 --- /dev/null +++ b/scripts/tests/files/oligomap_genome_2_nh.sam @@ -0,0 +1 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:0 MD:Z:19 NH:i:1 diff --git a/scripts/tests/files/oligomap_single_read.sam b/scripts/tests/files/oligomap_single_read.sam new file mode 100644 index 0000000..8bf2e12 --- /dev/null +++ b/scripts/tests/files/oligomap_single_read.sam @@ -0,0 +1,2 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:1 MD:Z:G18 NH:i:2 +read_1 0 19 44278 
255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:1 MD:Z:18C NH:i:2 diff --git a/scripts/tests/files/oligomap_transcriptome_no_nh.sam b/scripts/tests/files/oligomap_transcriptome_no_nh.sam new file mode 100644 index 0000000..7fd084b --- /dev/null +++ b/scripts/tests/files/oligomap_transcriptome_no_nh.sam @@ -0,0 +1,7 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:0 MD:Z:19 NH:i:1 +read_2 16 19 7627 255 1I22M * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_2 16 19 7886 255 9M1I13M * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_2 16 19 9524 255 22M1I * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_3 0 19 44414 255 1D22M * 0 0 GAAGGCGCTTCACCTTTGGAGT * NM:i:1 MD:Z:^T22 NH:i:3 +read_3 0 19 54316 255 11M1D10M * 0 0 GAAGGCGCTTCCCTTTGGAGT * NM:i:1 MD:Z:11^G10 NH:i:3 +read_3 0 19 55718 255 22M1D * 0 0 GAAGGCGCTTCACCTTTGGAGT * NM:i:1 MD:Z:22^A0 NH:i:3 From b0c82b2beec7ccba9c45a063e7e724af7339e1b2 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 30 Oct 2023 22:10:03 +0100 Subject: [PATCH 08/14] refacor: finish script refactoring --- scripts/oligomap_output_to_sam_nh_filtered.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py index 70f5907..2d5e927 100755 --- a/scripts/oligomap_output_to_sam_nh_filtered.py +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -293,7 +293,7 @@ def get_sam_fields(aln: list[str]) -> Fields: seq_name_pos = aln[0].split() errors = aln[2].split()[1] seq = aln[3].strip() - + cigar, md = get_cigar_md(errors, seq, aln[4][:-1], aln[5].strip()) fields = Fields(seq_name_pos[0], @@ -446,5 +446,5 @@ def main(arguments) -> None: if __name__ == "__main__": - args = parse_arguments().parse_args() - main(args) + args = parse_arguments().parse_args() # pragma: no cover + main(args) # pragma: no cover From e3a61f2b19d2bd1b8da0b8a5e091ec298566ca8f Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 30 Oct 2023 22:11:11 +0100 Subject: [PATCH 09/14] ci: add oligomap script to the static code analysis --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4c23eb5..6713d4b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: - name: flake8 working-directory: ./scripts - run: flake8 ./*.py --exclude oligomapOutputToSam_nhfiltered.py + run: flake8 ./*.py - name: mypy working-directory: ./scripts From 9093f16a8e9d99cba55c6d203786557ec60a1ac8 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Mon, 30 Oct 2023 22:19:41 +0100 Subject: [PATCH 10/14] docs: update link to oligomap output to sam script --- pipeline_documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 22a991c..a3f50ba 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -1130,7 +1130,7 @@ alignments in a genomic region of interest. 
[custom-script-mir-ext]: scripts/mirna_extension.py [custom-script-mir-quant]: scripts/mirna_quantification.py [custom-script-nh-filter]: scripts/nh_filter.py -[custom-script-oligo-sam]: scripts/oligomapOutputToSam_nhfiltered.py +[custom-script-oligo-sam]: scripts/oligomap_output_to_sam_nh_filtered.py [custom-script-pri-quant]: scripts/primir_quantification.py [custom-script-remove-dup]: scripts/sam_remove_duplicates_inferior_alignments_multimappers.pl [custom-script-sam-trx]: scripts/sam_trx_to_sam_gen.pl From 851b70930add61281cb69f61576455d82ccc9333 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 12 Nov 2023 19:23:25 +0100 Subject: [PATCH 11/14] test: change unit test to check STDOUT --- ..._output.fa => in_oligomap_output.oligomap} | 0 ...test_oligomap_output_to_sam_nh_filtered.py | 85 +++++++------------ 2 files changed, 33 insertions(+), 52 deletions(-) rename scripts/tests/files/{in_oligomap_output.fa => in_oligomap_output.oligomap} (100%) diff --git a/scripts/tests/files/in_oligomap_output.fa b/scripts/tests/files/in_oligomap_output.oligomap similarity index 100% rename from scripts/tests/files/in_oligomap_output.fa rename to scripts/tests/files/in_oligomap_output.oligomap diff --git a/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py index 88deef9..7a143f0 100644 --- a/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py +++ b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py @@ -31,7 +31,7 @@ def empty_file(): @pytest.fixture def genome_nh_2(): """Import path to test files with maximum NH set to 2.""" - oligo_in = Path("files/in_oligomap_output.fa") + oligo_in = Path("files/in_oligomap_output.oligomap") oligo_out = Path("files/oligomap_genome_2_nh.sam") return oligo_in, oligo_out @@ -40,7 +40,7 @@ def genome_nh_2(): @pytest.fixture def transcriptome_no_nh(): """Import path to test files with reference set to transcriptome.""" - oligo_in = Path("files/in_oligomap_output.fa") + oligo_in = Path("files/in_oligomap_output.oligomap") oligo_out = Path("files/oligomap_transcriptome_no_nh.sam") return oligo_in, oligo_out @@ -176,22 +176,21 @@ def test_no_files(self, monkeypatch): parse_arguments().parse_args() assert sysex.value.code == 2 - def test_in_files(self, monkeypatch, empty_file, tmp_path): - """Call with in and output files.""" + def test_in_files(self, monkeypatch, empty_file): + """Call with in file.""" empty_in = empty_file monkeypatch.setattr( sys, 'argv', ['oligomap_output_to_sam_nh_filtered', str(empty_in), - '--outdir', str(tmp_path), ] ) args = parse_arguments().parse_args() assert isinstance(args, argparse.Namespace) - def test_all_arguments(self, monkeypatch, genome_nh_2, tmp_path): + def test_all_arguments(self, monkeypatch, genome_nh_2): """Call with all the arguments.""" fa_in, sam_out = genome_nh_2 @@ -199,8 +198,6 @@ def test_all_arguments(self, monkeypatch, genome_nh_2, tmp_path): sys, 'argv', ['oligomap_output_to_sam_nh_filtered', str(fa_in), - '--outdir', str(tmp_path), - '--ref', "genome", '-n', '100', ] ) @@ -308,88 +305,76 @@ def test_neg_strand_one_err(self,alns, aln_fields): class TestEvalAln: """Test ''eval_aln()' function.""" - def test_eval_empty_dict_new_read(self, tmp_path, aln_fields): + def test_eval_empty_dict_new_read(self, aln_fields): """Test evaluation with a new read and an empty dictionary.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = dict() minerr_nh = {"read_0" : ['0', 1]} aln = aln_fields[0] nhfilter = None - eval_aln(output, nhfilter, d, 
minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) assert list(d.keys())[0] == aln.read_name assert minerr_nh[aln.read_name] == ['0', 1] - def test_eval_empty_dict_smaller_error(self, tmp_path, aln_fields): + def test_eval_empty_dict_smaller_error(self, aln_fields): """Test evaluation with a smaller error and an empty dictionary.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = dict() minerr_nh = {"read_1" : ['1', 1]} aln = aln_fields[0] nhfilter = None - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) assert list(d.keys())[0] == aln.read_name assert minerr_nh[aln.read_name] == ['0', 1] - def test_increase_nh_no_filter(self, tmp_path, aln_fields): + def test_increase_nh_no_filter(self, aln_fields): """Test evaluation when increasing NH without a maximum value.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = {"read_1": [aln_fields[1], aln_fields[2]]} minerr_nh = {"read_1" : ['1', 2]} aln = aln_fields[3] nhfilter = None - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) assert len(d[aln.read_name]) == 3 assert minerr_nh[aln.read_name] == ['1', 3] - def test_exceed_nh_filter_2(self, capsys, tmp_path, aln_fields): + def test_exceed_nh_filter_2(self, capsys, aln_fields): """Test evaluation when exceeding the maximum NH set to 2.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = {"read_1": [aln_fields[1], aln_fields[2]]} minerr_nh = {"read_1" : ['1', 2]} aln = aln_fields[3] nhfilter = 2 - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) captured = capsys.readouterr() assert len(d) == 0 assert minerr_nh[aln.read_name] == ['1', 3] assert captured.err == "Filtered by NH | Read read_1 | Errors = 1\n" - def test_no_exceed_nh_filter_2(self, tmp_path, aln_fields): + def test_no_exceed_nh_filter_2(self, aln_fields): """Test evaluation when increasing NH with maximum value of 2.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = {"read_1": [aln_fields[1]]} minerr_nh = {"read_1" : ['1', 1]} aln = aln_fields[2] nhfilter = 2 - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) assert len(d[aln.read_name]) == 2 assert minerr_nh[aln.read_name] == ['1', 2] - def test_smaller_min_error(self, capsys, tmp_path, aln_fields): + def test_smaller_min_error(self, capsys, aln_fields): """Test evaluation when having a smaller minimumm error.""" - output = tmp_path/"oligomap_genome_mappings.sam" - d = {"read_1": [aln_fields[1], aln_fields[2]]} minerr_nh = {"read_1" : ['1', 2]} aln = aln_fields[0] nhfilter = None - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) captured = capsys.readouterr() assert len(d[aln.read_name]) == 1 @@ -406,74 +391,70 @@ def test_different_read(self, capsys, tmp_path, aln_fields, single_read): aln = aln_fields[4] nhfilter = None - eval_aln(output, nhfilter, d, minerr_nh, aln) + eval_aln(nhfilter, d, minerr_nh, aln) captured = capsys.readouterr() assert list(d.keys())[0] == aln.read_name assert len(minerr_nh) == 1 assert captured.err == "Written read read_1 | Errors = 1 | NH = 2\n" - with open(out_file, 'r') as expected, open(output, 'r') as output: - assert output.read() == expected.read() + with open(out_file, 'r') as expected: + assert captured.out == expected.read() class TestMain: """Test 'main()' function.""" - def test_main_empty_file(self, monkeypatch, tmp_path, empty_file): - """Test main function with an empty FASTA file.""" + def test_main_empty_file(self, 
monkeypatch, capsys, empty_file): + """Test main function with an empty file.""" empty_in = empty_file - output = tmp_path/"oligomap_genome_mappings.sam" monkeypatch.setattr( sys, 'argv', ['oligomap_output_to_sam_nh_filtered', str(empty_in), - '--outdir', str(tmp_path), ] ) args = parse_arguments().parse_args() main(args) + captured = capsys.readouterr() - with open(empty_in, 'r') as expected, open(output, 'r') as out_file: - assert out_file.read() == expected.read() + with open(empty_in, 'r') as expected: + assert captured.out == expected.read() - def test_main_max_nh_2(self, monkeypatch, tmp_path, genome_nh_2): + def test_main_max_nh_2(self, monkeypatch, capsys, genome_nh_2): """Test main function with NH set to 2.""" in_file, out_file = genome_nh_2 - output = tmp_path/"oligomap_genome_mappings.sam" monkeypatch.setattr( sys, 'argv', ['oligomap_output_to_sam_nh_filtered', str(in_file), '-n', '2', - '--outdir', str(tmp_path), ] ) args = parse_arguments().parse_args() main(args) + captured = capsys.readouterr() - with open(out_file, 'r') as expected, open(out_file, 'r') as output: - assert output.read() == expected.read() + with open(out_file, 'r') as expected: + assert captured.out == expected.read() - def test_main_no_nh_transcriptome(self, monkeypatch, tmp_path, + def test_main_no_nh_transcriptome(self, monkeypatch, capsys, transcriptome_no_nh): """Test main function with no NH set for transcriptome mappings.""" in_file, out_file = transcriptome_no_nh - output = tmp_path/"oligomap_transcriptome_mappings.sam" monkeypatch.setattr( sys, 'argv', ['oligomap_output_to_sam_nh_filtered', str(in_file), - '--ref', "transcriptome", - '--outdir', str(tmp_path), ] ) args = parse_arguments().parse_args() main(args) + captured = capsys.readouterr() - with open(out_file, 'r') as expected, open(out_file, 'r') as output: - assert output.read() == expected.read() + with open(out_file, 'r') as expected: + assert captured.out == expected.read() From 33f966ad4c37c15f5df09661cec0ae93188350eb Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 12 Nov 2023 19:51:41 +0100 Subject: [PATCH 12/14] restore rules calls --- workflow/rules/map.smk | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index b4c9dc0..d12f146 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -405,16 +405,13 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", - script=SCRIPTS_DIR / "oligomapOutputToSam_nhfiltered.py", + script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_genome_to_sam_{sample}.log", nh=config["nh"], - out_dir=lambda wildcards, output: Path(output[0]).parent, - ref="genome", log: LOCAL_LOG / "oligomap_genome_to_sam_{sample}.log", resources: @@ -428,9 +425,7 @@ rule convert_genome_to_sam_oligomap: "(python {input.script} \ {input.sort} \ -n {params.nh} \ - --ref {params.ref} \ - --outdir {params.out_dir} \ - ) &> {log}" + > {output.gmap}) &> {log}" ############################################################################### @@ -502,16 +497,13 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", - 
script=SCRIPTS_DIR / "oligomapOutputToSam_nhfiltered.py", + script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: cluster_log=CLUSTER_LOG / "oligomap_transcriptome_to_sam_{sample}.log", nh=config["nh"], - out_dir=lambda wildcards, output: Path(output[0]).parent, - ref="transcriptome", log: LOCAL_LOG / "oligomap_transcriptome_to_sam_{sample}.log", container: @@ -522,9 +514,7 @@ rule convert_transcriptome_to_sam_oligomap: "(python {input.script} \ {input.sort} \ -n {params.nh} \ - --ref {params.ref} \ - --outdir {params.out_dir} \ - ) &> {log}" + > {output.tmap}) &> {log}" From 4c1292d71bcc301d97b613b132475637da1a4087 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Sun, 12 Nov 2023 19:52:03 +0100 Subject: [PATCH 13/14] refactor: use STDOUT instead of an output file --- scripts/oligomap_output_to_sam_nh_filtered.py | 161 ++++++++---------- 1 file changed, 70 insertions(+), 91 deletions(-) diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py index 2d5e927..cfb8d96 100755 --- a/scripts/oligomap_output_to_sam_nh_filtered.py +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -2,47 +2,44 @@ """Transform oligomap output FASTA file to SAM keeping the best alignments. -Read the input FASTA file, pick the best alignment(s) for each read (i.e. all +Read the input file, pick the best alignment(s) for each read (i.e. all of the alignments have either 0 or 1 error). In addition, if the `--nh-filter` CLI argument is set, filter the reads with more hits than the number provided. -The following sections will cover what the FASTA file must be like and how the +The following sections will cover what the input file must be like and how the output can look like. EXPECTED INPUT The FASTA file must be the output of mapping your library with the tool -oligomap: "https://github.com/zavolanlab/oligomap". The alignment -representation has to have 6 lines with the following content: -- The first line must contain the read's name, its length, the start and end -mapped positions in the read's sequence, the reference sequence name and, the -start and end mapped positions in the reference sequence in this order. -- The second line, must contain the reference sequence name. -- The third line, must contain the fields "errors:" and "orientation:" with -the alignment number of errors and reference sequence strand. -- The fourth and sixth lines must be the read and reference sequence -respectively. -- The fifth line must contain a vertical bar for each match between the -sequences base pairs and a space otherwise. - -Alignments must be separated with a blank line. -In addition, alignments have to be sorted by the read name and separated -with a blank line. An example of how an entry would look like in the EXAMPLES -section. +oligomap: "https://github.com/zavolanlab/oligomap". Refer to the "Output +format" seccion in its main README.md for more information. +In addition, alignments have to be sorted by the read name. An example of how +an entry would look like in the EXAMPLES section. OUTPUT FORMAT -The output is a SAM file named 'oligomap_REF_mappings.sam', where REF is -origin of the reference sequence(s) set in the CLI argument `--ref`. Regarding -the mandatory fields in the SAM file, the bitwise flag, `flag`, is set to 0 if -the reference sequence is found in the poisitive strand, and to 16 otherwise. -The mapping quality, `map_q`, is set to 255. 
-read, `p_next`. and the observed template length,`t_len`, are set to 0.
-Both, the reference name of the next read, `r_next`, and the ASCII of
-Phred-scaled base quality, `qual`, are set to '*'. In addition, the
-optional fields, edit distance, MD string, and number of hits are also
-included. Only the best alignments per read (i.e. either all the alignments
-have 0 or 1 error) are written to the output file. Moreover, if the
+The output consists of the filtered set of alignments from the input file in
+SAM format. Only the best alignments per read (i.e. either all the alignments
+have 0 or 1 error) are written to the standard output. Moreover, if the
 `--nh-filter` CLI argument is given, reads with more hits than the provided
-value are discarded. If none of the alignments meet the criteria, an empty
-file is created.
+value are discarded. If none of the alignments meet the criteria, nothing is
+returned. The fields in the SAM entry are:
+
+field | value
+--------------
+QNAME | Read's name
+FLAG  | Set to 0 if the reference sequence is found in the positive strand,
+      | and to 16 otherwise.
+RNAME | Reference sequence name
+POS   | Alignment's first position in the reference sequence
+MAPQ  | Value set to 255 (mapping quality not available)
+CIGAR | Alignment's CIGAR string
+RNEXT | Value set to * (unavailable information)
+PNEXT | Value set to 0 (unavailable information)
+TLEN  | Value set to 0 (unavailable information)
+SEQ   | Mapped sequence
+QUAL  | Value set to * (Phred-scaled base quality not stored)
+NM:i: | Alignment's edit distance (Optional field)
+MD:Z: | Alignment's MD string (Optional field)
+NH:i: | Alignment's NH (Optional field)
 
 EXAMPLES
 
 input format:
@@ -110,25 +107,10 @@ def parse_arguments():
         help="Show program's version number and exit"
     )
     parser.add_argument(
-        'fasta_file',
+        'infile',
         help="Path to the FASTA file resulting from the oligomap mapping.",
         type=Path,
     )
-    parser.add_argument(
-        '--outdir',
-        help="Path to the output directory. Default: %(default)s.",
-        default=Path.cwd(),
-        type=Path
-    )
-    parser.add_argument(
-        '--ref',
-        help=(
-            "Molecular collection where the reference sequence(s) belong to. "
-            "Default: %(default)s."
-        ),
-        default="genome",
-        type=str,
-    )
     parser.add_argument(
         '-n', '--nh-filter',
         help=(
@@ -270,16 +252,23 @@ def get_sam_fields(aln: list[str]) -> Fields:
     representation, this function returns a Fields class object with the
     mandatory fields in a SAM file and three optional ones.
 
-    The bitwise flag, `flag`, is set to 0 if the reference sequence is found
-    in the poisitive strand, and to 16 otherwise.
-    The mapping quality, `map_q`, is set to 255.
-    Both, the position of the next read, `p_next`. and the observed template
-    length,`t_len`, are set to 0.
-    Both, the reference name of the next read, `r_next`, and the ASCII of
-    Phred-scaled base quality, `qual`, are set to '*'.
-
-    The three optional fields are the edit distance, the MD string and the
-    number of hits.
+    field | value
+    --------------
+    QNAME | Read's name
+    FLAG  | Set to 0 if the reference sequence is found in the positive
+          | strand, and to 16 otherwise.
+ RNAME | Refernce sequence name + POS | Alignment's first position in the reference sequence + MAPQ | Value set to 255 (mapping quality not available) + CIGAR | Alignment's CIGAR string + RNEXT | Value set to * (unavailable information) + PNEXT | Value set to 0 (unavailable information) + TLEN | Value set to 0 (unavailable information) + SEQ | Mapped sequence + QUAL | Value set to * (Phred-scaled base quality not stored) + NM:i: | Alignment's edit distance (Optional field) + MD:Z: | Alignment's MD string (Optional field) + NH:i: | Alignment's NH (Optional field) Args: aln: @@ -307,9 +296,9 @@ def get_sam_fields(aln: list[str]) -> Fields: return fields -def eval_aln(outfile: Path, nhfilter: int, d: Dict[str, list], - minErr_nh: Dict[str, list], fields: Fields) -> None: - """Evaluate an alignment to add, discard or write it in the output file. +def eval_aln(nhfilter: int, d: Dict[str, list], minErr_nh: Dict[str, list], + fields: Fields) -> None: + """Evaluate an alignment to add, discard or write it to the STDOUT. Given a read's alignment, this function first checks if the dictionary storing the read alignments is empty. If it is, the function checks if the @@ -330,13 +319,11 @@ def eval_aln(outfile: Path, nhfilter: int, d: Dict[str, list], current alignment. If the read is not the first in the dictionary, the function writes all - the alignments for the read in the first position in the output file, + the alignments for the read in the first position to the standard output, and both dictionaries are reset to only include the data for the alignments under evaluation. Args: - outfile: - Path to the ouput file nhfilter: maximum number of hits an read can have to be kept d: @@ -382,14 +369,13 @@ def eval_aln(outfile: Path, nhfilter: int, d: Dict[str, list], minErr_nh[seq_name] = [min(errors, minErr_nh[seq_name][0]), 1] d[seq_name] = [fields] else: - with open(outfile, 'a', encoding="utf-8") as out_file: - for seq, aln in d.items(): - sys.stderr.write(f"Written read {seq} | " + - f"Errors = {minErr_nh[seq][0]} | " + - f"NH = {minErr_nh[seq][1]}\n") - for field in aln: - out_file.write('\t'.join(field) + - f"\tNH:i:{minErr_nh[seq][1]}" + '\n') + for seq, aln in d.items(): + sys.stderr.write(f"Written read {seq} | " + + f"Errors = {minErr_nh[seq][0]} | " + + f"NH = {minErr_nh[seq][1]}\n") + for field in aln: + sys.stdout.write('\t'.join(field) + + f"\tNH:i:{minErr_nh[seq][1]}" + '\n') d.clear() minErr_nh.clear() @@ -399,18 +385,11 @@ def eval_aln(outfile: Path, nhfilter: int, d: Dict[str, list], def main(arguments) -> None: - """Convert oligomap alignments FASTA file to SAM file.""" - outdir = Path(arguments.outdir) - outdir.mkdir(parents=True, exist_ok=True) - outfile = outdir/f"oligomap_{arguments.ref}_mappings.sam" - - with open(outfile, 'a', encoding="utf-8") as out_file: - out_file.write('') - + """Convert the alignments in the oligomap output file to its SAM format.""" read_seqs: Dict[str, list] = {} seq_minError_nh: Dict[str, list] = {} - with open(arguments.fasta_file, 'r', encoding="utf-8") as in_file: + with open(arguments.infile, 'r', encoding="utf-8") as in_file: sys.stderr.write("##############\nSTART READING...\n##############\n") @@ -423,7 +402,7 @@ def main(arguments) -> None: sys.stderr.write(f"Record:{i} | Sequence:{fields.read_name}\n") - eval_aln(outfile, arguments.nh_filter, read_seqs, seq_minError_nh, + eval_aln(arguments.nh_filter, read_seqs, seq_minError_nh, fields) i += 1 @@ -431,17 +410,17 @@ def main(arguments) -> None: lines = [in_file.readline() for _ in range(6)] if 
len(read_seqs) > 0: - with open(outfile, 'a', encoding="utf-8") as out_file: - for read_name, alignments in read_seqs.items(): - sys.stderr.write(f"Printed read {read_name} | Errors = " + - f"{seq_minError_nh[read_name][0]} | " + - f"NH = {seq_minError_nh[read_name][1]}\n") + for read_name, alignments in read_seqs.items(): + sys.stderr.write(f"Printed read {read_name} | Errors = " + + f"{seq_minError_nh[read_name][0]} | " + + f"NH = {seq_minError_nh[read_name][1]}\n") + + for aln in alignments: + sys.stdout.write('\t'.join(aln) + + f"\tNH:i:{seq_minError_nh[read_name][1]}" + + '\n') - for aln in alignments: - out_file.write('\t'.join(aln) + - f"\tNH:i:{seq_minError_nh[read_name][1]}" + - '\n') sys.stderr.write("SUCCESSFULLY FINISHED.") From 79e8ad393c599588d02884df8edc64ccf9a838e0 Mon Sep 17 00:00:00 2001 From: deliaBlue Date: Wed, 15 Nov 2023 13:47:47 +0100 Subject: [PATCH 14/14] refactor: modify variable name and main docstring --- scripts/oligomap_output_to_sam_nh_filtered.py | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py index cfb8d96..c0df9ca 100755 --- a/scripts/oligomap_output_to_sam_nh_filtered.py +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -296,7 +296,7 @@ def get_sam_fields(aln: list[str]) -> Fields: return fields -def eval_aln(nhfilter: int, d: Dict[str, list], minErr_nh: Dict[str, list], +def eval_aln(nhfilter: int, d: Dict[str, list], min_err_nh: Dict[str, list], fields: Fields) -> None: """Evaluate an alignment to add, discard or write it to the STDOUT. @@ -329,7 +329,7 @@ def eval_aln(nhfilter: int, d: Dict[str, list], minErr_nh: Dict[str, list], d: dictionary with read names as keys and a list with Fields class NamedTuples as values - minErr_nh: + min_err_nh: dictionary with read names as keys and a list with the current minimum error and the number of hits in this order as values fields: @@ -339,19 +339,19 @@ def eval_aln(nhfilter: int, d: Dict[str, list], minErr_nh: Dict[str, list], errors = fields.edit_dist[-1] if len(d) == 0: - if (seq_name not in list(minErr_nh.keys()) or - errors < minErr_nh[seq_name][0]): + if (seq_name not in list(min_err_nh.keys()) or + errors < min_err_nh[seq_name][0]): - minErr_nh[seq_name] = [errors, 1] + min_err_nh[seq_name] = [errors, 1] d[seq_name] = [fields] else: if seq_name == list(d.keys())[0]: - if errors == minErr_nh[seq_name][0]: - minErr_nh[seq_name][1] += 1 + if errors == min_err_nh[seq_name][0]: + min_err_nh[seq_name][1] += 1 if nhfilter: - if minErr_nh[seq_name][1] <= nhfilter: + if min_err_nh[seq_name][1] <= nhfilter: d[seq_name].append(fields) else: @@ -362,32 +362,33 @@ def eval_aln(nhfilter: int, d: Dict[str, list], minErr_nh: Dict[str, list], else: d[seq_name].append(fields) - elif errors < minErr_nh[seq_name][0]: + elif errors < min_err_nh[seq_name][0]: sys.stderr.write(f"Filtered by ERROR | Read {seq_name}" + - f" | Errors = {minErr_nh[seq_name][0]}\n") + f" | Errors = {min_err_nh[seq_name][0]}\n") - minErr_nh[seq_name] = [min(errors, minErr_nh[seq_name][0]), 1] + min_err_nh[seq_name] = [min(errors, min_err_nh[seq_name][0]), + 1] d[seq_name] = [fields] else: for seq, aln in d.items(): sys.stderr.write(f"Written read {seq} | " + - f"Errors = {minErr_nh[seq][0]} | " + - f"NH = {minErr_nh[seq][1]}\n") + f"Errors = {min_err_nh[seq][0]} | " + + f"NH = {min_err_nh[seq][1]}\n") for field in aln: sys.stdout.write('\t'.join(field) + - f"\tNH:i:{minErr_nh[seq][1]}" + '\n') + 
f"\tNH:i:{min_err_nh[seq][1]}" + '\n') d.clear() - minErr_nh.clear() + min_err_nh.clear() d[seq_name] = [fields] - minErr_nh[seq_name] = [errors, 1] + min_err_nh[seq_name] = [errors, 1] def main(arguments) -> None: - """Convert the alignments in the oligomap output file to its SAM format.""" + """Convert the alignments in the oligomap output file to SAM format.""" read_seqs: Dict[str, list] = {} - seq_minError_nh: Dict[str, list] = {} + seq_min_error_nh: Dict[str, list] = {} with open(arguments.infile, 'r', encoding="utf-8") as in_file: @@ -402,7 +403,7 @@ def main(arguments) -> None: sys.stderr.write(f"Record:{i} | Sequence:{fields.read_name}\n") - eval_aln(arguments.nh_filter, read_seqs, seq_minError_nh, + eval_aln(arguments.nh_filter, read_seqs, seq_min_error_nh, fields) i += 1 @@ -413,12 +414,12 @@ def main(arguments) -> None: for read_name, alignments in read_seqs.items(): sys.stderr.write(f"Printed read {read_name} | Errors = " + - f"{seq_minError_nh[read_name][0]} | " + - f"NH = {seq_minError_nh[read_name][1]}\n") + f"{seq_min_error_nh[read_name][0]} | " + + f"NH = {seq_min_error_nh[read_name][1]}\n") for aln in alignments: sys.stdout.write('\t'.join(aln) + - f"\tNH:i:{seq_minError_nh[read_name][1]}" + + f"\tNH:i:{seq_min_error_nh[read_name][1]}" + '\n') sys.stderr.write("SUCCESSFULLY FINISHED.")