diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4c23eb53..6713d4bf 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -38,7 +38,7 @@ jobs: - name: flake8 working-directory: ./scripts - run: flake8 ./*.py --exclude oligomapOutputToSam_nhfiltered.py + run: flake8 ./*.py - name: mypy working-directory: ./scripts diff --git a/pipeline_documentation.md b/pipeline_documentation.md index 22a991c1..a3f50bac 100644 --- a/pipeline_documentation.md +++ b/pipeline_documentation.md @@ -1130,7 +1130,7 @@ alignments in a genomic region of interest. [custom-script-mir-ext]: scripts/mirna_extension.py [custom-script-mir-quant]: scripts/mirna_quantification.py [custom-script-nh-filter]: scripts/nh_filter.py -[custom-script-oligo-sam]: scripts/oligomapOutputToSam_nhfiltered.py +[custom-script-oligo-sam]: scripts/oligomap_output_to_sam_nh_filtered.py [custom-script-pri-quant]: scripts/primir_quantification.py [custom-script-remove-dup]: scripts/sam_remove_duplicates_inferior_alignments_multimappers.pl [custom-script-sam-trx]: scripts/sam_trx_to_sam_gen.pl diff --git a/scripts/oligomapOutputToSam_nhfiltered.py b/scripts/oligomapOutputToSam_nhfiltered.py deleted file mode 100644 index 3810db5c..00000000 --- a/scripts/oligomapOutputToSam_nhfiltered.py +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python - -# type: ignore -# pylint: skip-file - -################################################# -# Transforms oligomap output to SAM format and -# keeps only best alignments. -# -# input: -# - the output of Oligomap -# - nh filter value -# output: -# - stdout: a SAM file with the best -# alignments for each read *(e.g. all 3 -# alignments with 0 errors, or all alignments -# with 1 error for each read -# -# Paula Iborra. Zavolan Lab. -# Adated version of Alessandro Crippa script. -################################################# - - -import sys -import re -from argparse import ArgumentParser, RawTextHelpFormatter - - -### ARGUMENTS ### PI, modified June 2019. - -parser = ArgumentParser( - description="Oligomap output to SAM. NH filter applicable." - ) -parser.add_argument( - '-v','--version', - action='version', - version='%(prog)s 1.0', - help="Show program's version number and exit" - ) -parser.add_argument( - '-n','--nhfilter', - help="Add NH tag to output, remove reads that contain more aligments than given NH value (with min error).", - type=int, - ) -parser.add_argument( - '-i', '--input', - help="Input File. The output of oligomap mapping.", - required=True, - ) - -args = parser.parse_args() - - -readSeqs = {} # dictionary of reads -filt = {} # dict with all filtered reads (heavy multimappers NH > 100) by error. {'seqName':error} (error with which it has been discarted) -seqToMinError = {} -nh = {} # nh dictionary per read evaluated - - -def addReadToList(d,nh,minerr,seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString): - - if len(d) == 0: # if empty dictionary - if seqName not in list(minerr.keys()): # means: seqName NOT found previously - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # means: seqName found previously and filtered by NH ---- empty dicty due to d.clear() - if errors < minerr[seqName]: # Aligment found with lower error than the ones stored in minerr dict - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - else: # if dictionary not empty - if seqName == list(d.keys())[0]: #same seqName as the one stored in dict found - # check the errors fo this new seqName to include it or not - if errors == minerr[seqName]: # if seqName errors is equal to the one stored in minerr -> keep - nh[seqName] += 1 # increase nh +1 - if args.nhfilter: # if NH filter - if nh[seqName] <= (args.nhfilter): - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # if after adding +1 to nh, the total is > than the filter value. seqName discarded. - # Clear d dictonary - d.clear() - sys.stderr.write("Filtered by NH | Read %s | Errors = %s \n"%(seqName,errors)) - - else: # no NH filtering, keep all reads including heavy multimappers - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - elif errors < minerr[seqName]: # error minor to the one previously stored. - sys.stderr.write("Filtered by ERROR | Read %s | Errors = %s \n"%(seqName,minerr[seqName])) - minerr[seqName] = min(errors, minerr[seqName]) # Update minor error. - d[seqName] = [] # Create empy list for seqName (removed aligmants previously stored with higher error). - nh[seqName] = 1 # NH starts at 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - else: # if seqName errors is > than the error stored, discard - pass - - elif seqName != list(d.keys())[0]: #new seqName found - # PRINT to the output file the previous seqName stored in dict - for i in d.keys(): - sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, minerr[i], nh[i])) - for al in d[i]: - nhtag= ('NH:i:'+str(nh[i])) - al.append(nhtag) - print('\t'.join([str(x) for x in al])) - - # Clean dictonaries before saving the current seqName being evaluated - d.clear() - nh.clear() - minerr.clear() - - # Restart the dictionaries with the new seqName - d[seqName] = [] - minerr[seqName] = errors - nh[seqName] = 1 - d[seqName].append([seqName,flag,target,positionInTarget,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString]) - - - -def readInput(fi): - i = 0 - #read oligomap results - while True: - line1 = fi.readline() #e.g. seq_61 (21 nc) 21..1 chr1 1..21 - if not line1: break #[check if file is empty] - line2 = fi.readline() #e.g. chr1 - if not line2: break #[pointless check] - line3 = fi.readline() #e.g. errors: 1 orientation: - - if not line3: break #[pointless check] - line4 = fi.readline() #e.g. CAAGCAGAAGACGGCATACGC ->input sequence - if not line4: break #[pointless check] - line5 = fi.readline() #e.g. |||||||||||||||||||| - if not line5: break #[pointless check] - line6 = fi.readline() #e.g. CAAGCAGAAGACGGCATACGA ->target sequence - if not line6: break #[pointless check] - line7 = fi.readline() #e.g. "" - i += 1 - #we got a result - if line1[0].isdigit(): #out readNames start with a number - #parse lines - line1 = re.sub( '\s+', ' ', line1).strip().split() - line3 = re.sub( '\s+', ' ', line3).strip().split() - - seqName = line1[0].strip() - target = line2.strip() - positionInTarget = line1[5].split('.')[0].strip() - #position2InTarget= line1[5].split('.')[2].strip() - errors = line3[1].strip() - strand = line3[3].strip() - sequence = line4.strip() - referenceSeq = line6.strip() - mapq='255' - rnext = '*' - pnext= '0' - tlen='0' - qual='*' - - #flag - if strand == "+": - flag = "0" - else: - flag = "16" - #define cigar string - if errors == '0': - #perfect match, cigar string is just the number of nucleotides + "M" - cigarStr = str(len(sequence)) + "M" - matchingString = str(len(sequence)) - editDistance = "NM:i:0" - else: - #cigar and mismatch strings are built here - if line5[0] == " ": #if the first nucleotide is mutated - cigarStr= "1M" - elif line5[-2] == " ": #if the last nucleotide is mutated - cigarStr= str(len(line5)-2) +"M" - else: - cigarStr= str(line5.strip().index(' ')) + "M" #if any other nucleotide is mutated - editDistance = "NM:i:1" - #depending on where in the read a deletion,insertion or mutation occurs, we create ad hoc mismatch strings. - #case 1: deletion in the seq - if '-' in sequence: - indelerr = "1D" - if line5[0] == " ": #the 1st nt is deleted - cigarStr = indelerr + str(len(sequence)-1) + "M" - matchingString = "^" + referenceSeq[0] + str(len(sequence)-1) - elif line5[-2] == " ": #the last nt is deleted ([-2] because [-1] is "\n") - cigarStr = str(len(sequence)-1) + "M" + indelerr - matchingString = str(len(sequence)-1) + "^" + referenceSeq[len(sequence)-1] + "0" #"0" is required if the last entry is ^[something] - else: #deletion occurs "in" the read - tmp = cigarStr - cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" - matchingString = str(line5.strip().index(' ')) + "^" + referenceSeq[int(tmp[:-1])] + str(len(sequence) - line5.strip().index(' ') -1) - #case 2: insertion in the seq - elif '-' in referenceSeq: - indelerr = "1I" - matchingString = str(len(sequence)) - if line5[0] == " ": #the 1st nt is inserted - cigarStr = indelerr + str(len(sequence)-1) + "M" - elif line5[-2] == " ": #the last nt is inserted - cigarStr = str(len(sequence)-1) + "M" + indelerr - else: #addition occurs "in" the read - tmp = cigarStr - cigarStr = cigarStr + indelerr + str(line5.strip().count('|')-int(tmp[:-1])) + "M" - #case 3: single point mutation - else: - cigarStr = str(len(sequence)) + "M" - if line5[0] == " ": #the 1st nt is inserted - matchingString = referenceSeq[0] + str(len(sequence)-1) - elif line5[-2] == " ": #the last nt is inserted - matchingString = str(len(sequence)-1) + referenceSeq[len(sequence)-1] - else: #mutation occurs "in" the read - matchingString = str(line5.strip().index(' ')) + referenceSeq[line5.strip().index(' ')] + str(len(sequence) - line5.strip().index(' ') -1) - #indelerr = referenceSeq[int(cigarStr)] #this info is not used in the cigar string - sequence = re.sub( '-', '', sequence).strip() - - matchingString = ('MD:Z:'+matchingString) - #addReadToList(readSeqs, readFilt, readNh, seqName,flag,target,positionInTarget,errors,cigarStr,sequence,editDistance,matchingString) - sys.stderr.write("Record: %i | Sequence: %s \n"%(i,seqName)) - addReadToList(readSeqs, nh, seqToMinError, seqName,flag,target,positionInTarget,errors,mapq,cigarStr,rnext,pnext,tlen,sequence,qual,editDistance,matchingString) - - -fi = open(args.input, "r") #read file -sys.stderr.write("###########################\nSTART READING...\n###########################\n") -readInput(fi) #process -fi.close() - -#print last aligments stored in dict -if len(readSeqs) != 0: - for i in readSeqs.keys(): - sys.stderr.write("Printed read %s | Errors = %s | NH = %s \n"%(i, seqToMinError[i], nh[i])) - for al in readSeqs[i]: - nhtag= ('NH:i:'+str(nh[i])) - al.append(nhtag) - print('\t'.join([str(x) for x in al])) - -sys.stderr.write("SUCCESSFULLY FINISHED.") -sys.exit() diff --git a/scripts/oligomap_output_to_sam_nh_filtered.py b/scripts/oligomap_output_to_sam_nh_filtered.py new file mode 100755 index 00000000..c0df9cab --- /dev/null +++ b/scripts/oligomap_output_to_sam_nh_filtered.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python + +"""Transform oligomap output FASTA file to SAM keeping the best alignments. + +Read the input file, pick the best alignment(s) for each read (i.e. all +of the alignments have either 0 or 1 error). In addition, if the `--nh-filter` +CLI argument is set, filter the reads with more hits than the number provided. +The following sections will cover what the input file must be like and how the +output can look like. + +EXPECTED INPUT +The FASTA file must be the output of mapping your library with the tool +oligomap: "https://github.com/zavolanlab/oligomap". Refer to the "Output +format" seccion in its main README.md for more information. +In addition, alignments have to be sorted by the read name. An example of how +an entry would look like in the EXAMPLES section. + +OUTPUT FORMAT +The output consist on the filtered set of alignments from the input file in +SAM format. Only the best alignments per read (i.e. either all the alignments +have 0 or 1 error) are written to the standard output. Moreover, if the +`--nh-filter` CLI argument is given, reads with more htis than the provided +value are discarded. If none of the alignments meet the criteria, nothing is +returned. The fields in the SAM entry are: + +field | value +-------------- +QNAME | Read's name +FLAG | Set to 0 if the reference sequence is found in the poisitive strand, + | and to 16 otherwise. +RNAME | Refernce sequence name +POS | Alignment's first position in the reference sequence +MAPQ | Value set to 255 (mapping quality not available) +CIGAR | Alignment's CIGAR string +RNEXT | Value set to * (unavailable information) +PNEXT | Value set to 0 (unavailable information) +TLEN | Value set to 0 (unavailable information) +SEQ | Mapped sequence +QUAL | Value set to * (Phred-scaled base quality not stored) +NM:i: | Alignment's edit distance (Optional field) +MD:Z: | Alignment's MD string (Optional field) +NH:i: | Alignment's NH (Optional field) + +EXAMPLES + input format: + 82-1 (23 nc) 1..23 19 44377..44398 + 19 + errors: 1 orientation: + + CTACAAAGGGAAGCACTTGTCTC + |||||||||||||||||| |||| + CTACAAAGGGAAGCACTT-TCTC + + + 97-1 (22 nc) 1..22 19 5338..5359 + 19 + errors: 1 orientation: + + TCAAAACTGAGGTGCATTTTCT + |||||||||||| ||||||||| + TCAAAACTGAGGGGCATTTTCT + + output format: + 82-1 0 19 44377 255 18M1I4M * 0 0 CTACAAAGGGAAGCACTTGTCTC * NM:i:1 MD:Z:23 NH:i:1 + 97-1 0 19 5338 255 22M * 0 0 TCAAAACTGAGGTGCATTTTCT * NM:i:1 MD:Z:12G9 NH:i:1 + + +Paula Iborra. Zavolan Lab. +Adapted version of Alessandro Crippa script. +Refactored and documented by Iris Mestres. +""" # noqa: E501 +# pylint: enable=line-too-long + +from argparse import ArgumentParser, RawDescriptionHelpFormatter +from pathlib import Path +import re +import sys +from typing import Dict, NamedTuple + + +class Fields(NamedTuple): + """Class to store an alignment in its different SAM fields.""" + + read_name: str + flag: str + ref_seq_name: str + position_in_ref: str + map_q: str + cigar_str: str + r_next: str + p_next: str + t_len: str + sequence: str + qual: str + edit_dist: str + md_str: str + + +def parse_arguments(): + """Command-line arguments parser.""" + parser = ArgumentParser( + description=__doc__, + formatter_class=RawDescriptionHelpFormatter + ) + parser.add_argument( + '-v', '--version', + action="version", + version="%(prog)s 1.1.0", + help="Show program's version number and exit" + ) + parser.add_argument( + 'infile', + help="Path to the FASTA file resulting from the oligomap mapping.", + type=Path, + ) + parser.add_argument( + '-n', '--nh-filter', + help=( + "Add NH tag to output and remove reads that contain more " + "aligments than the provided NH value (with min error)." + ), + type=int, + ) + + return parser + + +def get_cigar_md(errors: str, sequence: str, bars_line: str, + ref_seq: str) -> tuple[str, str]: + """Get the CIGAR and MD strings. + + Given the read and target sequences, the number of errors and the + alignment's bar representation, this function first checks if there is any + error in the mapping. If there is no errors, the CIGAR and MD strings are + the read's length followed by and "M" and the read's length respectively. + If there is an error, the function checks if the read has a deletion (the + read sequence contains a dash), an insertion (the reference sequence + contains a dash), or a single-point mutation. Finally, it checks the + error's position in the read sequence and builds the CIGAR and MD strings. + + Args: + errors: + number of mapping errors (either 0 or 1) + sequence: + read sequence + bars_line: + alignment's bar representation + ref_seq: + reference sequence + + Returns: + cigarStr: + alignment's CIGAR string + matchingString: + alignment's MD string with its corresponding tag + + Examples: + read sequence -> GAAGGCGCTTCACCTTTGGAGT + alignment representation -> |||||||||||||||||||||| + reference sequence -> GAAGGCGCTTCACCTTTGGAGT + errors -> 0 + + CIGAR -> 21M + MD -> MD:Z:21 + + ------------------------------------------------------ + + read sequence -> GAAGGCGCTTCACCTTTGGAGT + alignment representation -> ||||||||||| |||||||||| + reference sequence -> GAAGGCGCTTC-CCTTTGGAGT + errors -> 1 + + CIGAR -> 11M1I10M + MD -> MD:Z:21 + + ------------------------------------------------------ + + read sequence -> GAAGGCGCTTC-CCTTTGGAGT + alignment representation -> ||||||||||| |||||||||| + reference sequence -> GAAGGCGCTTCCCCTTTGGAGT + errors -> 1 + + CIGAR -> 11M1D10M + MD -> MD:Z:11C10 + + ------------------------------------------------------ + + read sequence -> GAAGGCGCTTCACCTTTGGAGT + alignment representation -> ||||||||||| |||||||||| + reference sequence -> GAAGGCGCTTCCCCTTTGGAGT + errors -> 1 + + CIGAR -> 21M + MD -> MD:Z:11C10 + """ + seq_len = len(sequence) + + if errors == '0': + return f"{seq_len}M", f"MD:Z:{seq_len}" + + # CASE 1: deletion in the read + if '-' in sequence: + indelerr = "1D" + + if bars_line[0] == ' ': + cigarStr = f"{indelerr}{seq_len - 1}M" + matchingString = f"MD:Z:^{ref_seq[0]}{seq_len - 1}" + + elif bars_line[-1] == " ": + cigarStr = f"{seq_len - 1}M{indelerr}" + matchingString = f"MD:Z:{seq_len - 1}^{ref_seq[seq_len - 1]}0" + + else: + idx = bars_line.index(' ') + cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx}M" + matchingString = f"MD:Z:{idx}^{ref_seq[idx]}{seq_len - idx -1}" + + return cigarStr, matchingString + + # CASE 2: insertion in the read + if '-' in ref_seq: + indelerr = "1I" + + if bars_line[0] == " ": + cigarStr = f"{indelerr}{seq_len - 1}M" + + elif bars_line[-1] == " ": + cigarStr = f"{seq_len - 1}M{indelerr}" + + else: + idx = bars_line.index(' ') + cigarStr = f"{idx}M{indelerr}{bars_line.count('|') - idx}M" + + return cigarStr, f"MD:Z:{seq_len}" + + # CASE 3: single point mutation + if bars_line[0] == " ": + matchingString = f"MD:Z:{ref_seq[0]}{seq_len - 1}" + + elif bars_line[-1] == " ": + matchingString = f"MD:Z:{seq_len - 1}{ref_seq[seq_len - 1]}" + + else: + idx = bars_line.index(' ') + matchingString = f"MD:Z:{idx}{ref_seq[idx]}{seq_len - idx - 1}" + + return f"{seq_len}M", matchingString + + +def get_sam_fields(aln: list[str]) -> Fields: + """Create the read's alignment in SAM format. + + Given the set of lines oligomap provides as an alignment + representation, this function returns a Fields class object with the + mandatory fields in a SAM file and three optional ones. + + field | value + -------------- + QNAME | Read's name + FLAG | Set to 0 if the reference sequence is found in the poisitive + | strand, and to 16 otherwise. + RNAME | Refernce sequence name + POS | Alignment's first position in the reference sequence + MAPQ | Value set to 255 (mapping quality not available) + CIGAR | Alignment's CIGAR string + RNEXT | Value set to * (unavailable information) + PNEXT | Value set to 0 (unavailable information) + TLEN | Value set to 0 (unavailable information) + SEQ | Mapped sequence + QUAL | Value set to * (Phred-scaled base quality not stored) + NM:i: | Alignment's edit distance (Optional field) + MD:Z: | Alignment's MD string (Optional field) + NH:i: | Alignment's NH (Optional field) + + Args: + aln: + list made up of the lines that form the output alignment after + mapping with oligomap. + + Returns: + fields: + Fields class NamedTuple with the alignment's data as SAM fields. + """ + seq_name_pos = aln[0].split() + errors = aln[2].split()[1] + seq = aln[3].strip() + + cigar, md = get_cigar_md(errors, seq, aln[4][:-1], aln[5].strip()) + + fields = Fields(seq_name_pos[0], + '0' if aln[2].split()[3] == '+' else "16", + aln[1].strip(), + seq_name_pos[5].split('.')[0], + "255", cigar, '*', '0', '0', + re.sub('-', '', seq), '*', + "NM:i:0" if errors == '0' else "NM:i:1", md) + + return fields + + +def eval_aln(nhfilter: int, d: Dict[str, list], min_err_nh: Dict[str, list], + fields: Fields) -> None: + """Evaluate an alignment to add, discard or write it to the STDOUT. + + Given a read's alignment, this function first checks if the dictionary + storing the read alignments is empty. If it is, the function checks if the + read has an entry in the dictionary storing the current minium error and + the NH. If there is not, or the minimum number of errors is higher than + the alignment under evaluation, the entry for that read in both + dictionaries is set to the data of the alignment under evaluation. + + If the alignments dictionary is not empty, the function checks if the + read under evaluation is the first in the dictionary. If it is, and the + number of errors in the current alignment is the same as the minimum error + for that read, the number of hits is increased by 1. If the number of hits + does not exceed the maximum NH or the NH is not provided, the alignment + under evaluation is appended to the read's entry. If a maximum NH is set, + and the read exceeds it, it is removed. If the minimum number of errors is + higher than the number of errors of the alignment under evaluation, the + entry for that read in both dictionaries is set to the data of the + current alignment. + + If the read is not the first in the dictionary, the function writes all + the alignments for the read in the first position to the standard output, + and both dictionaries are reset to only include the data for the + alignments under evaluation. + + Args: + nhfilter: + maximum number of hits an read can have to be kept + d: + dictionary with read names as keys and a list with Fields class + NamedTuples as values + min_err_nh: + dictionary with read names as keys and a list with the current + minimum error and the number of hits in this order as values + fields: + read's alignment as a Fields class NamedTuple + """ + seq_name = fields.read_name + errors = fields.edit_dist[-1] + + if len(d) == 0: + if (seq_name not in list(min_err_nh.keys()) or + errors < min_err_nh[seq_name][0]): + + min_err_nh[seq_name] = [errors, 1] + d[seq_name] = [fields] + else: + if seq_name == list(d.keys())[0]: + if errors == min_err_nh[seq_name][0]: + min_err_nh[seq_name][1] += 1 + + if nhfilter: + + if min_err_nh[seq_name][1] <= nhfilter: + d[seq_name].append(fields) + + else: + d.clear() + sys.stderr.write(f"Filtered by NH | Read {seq_name}" + + f" | Errors = {errors}\n") + + else: + d[seq_name].append(fields) + + elif errors < min_err_nh[seq_name][0]: + sys.stderr.write(f"Filtered by ERROR | Read {seq_name}" + + f" | Errors = {min_err_nh[seq_name][0]}\n") + + min_err_nh[seq_name] = [min(errors, min_err_nh[seq_name][0]), + 1] + d[seq_name] = [fields] + else: + for seq, aln in d.items(): + sys.stderr.write(f"Written read {seq} | " + + f"Errors = {min_err_nh[seq][0]} | " + + f"NH = {min_err_nh[seq][1]}\n") + for field in aln: + sys.stdout.write('\t'.join(field) + + f"\tNH:i:{min_err_nh[seq][1]}" + '\n') + + d.clear() + min_err_nh.clear() + + d[seq_name] = [fields] + min_err_nh[seq_name] = [errors, 1] + + +def main(arguments) -> None: + """Convert the alignments in the oligomap output file to SAM format.""" + read_seqs: Dict[str, list] = {} + seq_min_error_nh: Dict[str, list] = {} + + with open(arguments.infile, 'r', encoding="utf-8") as in_file: + + sys.stderr.write("##############\nSTART READING...\n##############\n") + + lines = [in_file.readline() for _ in range(6)] + i = 1 + + while lines[0] != "": + + fields = get_sam_fields(lines) + + sys.stderr.write(f"Record:{i} | Sequence:{fields.read_name}\n") + + eval_aln(arguments.nh_filter, read_seqs, seq_min_error_nh, + fields) + i += 1 + + in_file.readline() + lines = [in_file.readline() for _ in range(6)] + + if len(read_seqs) > 0: + + for read_name, alignments in read_seqs.items(): + sys.stderr.write(f"Printed read {read_name} | Errors = " + + f"{seq_min_error_nh[read_name][0]} | " + + f"NH = {seq_min_error_nh[read_name][1]}\n") + + for aln in alignments: + sys.stdout.write('\t'.join(aln) + + f"\tNH:i:{seq_min_error_nh[read_name][1]}" + + '\n') + + sys.stderr.write("SUCCESSFULLY FINISHED.") + + +if __name__ == "__main__": + args = parse_arguments().parse_args() # pragma: no cover + main(args) # pragma: no cover diff --git a/scripts/tests/files/in_oligomap_output.oligomap b/scripts/tests/files/in_oligomap_output.oligomap new file mode 100644 index 00000000..69e6420d --- /dev/null +++ b/scripts/tests/files/in_oligomap_output.oligomap @@ -0,0 +1,69 @@ +read_1 (19 nc) 1..19 19 44278..44297 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT + |||||||||||||||||| +GTACAAAGGGAAGCCCTTT + +read_1 (19 nc) 1..19 19 44377..44395 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT +|||||||||||||| |||| +CTACAAAGGGAAGCCCTTT + +read_1 (19 nc) 1..19 19 50971..50989 +19 +errors: 1 orientation: + +CTACAAAGGGAAGCACTTT +|||||||||||||||||| +CTACAAAGGGAAGCACTTC + +read_1 (19 nc) 1..19 19 53471..53489 +19 +errors: 0 orientation: + +CTACAAAGGGAAGCACTTT +||||||||||||||||||| +CTACAAAGGGAAGCACTTT + +read_2 (23 nc) 1..23 19 7627..7648 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC + |||||||||||||||||||||| +-AAGCACCTCCAGAGCTTGAAGC + +read_2 (23 nc) 1..23 19 7886..7908 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC +||||||||| ||||||||||||| +AAAGCACCT-CAGAGCTTGAAGC + +read_2 (23 nc) 1..23 19 9524..7545 +19 +errors: 1 orientation: - +AAAGCACCTCCAGAGCTTGAAGC +|||||||||||||||||||||| +AAAGCACCTCCAGAGCTTGAAG- + +read_3 (22 nc) 1..22 19 44414..44434 +19 +errors: 1 orientation: + +-GAAGGCGCTTCACCTTTGGAGT + |||||||||||||||||||||| +TGAAGGCGCTTCACCTTTGGAGT + +read_3 (22 nc) 1..22 19 54316..54336 +19 +errors: 1 orientation: + +GAAGGCGCTTC-CCTTTGGAGT +||||||||||| |||||||||| +GAAGGCGCTTCGCCTTTGGAGT + +read_3 (22 nc) 1..22 19 55718..55738 +19 +errors: 1 orientation: + +GAAGGCGCTTCACCTTTGGAGT- +|||||||||||||||||||||| +GAAGGCGCTTCACCTTTGGAGTA diff --git a/scripts/tests/files/oligomap_genome_2_nh.sam b/scripts/tests/files/oligomap_genome_2_nh.sam new file mode 100644 index 00000000..9816087c --- /dev/null +++ b/scripts/tests/files/oligomap_genome_2_nh.sam @@ -0,0 +1 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:0 MD:Z:19 NH:i:1 diff --git a/scripts/tests/files/oligomap_single_read.sam b/scripts/tests/files/oligomap_single_read.sam new file mode 100644 index 00000000..8bf2e122 --- /dev/null +++ b/scripts/tests/files/oligomap_single_read.sam @@ -0,0 +1,2 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:1 MD:Z:G18 NH:i:2 +read_1 0 19 44278 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:1 MD:Z:18C NH:i:2 diff --git a/scripts/tests/files/oligomap_transcriptome_no_nh.sam b/scripts/tests/files/oligomap_transcriptome_no_nh.sam new file mode 100644 index 00000000..7fd084bf --- /dev/null +++ b/scripts/tests/files/oligomap_transcriptome_no_nh.sam @@ -0,0 +1,7 @@ +read_1 0 19 53471 255 19M * 0 0 CTACAAAGGGAAGCACTTT * NM:i:0 MD:Z:19 NH:i:1 +read_2 16 19 7627 255 1I22M * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_2 16 19 7886 255 9M1I13M * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_2 16 19 9524 255 22M1I * 0 0 AAAGCACCTCCAGAGCTTGAAGC * NM:i:1 MD:Z:23 NH:i:3 +read_3 0 19 44414 255 1D22M * 0 0 GAAGGCGCTTCACCTTTGGAGT * NM:i:1 MD:Z:^T22 NH:i:3 +read_3 0 19 54316 255 11M1D10M * 0 0 GAAGGCGCTTCCCTTTGGAGT * NM:i:1 MD:Z:11^G10 NH:i:3 +read_3 0 19 55718 255 22M1D * 0 0 GAAGGCGCTTCACCTTTGGAGT * NM:i:1 MD:Z:22^A0 NH:i:3 diff --git a/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py new file mode 100644 index 00000000..7a143f07 --- /dev/null +++ b/scripts/tests/test_oligomap_output_to_sam_nh_filtered.py @@ -0,0 +1,460 @@ +"""Unit tests for module 'oligomap_output_to_sam_nh_filtered.py'.""" + +import argparse +from pathlib import Path +import sys +from typing import NamedTuple + +import pytest + +sys.path.append("../../") + + +from scripts.oligomap_output_to_sam_nh_filtered import( + eval_aln, + Fields, + get_cigar_md, + get_sam_fields, + main, + parse_arguments +) + + +@pytest.fixture +def empty_file(): + """Import path to empty file.""" + empty_file = Path("files/empty_file") + + return empty_file + + +@pytest.fixture +def genome_nh_2(): + """Import path to test files with maximum NH set to 2.""" + oligo_in = Path("files/in_oligomap_output.oligomap") + oligo_out = Path("files/oligomap_genome_2_nh.sam") + + return oligo_in, oligo_out + + +@pytest.fixture +def transcriptome_no_nh(): + """Import path to test files with reference set to transcriptome.""" + oligo_in = Path("files/in_oligomap_output.oligomap") + oligo_out = Path("files/oligomap_transcriptome_no_nh.sam") + + return oligo_in, oligo_out + +@pytest.fixture +def single_read(): + """Import path to test file with a single read.""" + oligo_out = Path("files/oligomap_single_read.sam") + + return oligo_out + +@pytest.fixture +def aln_fields(): + """Create sample alignment as a Fields class NamedTuple.""" + # Perfect alignment + field_1 = Fields("read_1", "0", "19", "44377", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:0", "MD:Z:19") + + # Alignment with a mismatch in the first position + field_2 = Fields("read_1", '0', "19", "53471", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:G18") + + # Alignment with a mismatch in the last position + field_3 = Fields("read_1", "0", "19", "44278", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:18C") + + # Alignment with a mismatch in the middle of the read sequence + field_4 = Fields("read_1", "0", "19", "50971", "255", "19M", '*', '0', '0', + "CTACAAAGGGAAGCACTTT", '*', "NM:i:1", "MD:Z:14C4") + + # Alignment with an insertion at read's first position + field_5 = Fields("read_2", "16", "19", "7627", "255", "1I22M", '*', '0', '0', + "AAAGCACCTCCAGAGCTTGAAGC", '*', "NM:i:1", "MD:Z:23") + + # Alignment with an insertion in the middle of the read sequence + field_6 = Fields("read_2", "16", "19", "7886", "255", "9M1I12M", '*', '0', '0', + "AAAGCACCTCCAGAGCTTGAAGC", '*', "NM:i:1", "MD:Z:23") + + return [field_1, field_2, field_3, field_4, field_5, field_6] + + +@pytest.fixture +def alns(): + """Create sample alignment lists to extract the CIGAR and MD strings. + + The list contain the number of errors, the read's sequence, the alignment + representation in bars and the reference sequence as strings in that + order. + """ + # Perfect alignment + aln1 = [] + aln1.append("0") + aln1.append("CTACAAAGGGAAGCACTTT") + aln1.append("|||||||||||||||||||") + aln1.append("CTACAAAGGGAAGCACTTT") + + # Alignment with a mismatch in the first position + aln2 = [] + aln2.append("1") + aln2.append("CTACAAAGGGAAGCACTTT") + aln2.append(" ||||||||||||||||||") + aln2.append("GTACAAAGGGAAGCACTTT") + + # Alignment with a mismatch in the last position + aln3 = [] + aln3.append("1") + aln3.append("CTACAAAGGGAAGCACTTT") + aln3.append("|||||||||||||||||| ") + aln3.append("CTACAAAGGGAAGCACTTC") + + # Alignment with a mismatch in the middle of the sequence + aln4 = [] + aln4.append("1") + aln4.append("CTACAAAGGGAAGCACTTT") + aln4.append("|||||||||||||| ||||") + aln4.append("CTACAAAGGGAAGCCCTTT") + + # Alignment with an insertion at read's first position + aln5 = [] + aln5.append("1") + aln5.append("AAAGCACCTCCAGAGCTTGAAGC") + aln5.append(" ||||||||||||||||||||||") + aln5.append("-AAGCACCTCCAGAGCTTGAAGC") + + # Alignment with an insertion at read's last position + aln6 = [] + aln6.append("1") + aln6.append("AAAGCACCTCCAGAGCTTGAAGC") + aln6.append("|||||||||||||||||||||| ") + aln6.append("AAAGCACCTCCAGAGCTTGAAG-") + + # Alignment with an insertion in the middle of the read + aln7 = [] + aln7.append("1") + aln7.append("AAAGCACCTCCAGAGCTTGAAGC") + aln7.append("||||||||| |||||||||||||") + aln7.append("AAAGCACCT-CAGAGCTTGAAGC") + + # Alignment with a deletion at read's first position + aln8 = [] + aln8.append("1") + aln8.append("-GAAGGCGCTTCACCTTTGGAGT") + aln8.append(" ||||||||||||||||||||||") + aln8.append("TGAAGGCGCTTCACCTTTGGAGT") + + # Alignment with a deletion at read's last position + aln9 = [] + aln9.append("1") + aln9.append("GAAGGCGCTTCACCTTTGGAGT-") + aln9.append("|||||||||||||||||||||| ") + aln9.append("GAAGGCGCTTCACCTTTGGAGTA") + + # Alignment with a deletion in the middle of the read + aln10 = [] + aln10.append("1") + aln10.append("GAAGGCGCTTC-CCTTTGGAGT") + aln10.append("||||||||||| ||||||||||") + aln10.append("GAAGGCGCTTCACCTTTGGAGT") + + return [aln1, aln2, aln3, aln4, aln5, aln6, aln7, aln8, aln9, aln10] + + +class TestParseArguments: + """Test 'parse_arguments()' function.""" + + def test_no_files(self, monkeypatch): + """Call without input nor output files.""" + with pytest.raises(SystemExit) as sysex: + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered'] + ) + parse_arguments().parse_args() + assert sysex.value.code == 2 + + def test_in_files(self, monkeypatch, empty_file): + """Call with in file.""" + empty_in = empty_file + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(empty_in), + ] + ) + + args = parse_arguments().parse_args() + assert isinstance(args, argparse.Namespace) + + def test_all_arguments(self, monkeypatch, genome_nh_2): + """Call with all the arguments.""" + fa_in, sam_out = genome_nh_2 + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(fa_in), + '-n', '100', + ] + ) + + args = parse_arguments().parse_args() + assert isinstance(args, argparse.Namespace) + + +class TestGetCigarMd: + """Test 'get_cigar_md()' function.""" + + def test_perfect_aln(self, alns): + """Test perfect alignment.""" + result = ("19M", "MD:Z:19") + + assert get_cigar_md(alns[0][0], alns[0][1], + alns[0][2], alns[0][3]) == result + + def test_mm_first_pos_aln(self, alns): + """Test mismatch at read's first position.""" + result = ("19M", "MD:Z:G18") + + assert get_cigar_md(alns[1][0], alns[1][1], + alns[1][2], alns[1][3]) == result + + def test_mm_last_pos_aln(self, alns): + """Test mismatch at read's last position.""" + result = ("19M", "MD:Z:18C") + + assert get_cigar_md(alns[2][0], alns[2][1], + alns[2][2], alns[2][3]) == result + + def test_mm_middle_aln(self, alns): + """Test mismatch in the middle of the read.""" + result = ("19M", "MD:Z:14C4") + + assert get_cigar_md(alns[3][0], alns[3][1], + alns[3][2], alns[3][3]) == result + + def test_in_first_pos_aln(self, alns): + """Test insertion at read's first position.""" + result = ("1I22M", "MD:Z:23") + + assert get_cigar_md(alns[4][0], alns[4][1], + alns[4][2], alns[4][3]) == result + + def test_in_last_pos_aln(self, alns): + """Test insertion at read's last position.""" + result = ("22M1I", "MD:Z:23") + + assert get_cigar_md(alns[5][0], alns[5][1], + alns[5][2], alns[5][3]) == result + + def test_in_middle_aln(self, alns): + """Test insertion in the middle of the read.""" + result = ("9M1I13M", "MD:Z:23") + + assert get_cigar_md(alns[6][0], alns[6][1], + alns[6][2], alns[6][3]) == result + + def test_del_first_pos_aln(self, alns): + """Test deletion at read's first position.""" + result = ("1D22M", "MD:Z:^T22") + + assert get_cigar_md(alns[7][0], alns[7][1], + alns[7][2], alns[7][3]) == result + + def test_del_last_pos_aln(self, alns): + """Test deletion at read's last position.""" + result = ("22M1D", "MD:Z:22^A0") + + assert get_cigar_md(alns[8][0], alns[8][1], + alns[8][2], alns[8][3]) == result + + def test_del_middle_aln(self, alns): + """Test deletion in the middle of the read.""" + result = ("11M1D10M", "MD:Z:11^A10") + + assert get_cigar_md(alns[9][0], alns[9][1], + alns[9][2], alns[9][3]) == result + + +class TestGetSAMFields(): + """Test 'get_sam_fields()' function.""" + + def test_pos_strand_no_err(self,alns, aln_fields): + """Test perfect alignment in the positive strand.""" + line1 = "read_1 (19 nc) 1...19 19 44377...44395" + line2 = "19" + line3 = "errors: 0 orientation: +" + + assert get_sam_fields([line1, line2, line3, alns[0][1], + alns[0][2], alns[0][3]]) == aln_fields[0] + + def test_neg_strand_one_err(self,alns, aln_fields): + """Test alignment with an insertion in the negative strand.""" + line1 = "read_2 (23 nc) 1...23 19 7886...7908" + line2 = "19" + line3 = "errors: 1 orientation: -" + + assert get_sam_fields([line1, line2, line3, alns[6][1], + alns[6][2], alns[6][3]]) == aln_fields[5] + + +class TestEvalAln: + """Test ''eval_aln()' function.""" + + def test_eval_empty_dict_new_read(self, aln_fields): + """Test evaluation with a new read and an empty dictionary.""" + d = dict() + minerr_nh = {"read_0" : ['0', 1]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(nhfilter, d, minerr_nh, aln) + + assert list(d.keys())[0] == aln.read_name + assert minerr_nh[aln.read_name] == ['0', 1] + + def test_eval_empty_dict_smaller_error(self, aln_fields): + """Test evaluation with a smaller error and an empty dictionary.""" + d = dict() + minerr_nh = {"read_1" : ['1', 1]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(nhfilter, d, minerr_nh, aln) + + assert list(d.keys())[0] == aln.read_name + assert minerr_nh[aln.read_name] == ['0', 1] + + def test_increase_nh_no_filter(self, aln_fields): + """Test evaluation when increasing NH without a maximum value.""" + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[3] + nhfilter = None + + eval_aln(nhfilter, d, minerr_nh, aln) + + assert len(d[aln.read_name]) == 3 + assert minerr_nh[aln.read_name] == ['1', 3] + + def test_exceed_nh_filter_2(self, capsys, aln_fields): + """Test evaluation when exceeding the maximum NH set to 2.""" + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[3] + nhfilter = 2 + + eval_aln(nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert len(d) == 0 + assert minerr_nh[aln.read_name] == ['1', 3] + assert captured.err == "Filtered by NH | Read read_1 | Errors = 1\n" + + def test_no_exceed_nh_filter_2(self, aln_fields): + """Test evaluation when increasing NH with maximum value of 2.""" + d = {"read_1": [aln_fields[1]]} + minerr_nh = {"read_1" : ['1', 1]} + aln = aln_fields[2] + nhfilter = 2 + + eval_aln(nhfilter, d, minerr_nh, aln) + + assert len(d[aln.read_name]) == 2 + assert minerr_nh[aln.read_name] == ['1', 2] + + def test_smaller_min_error(self, capsys, aln_fields): + """Test evaluation when having a smaller minimumm error.""" + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[0] + nhfilter = None + + eval_aln(nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert len(d[aln.read_name]) == 1 + assert minerr_nh[aln.read_name] == ['0', 1] + assert captured.err == "Filtered by ERROR | Read read_1 | Errors = 1\n" + + def test_different_read(self, capsys, tmp_path, aln_fields, single_read): + """Test evaluation when having to write due to a different read.""" + output = tmp_path/"oligomap_genome_mappings.sam" + out_file = single_read + + d = {"read_1": [aln_fields[1], aln_fields[2]]} + minerr_nh = {"read_1" : ['1', 2]} + aln = aln_fields[4] + nhfilter = None + + eval_aln(nhfilter, d, minerr_nh, aln) + captured = capsys.readouterr() + + assert list(d.keys())[0] == aln.read_name + assert len(minerr_nh) == 1 + assert captured.err == "Written read read_1 | Errors = 1 | NH = 2\n" + + with open(out_file, 'r') as expected: + assert captured.out == expected.read() + + +class TestMain: + """Test 'main()' function.""" + + def test_main_empty_file(self, monkeypatch, capsys, empty_file): + """Test main function with an empty file.""" + empty_in = empty_file + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(empty_in), + ] + ) + args = parse_arguments().parse_args() + main(args) + captured = capsys.readouterr() + + with open(empty_in, 'r') as expected: + assert captured.out == expected.read() + + def test_main_max_nh_2(self, monkeypatch, capsys, genome_nh_2): + """Test main function with NH set to 2.""" + in_file, out_file = genome_nh_2 + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(in_file), + '-n', '2', + ] + ) + args = parse_arguments().parse_args() + main(args) + captured = capsys.readouterr() + + with open(out_file, 'r') as expected: + assert captured.out == expected.read() + + def test_main_no_nh_transcriptome(self, monkeypatch, capsys, + transcriptome_no_nh): + """Test main function with no NH set for transcriptome mappings.""" + in_file, out_file = transcriptome_no_nh + + monkeypatch.setattr( + sys, 'argv', + ['oligomap_output_to_sam_nh_filtered', + str(in_file), + ] + ) + args = parse_arguments().parse_args() + main(args) + captured = capsys.readouterr() + + with open(out_file, 'r') as expected: + assert captured.out == expected.read() + diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index f3cf8b95..bc800e70 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -405,9 +405,8 @@ rule sort_genome_oligomap: rule convert_genome_to_sam_oligomap: input: - report=OUT_DIR / "{sample}" / "oligomap_genome_report.txt", sort=OUT_DIR / "{sample}" / "oligomap_genome_sorted.fasta", - script=SCRIPTS_DIR / "oligomapOutputToSam_nhfiltered.py", + script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: gmap=OUT_DIR / "{sample}" / "oligomap_genome_mappings.sam", params: @@ -424,7 +423,7 @@ rule convert_genome_to_sam_oligomap: ENV_DIR / "python.yaml" shell: "(python {input.script} \ - -i {input.sort} \ + {input.sort} \ -n {params.nh} \ > {output.gmap}) &> {log}" @@ -498,9 +497,8 @@ rule sort_transcriptome_oligomap: rule convert_transcriptome_to_sam_oligomap: input: - report=OUT_DIR / "{sample}" / "oligomap_transcriptome_report.txt", sort=OUT_DIR / "{sample}" / "oligomap_transcriptome_sorted.fasta", - script=SCRIPTS_DIR / "oligomapOutputToSam_nhfiltered.py", + script=SCRIPTS_DIR / "oligomap_output_to_sam_nh_filtered.py", output: tmap=OUT_DIR / "{sample}" / "oligomap_transcriptome_mappings.sam", params: @@ -514,10 +512,10 @@ rule convert_transcriptome_to_sam_oligomap: ENV_DIR / "python.yaml" shell: "(python {input.script} \ - -i {input.sort} \ + {input.sort} \ -n {params.nh} \ - > {output.tmap} \ - ) &> {log}" + > {output.tmap}) &> {log}" + ###############################################################################