From 506690e1a67800dbcc639b6a366e1257e18525e4 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 10 Jan 2024 13:11:33 -0500 Subject: [PATCH 01/10] changed how marker plots are created [skip ci] --- lusSTR/scripts/marker.py | 17 +++++++--- lusSTR/scripts/repeat.py | 19 +++++------ lusSTR/wrappers/convert.py | 47 --------------------------- lusSTR/wrappers/filter.py | 66 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 61 deletions(-) diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index b95d7d61..898af645 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -386,14 +386,21 @@ def convert(self): if len(self.uas_sequence) < 110: bracketed_form = collapse_repeats_by_length(self.uas_sequence, 4) else: + print(self.uas_sequence) if "GGGCTGCCTA" in self.uas_sequence: break_point = self.uas_sequence.index("GGGCTGCCTA") + 10 - else: + bracketed_form = ( + f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " + f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" + ) + elif "TTTT" in self.uas_sequence: break_point = self.uas_sequence.index("TTTT") + 14 - bracketed_form = ( - f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " - f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" - ) + bracketed_form = ( + f"{collapse_repeats_by_length(self.uas_sequence[:break_point], 4)} " + f"{collapse_repeats_by_length(self.uas_sequence[break_point:], 4)}" + ) + else: + bracketed_form = "" return bracketed_form diff --git a/lusSTR/scripts/repeat.py b/lusSTR/scripts/repeat.py index 7175e56b..76ff49c4 100644 --- a/lusSTR/scripts/repeat.py +++ b/lusSTR/scripts/repeat.py @@ -163,15 +163,16 @@ def repeat_copy_number(bf, repeat): The input is a sequence string collapsed to bracketed sequence form. """ longest = 0 - for block in bf.split(" "): - if block == repeat: - if 1 > longest: - longest = 1 - match = re.match(r"\[" + repeat + r"\](\d+)", block) - if match: - length = int(match.group(1)) - if length > longest: - longest = length + if bf != "": + for block in bf.split(" "): + if block == repeat: + if 1 > longest: + longest = 1 + match = re.match(r"\[" + repeat + r"\](\d+)", block) + if match: + length = int(match.group(1)) + if length > longest: + longest = length return str(longest) diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 652a70fb..39ffb810 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -13,7 +13,6 @@ import csv import json import math -import matplotlib.pyplot as plt import numpy as np import os import pandas as pd @@ -23,7 +22,6 @@ from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed -from matplotlib.backends.backend_pdf import PdfPages from pathlib import Path @@ -101,7 +99,6 @@ def format_table(input, software, kit="forenseq"): ] flanks_list.append(flank_summary) continue - marker = STRMarkerObject(locus, sequence, software, kit=kit) if locus == "D12S391" and kit == "powerseq": if "." in str(marker.canonical): @@ -185,48 +182,6 @@ def sort_table(table): return sorted_table -def marker_plots(df, output_name, sex=False): - Path("MarkerPlots").mkdir(parents=True, exist_ok=True) - df["CE_Allele"] = df["CE_Allele"].astype(float) - for id in df["SampleID"].unique(): - sample_id = f"{id}_sexchr" if sex else id - with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: - make_plot(df, id, sex, sameyaxis=False) - pdf.savefig() - make_plot(df, id, sex) - pdf.savefig() - - -def make_plot(df, id, sex=False, sameyaxis=True): - sample_df = df[df["SampleID"] == id] - sample_id = f"{id}_sexchr" if sex else id - max_reads = max(sample_df["Reads"]) - n = 100 if max_reads > 1000 else 10 - max_yvalue = int(math.ceil(max_reads / n)) * n - increase_value = int(math.ceil((max_yvalue / 5)) / n) * n - fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30)) - n = 0 - for marker in sample_df["Locus"].unique(): - n += 1 - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n) - ax.bar(marker_df["CE_Allele"], marker_df["Reads"]) - if sameyaxis: - ax.set_yticks(np.arange(0, max_yvalue, increase_value)) - ax.set_xticks( - np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0) - ) - ax.title.set_text(marker) - if sameyaxis: - plt.text( - 0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24 - ) - else: - plt.text( - 0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24 - ) - - def main(input, out, kit, software, sex, nocombine): input = str(input) out = str(out) @@ -248,7 +203,6 @@ def main(input, out, kit, software, sex, nocombine): sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) - marker_plots(sex_final_table, output_name, sex=True) if software != "uas": if not autosomal_final_table.empty: autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False) @@ -260,7 +214,6 @@ def main(input, out, kit, software, sex, nocombine): autosomal_final_table.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False) - marker_plots(autosomal_final_table, output_name) if __name__ == "__main__": diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 42817c4e..4d767abb 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -16,6 +16,9 @@ import json import lusSTR from lusSTR.scripts.filter_settings import filters, flags +import math +from matplotlib.backends.backend_pdf import PdfPages +import matplotlib.pyplot as plt import numpy as np import os import pandas as pd @@ -322,6 +325,68 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df +def marker_plots(df, output_name, sex=False): + Path("MarkerPlots").mkdir(parents=True, exist_ok=True) + df["CE_Allele"] = df["CE_Allele"].astype(float) + filt_df = df[df["allele_type"] == "real_allele"] + for id in df["SampleID"].unique(): + sample_id = f"{id}_sexchr" if sex else id + with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: + make_plot(filt_df, id, sex, filters=True) + pdf.savefig() + make_plot(df, id, sex) + pdf.savefig() + make_plot(df, id, sex, sameyaxis=True) + pdf.savefig() + + +def make_plot(df, id, sex=False, sameyaxis=False, filters=False): + sample_df = df[df["SampleID"] == id] + sample_id = f"{id}_sexchr" if sex else id + sample_df["Type"] = sample_df["allele_type"] + conditions = [ + sample_df["allele_type"].str.contains("real"), + sample_df["allele_type"].str.contains("BelowAT"), + sample_df["allele_type"].str.contains("stutter"), + ] + values = ["Real", "BelowAT", "Stutter"] + sample_df["Type"] = np.select(conditions, values) + max_reads = max(sample_df["Reads"]) + n = 100 if max_reads > 1000 else 10 + max_yvalue = int(math.ceil(max_reads / n)) * n + increase_value = int(math.ceil((max_yvalue / 5)) / n) * n + fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30)) + n = 0 + for marker in sample_df["Locus"].unique(): + n += 1 + colors = {"Real": "g", "Stutter": "b", "BelowAT": "r"} + marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n) + ax.bar( + marker_df["CE_Allele"], + marker_df["Reads"], + edgecolor="k", + color=[colors[i] for i in marker_df["Type"]], + ) + labels = marker_df["Type"].unique() + handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] + if not filters: + plt.legend(handles, labels, title="Allele Type") + if sameyaxis: + ax.set_yticks(np.arange(0, max_yvalue, increase_value)) + ax.set_xticks( + np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0) + ) + ax.title.set_text(marker) + if sameyaxis: + title = "Marker Plots for All Alleles With Same Y-Axis Scale" + elif filters: + title = "Marker Plots for True Alleles With Custom Y-Axis Scale" + else: + title = "Marker Plots for All Alleles With Custom Y-Axis Scale" + plt.text(0.4, 0.95, title, transform=fig.transFigure, size=24) + + def main( input, output_type, profile_type, data_type, output_dir, info, separate, nofilters, strand ): @@ -350,6 +415,7 @@ def main( else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} final_df, flags_df = process_strs(dict_loc, data_type, seq_col) + marker_plots(final_df, outpath, sex=False) if output_type == "efm" or output_type == "mpsproto": EFM_output(final_df, outpath, profile_type, data_type, brack_col, separate) else: From 2a6fd83824daf8b9c28425a44c0475713d77381c Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 10 Jan 2024 13:15:25 -0500 Subject: [PATCH 02/10] remove print statement [skip ci] --- lusSTR/scripts/marker.py | 1 - lusSTR/wrappers/convert.py | 1 - 2 files changed, 2 deletions(-) diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 898af645..1817faf5 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -386,7 +386,6 @@ def convert(self): if len(self.uas_sequence) < 110: bracketed_form = collapse_repeats_by_length(self.uas_sequence, 4) else: - print(self.uas_sequence) if "GGGCTGCCTA" in self.uas_sequence: break_point = self.uas_sequence.index("GGGCTGCCTA") + 10 bracketed_form = ( diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 39ffb810..fa8db047 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -12,7 +12,6 @@ import csv import json -import math import numpy as np import os import pandas as pd From 504c142216421ab504d180288f7f7d423d8fd512 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 06:40:54 -0500 Subject: [PATCH 03/10] added AT line [skip ci] --- lusSTR/wrappers/filter.py | 51 ++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 4d767abb..6002cdd5 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -325,49 +325,54 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name, sex=False): +def marker_plots(df, output_name): Path("MarkerPlots").mkdir(parents=True, exist_ok=True) df["CE_Allele"] = df["CE_Allele"].astype(float) filt_df = df[df["allele_type"] == "real_allele"] - for id in df["SampleID"].unique(): - sample_id = f"{id}_sexchr" if sex else id + for sample_id in df["SampleID"].unique(): + # sample_id = f"{id}_sexchr" if sex else id with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: - make_plot(filt_df, id, sex, filters=True) + make_plot(filt_df, sample_id, filters=True, at=False) pdf.savefig() - make_plot(df, id, sex) + make_plot(df, sample_id) pdf.savefig() - make_plot(df, id, sex, sameyaxis=True) + make_plot(df, sample_id, sameyaxis=True) pdf.savefig() -def make_plot(df, id, sex=False, sameyaxis=False, filters=False): - sample_df = df[df["SampleID"] == id] - sample_id = f"{id}_sexchr" if sex else id +def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): + sample_df = df[df["SampleID"] == sample_id] + # sample_id = f"{id}_sexchr" if sex else id sample_df["Type"] = sample_df["allele_type"] conditions = [ sample_df["allele_type"].str.contains("real"), sample_df["allele_type"].str.contains("BelowAT"), sample_df["allele_type"].str.contains("stutter"), ] - values = ["Real", "BelowAT", "Stutter"] + values = ["Typed", "BelowAT", "Stutter"] sample_df["Type"] = np.select(conditions, values) max_reads = max(sample_df["Reads"]) n = 100 if max_reads > 1000 else 10 max_yvalue = int(math.ceil(max_reads / n)) * n increase_value = int(math.ceil((max_yvalue / 5)) / n) * n - fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30)) + fig = plt.figure(figsize=(30, 30)) n = 0 for marker in sample_df["Locus"].unique(): n += 1 - colors = {"Real": "g", "Stutter": "b", "BelowAT": "r"} + colors = {"Typed": "g", "Stutter": "b", "BelowAT": "r"} marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n) + # print(marker_df) + ax = fig.add_subplot(6, 5, n) ax.bar( marker_df["CE_Allele"], marker_df["Reads"], edgecolor="k", color=[colors[i] for i in marker_df["Type"]], ) + if at: + at = get_at(marker_df, marker) + ax.axhline(at, linestyle="--", color="k") + ax.text(min(marker_df["CE_Allele"]) - 0.9, at + (at * 0.1), f"AT", size=12) labels = marker_df["Type"].unique() handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] if not filters: @@ -387,6 +392,23 @@ def make_plot(df, id, sex=False, sameyaxis=False, filters=False): plt.text(0.4, 0.95, title, transform=fig.transFigure, size=24) +def get_at(df, locus): + metadata = filter_marker_data[locus] + thresh_use = metadata["AnalyticalThresholdUse"] + at_st = float(metadata["AnalyticalThresholdStaticCount"]) + at_dy = metadata["AnalyticalThresholdDynamicPercent"] + at_dy_num = df["Reads"].sum() * float(at_dy) + if thresh_use.lower() == "both": + at = at_st if at_st > at_dy_num else at_dy_num + elif thresh_use.lower() == "static": + at = at_st + elif thresh_use.lower() == "dynamic": + at = at_dy_num + else: + raise ValueError("Incorrect AT specified in filters.json. Please check and re-run.") + return at + + def main( input, output_type, profile_type, data_type, output_dir, info, separate, nofilters, strand ): @@ -408,6 +430,7 @@ def main( ) if nofilters: full_df["allele_type"] = "real_allele" + marker_plots(full_df, outpath) if output_type == "efm" or output_type == "mpsproto": EFM_output(full_df, outpath, profile_type, data_type, brack_col, separate) else: @@ -415,7 +438,7 @@ def main( else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} final_df, flags_df = process_strs(dict_loc, data_type, seq_col) - marker_plots(final_df, outpath, sex=False) + marker_plots(final_df, outpath) if output_type == "efm" or output_type == "mpsproto": EFM_output(final_df, outpath, profile_type, data_type, brack_col, separate) else: From 677f2c5329fbea7104aa8a90d46e2a3bbb7ceea7 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 06:41:15 -0500 Subject: [PATCH 04/10] updated test_suite [skip ci] --- lusSTR/tests/test_suite.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index cf1aaee0..25632b71 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -323,24 +323,15 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path): assert filecmp.cmp(exp_output, obs_output) is True -@pytest.mark.parametrize( - "sex", - [True, False], -) -def test_marker_plots(sex, tmp_path): +def test_marker_plots(tmp_path): inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx") exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf") sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf") - if sex: - arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile), "--sex"] - else: - arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile)] + arglist = ["config", "-w", str(tmp_path), "--input", str(inputfile)] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) - snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)] + snakemake_arglist = ["strs", "all", "-w", str(tmp_path)] lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist)) assert os.path.exists(exp_output) is True - if sex: - assert os.path.exists(sex_exp) is True def test_genemarker(tmp_path): From fdc00866274bc318d4700debbcebcc3766f2f47a Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 09:52:49 -0500 Subject: [PATCH 05/10] updated tests --- lusSTR/scripts/filter_settings.py | 4 +- .../LUSPlus_sequence_info.csv | 48 ++++++++--------- .../NGS_stutter_test/Sample1_nofilter.csv | 54 +++++++++---------- lusSTR/wrappers/filter.py | 3 +- 4 files changed, 54 insertions(+), 55 deletions(-) diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index 1d6b5b5d..f3f60461 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -531,6 +531,6 @@ def same_size_filter(df, metadata, datatype): else: final_df = pd.concat([final_df, df_filt]) final_df = final_df.reset_index(drop=True) - if datatype == "lusplus": - final_df = final_df.drop("CE_Allele", axis=1) + # if datatype == "lusplus": + # final_df = final_df.drop("CE_Allele", axis=1) return final_df diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv index a093c1fd..9bcfd9a0 100644 --- a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv +++ b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv @@ -1,24 +1,24 @@ -SampleID,Locus,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter,CE_Allele -Sample1,D4S2408,10_10_0,1022,real_allele,,,,,,, -Sample1,D4S2408,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,,, -Sample1,D4S2408,8_8_0,1050,real_allele,,,,,,, -Sample1,D8S1179,14_12_1_0,869,real_allele,,,,,,, -Sample1,D8S1179,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212, -Sample1,D8S1179,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201, -Sample1,D9S1122,13_11,948,real_allele,,,,,,, -Sample1,D9S1122,12_10,108,-1_stutter,13_11,,948.0,,,0.114, -Sample1,D9S1122,11_11,991,real_allele,,,,,,, -Sample1,D9S1122,10_10,87,-1_stutter,11_11,,991.0,,,0.088, -Sample1,FGA,23_15_3_0,1436,real_allele,,,,,,, -Sample1,FGA,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182, -Sample1,FGA,21_13_3_0,48,BelowAT,,,,,0.013,, -Sample1,FGA,20_12_3_0,1750,real_allele,,,,,,, -Sample1,FGA,18_10_3_0,181,real_allele,,,,,,, -Sample1,FGA,17_9_3_0,15,BelowAT,,,,,0.004,, -Sample1,PENTA D,15_15,50,real_allele,,,,,,, -Sample1,PENTA D,13_13,1000,real_allele,,,,,,, -Sample1,PENTA E,7_7,505,real_allele,,,,,,,7.0 -Sample1,TH01,7_7,2197,real_allele,,,,,,, -Sample1,TH01,6_6,1632,real_allele,,,,,,, -Sample1,TH01,5_5,66,BelowAT,,,,,0.017,, -Sample1,TPOX,11_11,15,BelowAT,,,,,1.0,,11.0 +SampleID,Locus,CE_Allele,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter +Sample1,D4S2408,10.0,10_10_0,1022,real_allele,,,,,, +Sample1,D4S2408,9.0,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, +Sample1,D4S2408,8.0,8_8_0,1050,real_allele,,,,,, +Sample1,D8S1179,14.0,14_12_1_0,869,real_allele,,,,,, +Sample1,D8S1179,13.0,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 +Sample1,D8S1179,12.0,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 +Sample1,D9S1122,13.0,13_11,948,real_allele,,,,,, +Sample1,D9S1122,12.0,12_10,108,-1_stutter,13_11,,948.0,,,0.114 +Sample1,D9S1122,11.0,11_11,991,real_allele,,,,,, +Sample1,D9S1122,10.0,10_10,87,-1_stutter,11_11,,991.0,,,0.088 +Sample1,FGA,23.0,23_15_3_0,1436,real_allele,,,,,, +Sample1,FGA,22.0,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 +Sample1,FGA,21.0,21_13_3_0,48,BelowAT,,,,,0.013, +Sample1,FGA,20.0,20_12_3_0,1750,real_allele,,,,,, +Sample1,FGA,18.0,18_10_3_0,181,real_allele,,,,,, +Sample1,FGA,17.0,17_9_3_0,15,BelowAT,,,,,0.004, +Sample1,PENTA D,15.0,15_15,50,real_allele,,,,,, +Sample1,PENTA D,13.0,13_13,1000,real_allele,,,,,, +Sample1,PENTA E,7.0,7_7,505,real_allele,,,,,, +Sample1,TH01,7.0,7_7,2197,real_allele,,,,,, +Sample1,TH01,6.0,6_6,1632,real_allele,,,,,, +Sample1,TH01,5.0,5_5,66,BelowAT,,,,,0.017, +Sample1,TPOX,11.0,11_11,15,BelowAT,,,,,1.0, diff --git a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv index b83fce36..7531c6f3 100644 --- a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv +++ b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv @@ -1,28 +1,28 @@ Locus,CE Allele,Allele Seq,Reads -D4S2408,8,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 -D4S2408,9,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 -D4S2408,10,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 -D8S1179,12,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 -D8S1179,12,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 -D8S1179,13,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 -D8S1179,13,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 -D8S1179,14,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 -D8S1179,14,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 -D9S1122,10,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 -D9S1122,11,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 -D9S1122,12,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 -D9S1122,13,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 -FGA,17,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 -FGA,18,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 -FGA,20,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 -FGA,21,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 -FGA,22,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 -FGA,23,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 -PentaD,13,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 -PentaD,15,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 -PentaE,7,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 -TH01,5,AATGAATGAATGAATGAATG,66 -TH01,6,AATGAATGAATGAATGAATGAATG,1632 -TH01,7,AATGAATGAATGAATGAATGAATGAATG,2197 -TPOX,11,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 -vWA,16,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 +D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 +D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 +D4S2408,10.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 +D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 +D8S1179,12.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 +D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 +D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 +D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 +D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 +D9S1122,10.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 +D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 +D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 +D9S1122,13.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 +FGA,17.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 +FGA,18.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 +FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 +FGA,21.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 +FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 +FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 +PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 +PentaD,15.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 +PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 +TH01,5.0,AATGAATGAATGAATGAATG,66 +TH01,6.0,AATGAATGAATGAATGAATGAATG,1632 +TH01,7.0,AATGAATGAATGAATGAATGAATGAATG,2197 +TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 +vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 6002cdd5..1d47d73e 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -361,7 +361,6 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): n += 1 colors = {"Typed": "g", "Stutter": "b", "BelowAT": "r"} marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - # print(marker_df) ax = fig.add_subplot(6, 5, n) ax.bar( marker_df["CE_Allele"], @@ -386,7 +385,7 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): if sameyaxis: title = "Marker Plots for All Alleles With Same Y-Axis Scale" elif filters: - title = "Marker Plots for True Alleles With Custom Y-Axis Scale" + title = "Marker Plots for Typed Alleles With Custom Y-Axis Scale" else: title = "Marker Plots for All Alleles With Custom Y-Axis Scale" plt.text(0.4, 0.95, title, transform=fig.transFigure, size=24) From f0873dce325b3962a65bc52e873afe66e50bfd0e Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 09:54:28 -0500 Subject: [PATCH 06/10] removed hashed code --- lusSTR/scripts/filter_settings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index f3f60461..859ce736 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -531,6 +531,4 @@ def same_size_filter(df, metadata, datatype): else: final_df = pd.concat([final_df, df_filt]) final_df = final_df.reset_index(drop=True) - # if datatype == "lusplus": - # final_df = final_df.drop("CE_Allele", axis=1) return final_df From 3a24edf6b8547f2fa6a980ca8e3b5cb21c9a534a Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 11:48:32 -0500 Subject: [PATCH 07/10] updated convert --- lusSTR/wrappers/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index fa8db047..beb950db 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -99,7 +99,7 @@ def format_table(input, software, kit="forenseq"): flanks_list.append(flank_summary) continue marker = STRMarkerObject(locus, sequence, software, kit=kit) - if locus == "D12S391" and kit == "powerseq": + if locus == "D12S391" and kit == "powerseq" and software == "straitrazor": if "." in str(marker.canonical): check_sr += 1 if check_sr > 10: From 922a539c3ccdfb1052395d532a6ba98516735e85 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 13:06:18 -0500 Subject: [PATCH 08/10] fixed copy warning --- lusSTR/wrappers/filter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 1d47d73e..57d394d9 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -341,16 +341,15 @@ def marker_plots(df, output_name): def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): - sample_df = df[df["SampleID"] == sample_id] + sample_df = df[df["SampleID"] == sample_id].copy() # sample_id = f"{id}_sexchr" if sex else id - sample_df["Type"] = sample_df["allele_type"] conditions = [ sample_df["allele_type"].str.contains("real"), sample_df["allele_type"].str.contains("BelowAT"), sample_df["allele_type"].str.contains("stutter"), ] values = ["Typed", "BelowAT", "Stutter"] - sample_df["Type"] = np.select(conditions, values) + sample_df.loc[:, "Type"] = np.select(conditions, values) max_reads = max(sample_df["Reads"]) n = 100 if max_reads > 1000 else 10 max_yvalue = int(math.ceil(max_reads / n)) * n From e534317314332d22381bcb055e3ac1ff2e5781ed Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 11 Jan 2024 15:08:16 -0500 Subject: [PATCH 09/10] added allele name to typed alleles plot --- lusSTR/wrappers/filter.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 57d394d9..cbcf8052 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -361,7 +361,7 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): colors = {"Typed": "g", "Stutter": "b", "BelowAT": "r"} marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") ax = fig.add_subplot(6, 5, n) - ax.bar( + p = ax.bar( marker_df["CE_Allele"], marker_df["Reads"], edgecolor="k", @@ -370,15 +370,25 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): if at: at = get_at(marker_df, marker) ax.axhline(at, linestyle="--", color="k") - ax.text(min(marker_df["CE_Allele"]) - 0.9, at + (at * 0.1), f"AT", size=12) + ax.text(round(min(marker_df["CE_Allele"])) - 0.9, at + (at * 0.1), f"AT", size=12) labels = marker_df["Type"].unique() handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] if not filters: plt.legend(handles, labels, title="Allele Type") + else: + for i, row in marker_df.iterrows(): + marker_df.loc[i, "Label"] = ( + str(int(marker_df.loc[i, "CE_Allele"])) + if ".0" in str(marker_df.loc[i, "CE_Allele"]) + else str(marker_df.loc[i, "CE_Allele"]) + ) + ax.bar_label(p, labels=marker_df["Label"]) if sameyaxis: ax.set_yticks(np.arange(0, max_yvalue, increase_value)) ax.set_xticks( - np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0) + np.arange( + round(min(marker_df["CE_Allele"]) - 1), round(max(marker_df["CE_Allele"])) + 2, 1.0 + ) ) ax.title.set_text(marker) if sameyaxis: From 3d478ac8bd2e8bdbdbb8e216f01d5ea4e3ff9e60 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 12 Jan 2024 06:17:46 -0500 Subject: [PATCH 10/10] updated empty bf code --- lusSTR/scripts/repeat.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lusSTR/scripts/repeat.py b/lusSTR/scripts/repeat.py index 76ff49c4..bd34789b 100644 --- a/lusSTR/scripts/repeat.py +++ b/lusSTR/scripts/repeat.py @@ -163,16 +163,17 @@ def repeat_copy_number(bf, repeat): The input is a sequence string collapsed to bracketed sequence form. """ longest = 0 - if bf != "": - for block in bf.split(" "): - if block == repeat: - if 1 > longest: - longest = 1 - match = re.match(r"\[" + repeat + r"\](\d+)", block) - if match: - length = int(match.group(1)) - if length > longest: - longest = length + if bf == "": + return 0 + for block in bf.split(" "): + if block == repeat: + if 1 > longest: + longest = 1 + match = re.match(r"\[" + repeat + r"\](\d+)", block) + if match: + length = int(match.group(1)) + if length > longest: + longest = length return str(longest)