diff --git a/README.md b/README.md index cd2284d9..61ad546c 100755 --- a/README.md +++ b/README.md @@ -165,7 +165,12 @@ Further, a second table (labeled as ```*_flanks.txt```) containing information r * 3' Flanking Sequence Bracketed Notation * Potential Issues (such as: Possible indel or partial sequence) -The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for it's authenticity. +The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for its authenticity. + +Individual marker plots displaying reads vs. CE allele are created in PDF format. They are stored in the ```MarkerPlots/``` directory of the working directory: +* The first page of the PDF file contains the marker plots using a custom y-axis scale (i.e. the y-axis is scaled for each individual marker). +* The second page of the PDF file contains the marker plots using the same (i.e. the maximum) y-axis scale. +* A separate PDF file for the X- & Y-STRs are created, if specified. --- diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py index e23bf485..8cfcef4c 100644 --- a/lusSTR/tests/test_suite.py +++ b/lusSTR/tests/test_suite.py @@ -287,3 +287,23 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path): assert os.path.exists(obs_convert_output) is convert_out assert os.path.exists(obs_all_output) is all_out assert filecmp.cmp(exp_output, obs_output) is True + + +@pytest.mark.parametrize( + "sex", + [True, False], +) +def test_marker_plots(sex, tmp_path): + inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx") + exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf") + sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf") + if sex: + arglist = ["config", "-w", str(tmp_path), "--input", inputfile, "--sex"] + else: + arglist = ["config", "-w", str(tmp_path), "--input", inputfile] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist)) + snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)] + lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist)) + assert os.path.exists(exp_output) is True + if sex: + assert os.path.exists(sex_exp) is True diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index e804fd17..7c02eb25 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -12,6 +12,9 @@ import csv import json +import math +import matplotlib.pyplot as plt +import numpy as np import os import pandas as pd import re @@ -20,6 +23,8 @@ from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed +from matplotlib.backends.backend_pdf import PdfPages +from pathlib import Path with open(get_str_metadata_file(), "r") as fh: @@ -170,6 +175,48 @@ def sort_table(table): return sorted_table +def marker_plots(df, output_name, sex=False): + Path("MarkerPlots").mkdir(parents=True, exist_ok=True) + df["CE_Allele"] = df["CE_Allele"].astype(float) + for id in df["SampleID"].unique(): + sample_id = f"{id}_sexchr" if sex else id + with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: + make_plot(df, id, sex, sameyaxis=False) + pdf.savefig() + make_plot(df, id, sex) + pdf.savefig() + + +def make_plot(df, id, sex=False, sameyaxis=True): + sample_df = df[df["SampleID"] == id] + sample_id = f"{id}_sexchr" if sex else id + max_reads = max(sample_df["Reads"]) + n = 100 if max_reads > 1000 else 10 + max_yvalue = int(math.ceil(max_reads / n)) * n + increase_value = int(math.ceil((max_yvalue / 5)) / n) * n + fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30)) + n = 0 + for marker in sample_df["Locus"].unique(): + n += 1 + marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n) + ax.bar(marker_df["CE_Allele"], marker_df["Reads"]) + if sameyaxis: + ax.set_yticks(np.arange(0, max_yvalue, increase_value)) + ax.set_xticks( + np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0) + ) + ax.title.set_text(marker) + if sameyaxis: + plt.text( + 0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24 + ) + else: + plt.text( + 0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24 + ) + + def main(input, out, kit, uas, sex, nocombine): input = str(input) out = str(out) @@ -191,6 +238,7 @@ def main(input, out, kit, uas, sex, nocombine): sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) + marker_plots(sex_final_table, output_name, sex=True) if not uas: if not autosomal_final_table.empty: autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False) @@ -202,6 +250,7 @@ def main(input, out, kit, uas, sex, nocombine): autosomal_final_table.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False) + marker_plots(autosomal_final_table, output_name) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 9deac591..de91f2a0 100755 --- a/setup.py +++ b/setup.py @@ -37,7 +37,13 @@ ] }, include_package_data=True, - install_requires=["pandas>=1.0,<2.0", "openpyxl>=3.0.6", "snakemake>=7.22.0", "pyyaml>=6.0"], + install_requires=[ + "pandas>=1.0,<2.0", + "openpyxl>=3.0.6", + "snakemake>=7.22.0", + "pyyaml>=6.0", + "matplotlib>=3.5.3", + ], entry_points={"console_scripts": ["lusstr = lusSTR.cli:main"]}, scripts=glob.glob("lusSTR/scripts/*"), classifiers=[