bioforensics · standage · Nov 6, 2023 · Oct 31, 2023 · Oct 31, 2023 · Oct 31, 2023
diff --git a/README.md b/README.md
@@ -165,7 +165,12 @@ Further, a second table (labeled as ```*_flanks.txt```) containing information r
 *  3' Flanking Sequence Bracketed Notation
 *  Potential Issues (such as: Possible indel or partial sequence)
 
-The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for it's authenticity.
+The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for it's authenticity.  
+
+Individual marker plots displaying reads vs. CE allele are created in PDF format. They are stored in the ```MarkerPlots/``` directory of the working directory:   
+* The first page of the PDF file contains the marker plots using a custom y-axis scale (i.e. the y-axis is scaled for each individual marker).  
+* The second page of the PDF file contains the marker plots using the same (i.e. the maximum) y-axis scale.  
+* A separate PDF file for the X- & Y-STRs are created, if specified.  
 
 ---
 

diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py
@@ -287,3 +287,23 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path):
     assert os.path.exists(obs_convert_output) is convert_out
     assert os.path.exists(obs_all_output) is all_out
     assert filecmp.cmp(exp_output, obs_output) is True
+
+
+@pytest.mark.parametrize(
+    "sex",
+    [True, False],
+)
+def test_marker_plots(sex, tmp_path):
+    inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx")
+    exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf")
+    sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf")
+    if sex:
+        arglist = ["config", "-w", str(tmp_path), "--input", inputfile, "--sex"]
+    else:
+        arglist = ["config", "-w", str(tmp_path), "--input", inputfile]
+    lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist))
+    snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)]
+    lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist))
+    assert os.path.exists(exp_output) is True
+    if sex:
+        assert os.path.exists(sex_exp) is True
diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py
@@ -12,6 +12,9 @@
 
 import csv
 import json
+import math
+import matplotlib.pyplot as plt
+import numpy as np
 import os
 import pandas as pd
 import re
@@ -20,6 +23,8 @@
 from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length
 from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n
 from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed
+from matplotlib.backends.backend_pdf import PdfPages
+from pathlib import Path
 
 
 with open(get_str_metadata_file(), "r") as fh:
@@ -170,6 +175,48 @@ def sort_table(table):
     return sorted_table
 
 
+def marker_plots(df, output_name, sex):
+    Path("MarkerPlots").mkdir(parents=True, exist_ok=True)
+    df["CE_Allele"] = df["CE_Allele"].astype(float)
+    for id in df["SampleID"].unique():
+        sample_id = f"{id}_sexchr" if sex else id
+        with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf:
+            make_plot(df, id, sex, False)
+            pdf.savefig()
+            make_plot(df, id, sex, True)
+            pdf.savefig()
+
+
+def make_plot(df, id, sex, sameyaxis):
+    sample_df = df[df["SampleID"] == id]
+    sample_id = f"{id}_sexchr" if sex else id
+    max_reads = max(sample_df["Reads"])
+    n = 100 if max_reads > 1000 else 10
+    max_yvalue = int(math.ceil(max_reads / n)) * n
+    increase_value = int(math.ceil((max_yvalue / 5)) / n) * n
+    fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30))
+    n = 0
+    for marker in sample_df["Locus"].unique():
+        n += 1
+        marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
+        ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n)
+        ax.bar(marker_df["CE_Allele"], marker_df["Reads"])
+        if sameyaxis:
+            ax.set_yticks(np.arange(0, max_yvalue, increase_value))
+        ax.set_xticks(
+            np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0)
+        )
+        ax.title.set_text(marker)
+    if sameyaxis:
+        plt.text(
+            0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24
+        )
+    else:
+        plt.text(
+            0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24
+        )
+
+
 def main(input, out, kit, uas, sex, nocombine):
     input = str(input)
     out = str(out)
@@ -191,6 +238,7 @@ def main(input, out, kit, uas, sex, nocombine):
                 sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
         else:
             sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
+        marker_plots(sex_final_table, output_name, True)
     if not uas:
         if not autosomal_final_table.empty:
             autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False)
@@ -202,6 +250,7 @@ def main(input, out, kit, uas, sex, nocombine):
             autosomal_final_table.to_csv(out, sep="\t", index=False)
     else:
         autosomal_final_table.to_csv(out, sep="\t", index=False)
+    marker_plots(autosomal_final_table, output_name, False)
 
 
 if __name__ == "__main__":

diff --git a/setup.py b/setup.py
@@ -37,7 +37,13 @@
         ]
     },
     include_package_data=True,
-    install_requires=["pandas>=1.0,<2.0", "openpyxl>=3.0.6", "snakemake>=7.22.0", "pyyaml>=6.0"],
+    install_requires=[
+        "pandas>=1.0,<2.0",
+        "openpyxl>=3.0.6",
+        "snakemake>=7.22.0",
+        "pyyaml>=6.0",
+        "matplotlib>=3.5.3",
+    ],
     entry_points={"console_scripts": ["lusstr = lusSTR.cli:main"]},
     scripts=glob.glob("lusSTR/scripts/*"),
     classifiers=[