Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create Marker Plots #61

Merged
merged 11 commits into from
Nov 6, 2023
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,12 @@ Further, a second table (labeled as ```*_flanks.txt```) containing information r
* 3' Flanking Sequence Bracketed Notation
* Potential Issues (such as: Possible indel or partial sequence)

The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for it's authenticity.
The ```Potential Issues``` column in this report is to draw attention to potential problem sequences (due to perhaps an indel or partial sequence) and requires the attention of the user to further evaluate the sequence for it's authenticity.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: "for its authenticity".


Individual marker plots displaying reads vs. CE allele are created in PDF format. They are stored in the ```MarkerPlots/``` directory of the working directory:
* The first page of the PDF file contains the marker plots using a custom y-axis scale (i.e. the y-axis is scaled for each individual marker).
* The second page of the PDF file contains the marker plots using the same (i.e. the maximum) y-axis scale.
* A separate PDF file for the X- & Y-STRs are created, if specified.

---

Expand Down
20 changes: 20 additions & 0 deletions lusSTR/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,23 @@ def test_snakemake(command, output, format_out, convert_out, all_out, tmp_path):
assert os.path.exists(obs_convert_output) is convert_out
assert os.path.exists(obs_all_output) is all_out
assert filecmp.cmp(exp_output, obs_output) is True


@pytest.mark.parametrize(
"sex",
[True, False],
)
def test_marker_plots(sex, tmp_path):
inputfile = data_file("UAS_bulk_input/Positive Control Sample Details Report 2315.xlsx")
exp_output = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_marker_plots.pdf")
sex_exp = str(tmp_path / "MarkerPlots/lusstr_output_Positive_Control_sexchr_marker_plots.pdf")
if sex:
arglist = ["config", "-w", str(tmp_path), "--input", inputfile, "--sex"]
else:
arglist = ["config", "-w", str(tmp_path), "--input", inputfile]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(arglist))
snakemake_arglist = ["strs", "convert", "-w", str(tmp_path)]
lusSTR.cli.main(lusSTR.cli.get_parser().parse_args(snakemake_arglist))
assert os.path.exists(exp_output) is True
if sex:
assert os.path.exists(sex_exp) is True
49 changes: 49 additions & 0 deletions lusSTR/wrappers/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

import csv
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
Expand All @@ -20,6 +23,8 @@
from lusSTR.scripts.repeat import collapse_all_repeats, collapse_repeats_by_length
from lusSTR.scripts.repeat import sequence_to_bracketed_form, split_by_n
from lusSTR.scripts.repeat import reverse_complement, reverse_complement_bracketed
from matplotlib.backends.backend_pdf import PdfPages
from pathlib import Path


with open(get_str_metadata_file(), "r") as fh:
Expand Down Expand Up @@ -170,6 +175,48 @@ def sort_table(table):
return sorted_table


def marker_plots(df, output_name, sex):
Path("MarkerPlots").mkdir(parents=True, exist_ok=True)
df["CE_Allele"] = df["CE_Allele"].astype(float)
for id in df["SampleID"].unique():
sample_id = f"{id}_sexchr" if sex else id
with PdfPages(f"MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf:
make_plot(df, id, sex, False)
pdf.savefig()
make_plot(df, id, sex, True)
pdf.savefig()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That way, when someone is messing with the code 18 months from now and they're trying to remember what that dang 4th argument to the make_plot function is, passing the argument as a key/value pair will provide a great contextual reminder.

make_plot(df, id, sex, sameyaxis=False)



def make_plot(df, id, sex, sameyaxis):
sample_df = df[df["SampleID"] == id]
sample_id = f"{id}_sexchr" if sex else id
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion for a marginal improvement in readability: make the sameyaxis argument a keyword argument by giving it a default value.

max_reads = max(sample_df["Reads"])
n = 100 if max_reads > 1000 else 10
max_yvalue = int(math.ceil(max_reads / n)) * n
increase_value = int(math.ceil((max_yvalue / 5)) / n) * n
Comment on lines +194 to +196
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is to make nice round values (10, 20, 100, 200, etc.) when determining the max y value of the dataset and what the incremental increase should be for the plots. The first line determines whether to round to 10s or 100s depending on the highest # of reads.

fig = plt.figure(figsize=(31, 31)) if sex is True else plt.figure(figsize=(30, 30))
n = 0
for marker in sample_df["Locus"].unique():
n += 1
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
ax = fig.add_subplot(6, 6, n) if sex is True else fig.add_subplot(6, 5, n)
ax.bar(marker_df["CE_Allele"], marker_df["Reads"])
if sameyaxis:
ax.set_yticks(np.arange(0, max_yvalue, increase_value))
ax.set_xticks(
np.arange(min(marker_df["CE_Allele"]) - 1, max(marker_df["CE_Allele"]) + 2, 1.0)
)
ax.title.set_text(marker)
if sameyaxis:
plt.text(
0.4, 0.95, "Marker Plots With Same Y-Axis Scale", transform=fig.transFigure, size=24
)
else:
plt.text(
0.4, 0.95, "Marker Plots With Custom Y-Axis Scale", transform=fig.transFigure, size=24
)


def main(input, out, kit, uas, sex, nocombine):
input = str(input)
out = str(out)
Expand All @@ -191,6 +238,7 @@ def main(input, out, kit, uas, sex, nocombine):
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
else:
sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False)
marker_plots(sex_final_table, output_name, True)
if not uas:
if not autosomal_final_table.empty:
autosomal_flank_table.to_csv(f"{output_name}_flanks.txt", sep="\t", index=False)
Expand All @@ -202,6 +250,7 @@ def main(input, out, kit, uas, sex, nocombine):
autosomal_final_table.to_csv(out, sep="\t", index=False)
else:
autosomal_final_table.to_csv(out, sep="\t", index=False)
marker_plots(autosomal_final_table, output_name, False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar suggestion with the sex argument here.

marker_plots(autosomal_final_table, output_name, sex_chr=False)



if __name__ == "__main__":
Expand Down
8 changes: 7 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@
]
},
include_package_data=True,
install_requires=["pandas>=1.0,<2.0", "openpyxl>=3.0.6", "snakemake>=7.22.0", "pyyaml>=6.0"],
install_requires=[
"pandas>=1.0,<2.0",
"openpyxl>=3.0.6",
"snakemake>=7.22.0",
"pyyaml>=6.0",
"matplotlib>=3.5.3",
],
entry_points={"console_scripts": ["lusstr = lusSTR.cli:main"]},
scripts=glob.glob("lusSTR/scripts/*"),
classifiers=[
Expand Down
Loading