From d8fedfb5342e6b017b068e744d653059ed9820e4 Mon Sep 17 00:00:00 2001
From: rnmitchell
Date: Mon, 8 Apr 2024 15:09:17 -0400
Subject: [PATCH] changing the way snp binned files are named

---
 lusSTR/data/snp_config.yaml     |  6 ++++--
 lusSTR/tests/test_snps.py       |  2 +-
 lusSTR/wrappers/snps_convert.py | 35 ++++++++++++++++++++++++++++-----
 3 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/lusSTR/data/snp_config.yaml b/lusSTR/data/snp_config.yaml
index 38ed3c1..5f5879a 100644
--- a/lusSTR/data/snp_config.yaml
+++ b/lusSTR/data/snp_config.yaml
@@ -12,7 +12,9 @@ types: "all" ## choices are "all", "i" (identity SNPs only), "p" (phenotype only
 nofilter: False ## True/False if no filtering is desired; if False, will remove any allele designated as Not Typed
 
 ## convert settings
-strand: "forward" ## forward/uas; indicates which oritentation to report the alleles for the ForenSeq SNPs; uas indicates the orientation as reported by the UAS or the forward strand
+strand: "forward" ## forward/uas; strand orientation to report
 references: "" ## list IDs of the samples to be run as references in EFM
 separate: false ## True/False; if want to separate samples into individual files for use in EFM
-thresh: 0.03 ## Analytical threshold value
\ No newline at end of file
+thresh: 0.03 ## Analytical threshold value
+
+
diff --git a/lusSTR/tests/test_snps.py b/lusSTR/tests/test_snps.py
index 3d977d9..f8578f3 100644
--- a/lusSTR/tests/test_snps.py
+++ b/lusSTR/tests/test_snps.py
@@ -239,7 +239,7 @@ def test_snp_bins(tmp_path):
         print(fh.read(), end="")
     assert filecmp.cmp(exp_out, obs_out) is True
     for snp_set in range(0, 10):
-        path = tmp_path / f"evidence_samples/Kin_pos_1ng_snp_evidence_set{snp_set}.csv"
+        path = tmp_path / f"evidence_samples/Kin_pos_1ng_set{snp_set}.csv"
         assert path.is_file()
 
 
diff --git a/lusSTR/wrappers/snps_convert.py b/lusSTR/wrappers/snps_convert.py
index f55e52e..9776fc7 100644
--- a/lusSTR/wrappers/snps_convert.py
+++ b/lusSTR/wrappers/snps_convert.py
@@ -37,9 +37,7 @@ def create_output_table(sample_df, orientation, separate, output_type, software)
         allele_col = "Forward_Strand_Allele"
     all_samples_df = pd.DataFrame()
     for sample in sample_df["SampleID"].unique():
-        indiv_df = sample_df[
-            (sample_df["SampleID"] == sample) & (sample_df["Issues"] != "Contains untyped allele")
-        ]
+        indiv_df = sample_df[sample_df["SampleID"] == sample]
         compiled_table = create_sample_df(indiv_df, output_type, allele_col)
         if software != "uas":
             compiled_table = check_allele_calls(compiled_table, output_type)
@@ -80,7 +78,7 @@ def bin_snps(sample_file, output_type, sample):
         bin_df["Sample Name"] = bin_df["Sample Name"] + "_set" + str(snp_num)
         compiled_table = pd.concat([compiled_table, bin_df])
         bin_df.to_csv(
-            f"{output_type}_samples/{sample}_snp_{output_type}_set{snp_num}.csv",
+            f"{output_type}_samples/{sample}_set{snp_num}.csv",
             index=False,
             sep="\t",
         )
@@ -97,9 +95,36 @@ def create_sample_df(indiv_df, output_type, all_col):
     try:
         compiled_table.columns = ["Marker", "Allele 1", "Allele 2", "Height 1", "Height 2"]
     except ValueError:
-        print("Too many alleles!")
+        try:
+            compiled_table.columns = [
+                "Marker",
+                "Allele 1",
+                "Allele 2",
+                "Allele 3",
+                "Height 1",
+                "Height 2",
+                "Height 3",
+            ]
+        except ValueError:
+            compiled_table.columns = [
+                "Marker",
+                "Allele 1",
+                "Allele 2",
+                "Allele 3",
+                "Allele 4",
+                "Height 1",
+                "Height 2",
+                "Height 3",
+                "Height 4",
+            ]
+            if len(compiled_table[compiled_table["Allele 4"].notna()]) > 0:
+                compiled_table = compiled_table.drop(compiled_table.columns[[4, 8]], axis=1)
+        if len(compiled_table[compiled_table["Allele 3"].notna()]) > 0:
+            print(compiled_table)
+            compiled_table = compiled_table.drop(compiled_table.columns[[3, 6]], axis=1)
     if output_type == "reference":
         for i, row in compiled_table.iterrows():
+            print(compiled_table.loc[i, "Height 2"])
             if pd.isnull(compiled_table.loc[i, "Height 2"]):
                 compiled_table.loc[i, "Allele 2"] = compiled_table.loc[i, "Allele 1"]
             compiled_table = compiled_table[["Marker", "Allele 1", "Allele 2"]]
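
For context beyond the diff itself: after this change, bin_snps writes each binned SNP set to
{output_type}_samples/{sample}_set{snp_num}.csv rather than the longer
{sample}_snp_{output_type}_set{snp_num}.csv name, and the updated test checks for exactly that
pattern. Below is a minimal sketch of the new naming, not part of the patch; the sample ID is
taken from the updated test_snp_bins test, while the output type and number of sets are
illustrative assumptions.

    # Sketch only: illustrates the binned-file naming this patch adopts.
    from pathlib import Path

    output_type = "evidence"   # hypothetical; "reference" is the other output type in the config
    sample = "Kin_pos_1ng"     # sample ID used in the updated test_snp_bins test
    outdir = Path(f"{output_type}_samples")
    outdir.mkdir(exist_ok=True)

    for snp_num in range(10):  # test_snp_bins expects sets 0 through 9
        # old name: f"{sample}_snp_{output_type}_set{snp_num}.csv"
        path = outdir / f"{sample}_set{snp_num}.csv"
        print(path)            # e.g. evidence_samples/Kin_pos_1ng_set0.csv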