From 3921f6957bfce80acb7be1e28808819b52ece1e1 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 28 Jun 2024 14:46:51 -0400 Subject: [PATCH] began adding in custom ranges for Y-STRs --- lusSTR/data/str_markers.json | 36 +++++++- lusSTR/scripts/marker.py | 22 ++++- lusSTR/tests/test_marker.py | 161 +++++++++++++++++++++++++++++++++++ lusSTR/wrappers/convert.py | 34 +++++--- 4 files changed, 235 insertions(+), 18 deletions(-) diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index a3ddbc3..38c613d 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -571,6 +571,7 @@ "TCTA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -579,7 +580,7 @@ "Power_5": 0, "Power_3": 113, "Custom_5": 0, - "Custom_3": -35, + "Custom_3": -39, "Alleles": ["10", "11", "12", "13"] }, "DYS643": { @@ -589,6 +590,7 @@ "CTTTT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 5, "LUS": "CTTTT", "Sec": "", "Tert": "", @@ -608,6 +610,7 @@ "TAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -626,6 +629,7 @@ "TCT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 3, "LUS": "TCT", "Sec": "TCT", "Tert": "", @@ -643,6 +647,7 @@ "AAAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAAG", "Sec": "", "Tert": "", @@ -661,6 +666,7 @@ "TTTC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TTTC", "Sec": "", "Tert": "", @@ -679,6 +685,7 @@ "GATA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GATA", "Sec": "", "Tert": "", @@ -697,6 +704,7 @@ "TATC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TATC", "Sec": "", "Tert": "", @@ -715,6 +723,7 @@ "ATAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "ATAG", "Sec": "", "Tert": "", @@ -729,6 +738,7 @@ "TCCT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCCT", "Sec": "", "Tert": "", @@ -743,6 +753,7 @@ "CTT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 3, "LUS": "CTT", "Sec": "", "Tert": "", @@ -761,6 +772,7 @@ "CTAT" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "CTAT", "Sec": "", "Tert": "", @@ -775,6 +787,7 @@ "GAAA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GAAA", "Sec": "", "Tert": "", @@ -791,6 +804,7 @@ "AGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AGAT", "Sec": "", "Tert": "", @@ -807,6 +821,7 @@ "AGAGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 6, "LUS": "AGAGAT", "Sec": "AGAGAT", "Tert": "", @@ -825,6 +840,7 @@ "GATA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "GATA", "Sec": "", "Tert": "", @@ -843,6 +859,7 @@ "TTTTC" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 5, "LUS": "TTTTC", "Sec": "", "Tert": "", @@ -861,6 +878,7 @@ "TCTA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -879,6 +897,7 @@ "AGAT" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AGAT", "Sec": "", "Tert": "", @@ -895,6 +914,7 @@ "ATA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 3, "LUS": "ATA", "Sec": "", "Tert": "", @@ -913,6 +933,7 @@ "TCTA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -933,6 +954,7 @@ "GAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "CAGA", "Tert": "GAGA", @@ -952,6 +974,7 @@ "CAGG" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "TAGA", "Tert": "CAGG", @@ -970,6 +993,7 @@ "TAGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -988,6 +1012,7 @@ "TTTC" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TTTC", "Sec": "", "Tert": "", @@ -1006,6 +1031,7 @@ "TCTA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TCTA", "Sec": "", "Tert": "", @@ -1025,6 +1051,7 @@ "GAAG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAAG", "Sec": "GAAG", "Tert": "", @@ -1039,6 +1066,7 @@ "ATCT" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "ATCT", "Sec": "", "Tert": "", @@ -1053,6 +1081,7 @@ "ATAG" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "ATAG", "Sec": "", "Tert": "", @@ -1067,6 +1096,7 @@ "TGGA" ], "ReverseCompNeeded": "Yes", + "NumBasesToSeparate": 4, "LUS": "TGGA", "Sec": "", "Tert": "", @@ -1081,6 +1111,7 @@ "TAGA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "", "Tert": "", @@ -1096,6 +1127,7 @@ "AAGG" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAGA", "Sec": "AAGG", "Tert": "", @@ -1116,6 +1148,7 @@ "AAAA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "AAGA", "Sec": "AAAA", "Tert": "", @@ -1133,6 +1166,7 @@ "TAGA" ], "ReverseCompNeeded": "No", + "NumBasesToSeparate": 4, "LUS": "TAGA", "Sec": "TAGA", "Tert": "", diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 90ef167..2205789 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -1411,10 +1411,13 @@ def canonical(self): def convert(self): sequence = self.forward_sequence if self.kit == "powerseq": - final_seq = ( - f"{collapse_repeats_by_length_flanks(sequence[:6], 4)} " - f"{collapse_repeats_by_length(sequence[6:], 4)}" - ) + if len(sequence) < 6: + final_seq = sequence + else: + final_seq = ( + f"{collapse_repeats_by_length_flanks(sequence[:6], 4)} " + f"{collapse_repeats_by_length(sequence[6:], 4)}" + ) elif len(sequence) % 4 != 0: final_seq = sequence_to_bracketed_form(sequence, self.repeat_size, self.repeats) else: @@ -1446,6 +1449,17 @@ def convert(self): ) return final_string + @property + def custom_brack(self): + if self.custom: + sequence = self.custom_sequence + final_string = ( + f"{collapse_repeats_by_length(sequence[:14], 4)} " + f"{collapse_repeats_by_length(sequence[14:], 4)}" + ) + return final_string + return None + class STRMarker_HPRTB(STRMarker): @property diff --git a/lusSTR/tests/test_marker.py b/lusSTR/tests/test_marker.py index 6ce2be4..926cc7c 100644 --- a/lusSTR/tests/test_marker.py +++ b/lusSTR/tests/test_marker.py @@ -1652,6 +1652,167 @@ def test_new_power_config(locus, sequence, bracketed, conc, lus, sec, tert, flan "CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCT", "CTTC CT [ATCT]10 A ATCT", ), + ( + "Y-GATA-H4", + "TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTACCTACCTACCTATCTATCTATAG" + "ATCTATCTATCTATCTTAAATTTGGAAATTCTCCTCAGCATAACATTTTAATGATGATTCCTAGGATACAAGTGATGTGCTGAA" + "AGTATCAATGTGTATCAGAAAACCAACATCTCTGCTTAGGTCTCT", + "TCTATCTATCTATCTATCTATCTATCTATCTATCT" "ATCTATCTATCTA", + "[TCTA]12", + ), + ( + "DYS643", + "ACCTCATGCTCTGTGATTTTTGCAGGTGTTCACTGCAAGCCATGCCTGGTTAAACTACTGTGCCTTTTCTTTTC" + "TTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTTTAAAACTTTTTACTTCAGTAGAATTTTGGGGGG", + "GTGCCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTCTTTT", + "GTGC [CTTTT]10 CTTT CTTTT", + ), + ( + "DYS635", + "CCCAAATATCCATCAATCAATGAATGGATAAAGAAAATGTGATAGATAGATAGATAGATAGATAGATAGATAGA" + "TAGATAGATACATACATAGATAGATACATACATAGATAGATACATACATAGATAGATAGATAGAGATT", + "ATGTGATAGATA" + "GATAGATAGATAGATAGATAGATAGATAGATAGATACATACATAGATAGATACATACATAGATAGATACATACATAGATAGATA" + "GATAGAGATT", + "ATGT GA [TAGA]10 [TACA]2 [TAGA]2 [TACA]2 [TAGA]2 [TACA]2 [TAGA]4 GATT", + ), + ( + "DYS576", + "AAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAA" + "GCCAAGACAAATACGCTTATTACTCCCATCTCCTCCTTCATCTCCAGGAAATGAGAC", + "AAAGAAAGAAAGAAAGAAAGAAA" "GAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAG", + "[AAAG]17", + ), + ( + "DYS570", + "TAAAATGAATGATGACTAGGTAGAAATCCTGGCTGTGTCCTCCAAGTTCCTTTTCTTTCTTTCTTTCTTTCTTT" + "CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTT", + "TTTCTTTCTTTCTTTCTTTCTTTCTTTCTT" "TCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC", + "[TTTC]17", + ), + ( + "DYS549", + "GTAAAGAACTATAAAAAGATTAATACAACAAAAATTTGGTAATCTGAAATAATAAGGTAGACATAGCAATTAGG" + "TAGGTAAAGAGGAAGATGATAGATGATTAGAAAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA" + "GATAGAAAAAATC", + "AGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAAA", + "AGAT [GATA]13 GAAA AA", + ), + ( + "DYS533", + "CTAATATTTATCTATATCATTCTAATTATGTCTCTTCTAACTATATAACTATGTATTATCTATCAATCTTCTAC" + "CTATCATCTTTCTAGCTAGCTATCATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCATCTATCAT" + "CTTCTATTGTTTGGTTGAGTTAAGAACTGATCATGAATAAATACATTTCATTGGT", + "TATCATCTATCTATCTATCTATCTA" "TCTATCTATCTATCTATCTATCTATCTATCATCT", + "TATC ATC [TATC]12 ATCT", + ), + ( + "DYS481", + "TAAAAGGAATGTGGCTAACGCTGTTCAGCATGCTGCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTT" + "CTTCTTCTTCTTCTTCTTCTTCTTCTTTTTTGAGTCTTG", + "CTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCTTCT" "TCTTCTTCTTCTTCTTCTTCTTCTT", + "[CTT]22", + ), + ( + "DYS458", + "GAAAGAAAGAAAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA" + "GAAAGAAAGAAAGGAGGGTGGGCGTGGTGGCTCATGCTTGTAATGCCAGAACTTTGGGAGGCCGAGGTGG", + "GAAAGAAAGA" + "AAAGGAAGGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAA", + "[GAAA]3 AG GAAG [GAAA]17", + ), + ( + "DYS456", + "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATTCCATTAGTTCT" + "GTCCCTCTAGAGAACCCTAATACATCAGTTTAAGAA", + "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT" "AGATAGATAGATAGATATTCC", + "[AGAT]15 ATTC C", + ), + ( + "DYS448", + "AGACATGGATAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAG" + "AGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAG" + "AGATAGAGATAGAGAGGTAAAGATAGA", + "AGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGATAGAGA" + "TAGAGATAGAGATATAGAGATAGAGAGATAGAGATAGAGATAGATAGATAGAGAAAGAGATAGAGATAGAGATAGAGATAGAGA" + "TAGAGATAGAGATAGAGAT", + "[AGAGAT]11 [ATAGAG]2 [AGATAG]3 ATAGAT AGAGAA [AGAGAT]8", + ), + ( + "DYS439", + "AAATAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAGTATAAGTAA" + "AGAGATGATGG", + "TAGAAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAAAGT", + "TAGA A [GATA]13 GAAA GT", + ), + ( + "DYS438", + "CAGTATATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTATTTGA" + "AATGGAGTTTCACTCTTGTTGCCCAGG", + "TATATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTT" "CTTTTCTTTTCTATTT", + "TATA [TTTTC]12 TATTT", + ), + ( + "DYS437", + "GCCCATCCGGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTATCAT" + "CTATCATCTGTGAATGATGTCTATCTACTTATCTATGAATGATATTTATCTGTGGTTATCTATCTATCTATA", + "CCGGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGTCTGTCTATCTATCTATCTATCATCT", + "CCGG [TCTA]9 [TCTG]2 [TCTA]4 TCAT CT", + ), + ( + "DYS393", + "CGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATATGTATGTCTTTTCTATGAGAC" "ATA", + "CGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", + "CGAT [AGAT]12", + ), + ( + "DYS392", + "TAAATAATAATAATAATAATAATAATAATAATAATAATAATAAATAAATGGTGATACAAGAAAAAAATTTGTTT" + "TCCTTCTTGGCTTTTAAATAACAAACACTTGAAATCAAATTAG", + "ATAATAATAATAATAATAATAATAATAATAATAATAA" "TA", + "[ATA]13", + ), + ( + "DYS391", + "TGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTGCCTATCTGCCTGCCTACCTA" + "TCCCTCTATGGCAATTGCTTGCAACCAGGGAGATTTTA", + "TCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATC" "TATCTATCTGCCTATC", + "TCTG [TCTA]11 TCTG CCTA TC", + ), + ( + "DYS390", + "AACAAGGAAAGATAGATAGATGATAGATAGATAGATAGACAGATAGATAGATAGATAGATAGATAGATAGATAG" + "ATAGATAGACAGACAGACAGACAGACAGACAGACAGACAGATAGATAGAATATATTATGGGGTACCAAAATGCAGGGCCCAAAA" + "ATGTGTAAAATATATGTGT", + "TAGATAGATAGATAGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAC" + "AGACAGACAGACAGACAGACAGACAGACAGA", + "[TAGA]4 CAGA [TAGA]10 [CAGA]8", + ), + ( + "DYS389II", + "TCATAGATAGATGATGGACTGCTAGATAAATAGATAGATTGATAGAGGGAGGGATAGATAGATAGATAGATA" + "GATAGATAGATAGATAGATAGATAGACAGACAGACAGATACATAGATAATACAGATGAGAGTTGGATACAGAAGTAGGTATAAT" + "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACA", + "TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATACATAGATAATACAGATGAGAGTTGGA" + "TACAGAAGTAGGTATAATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGA", + "[TAGA]11 [CAGA]3 TACA TAGA TAAT ACAG ATGA GAGT TGGA TACA GAAG TAGG TATA ATGA [TAGA]11" + " [CAGA]5", + ), + ( + "DYS385A-B", + "TTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCCCTTCCTTCCTTCCT" + "TCCTTCCTTTCTTTCTCTTTCCTCTTTCTCTTTCTTCTCTTTCTTTCTTTTTCTCTTTTTCTCTTTCTTTCTTTTTTACTTTCT" + "TTCTCCTTCCTTCCTTCCTTTCTGAATTTCATTTCTTTTCTTT", + "TTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCT" "TTCTTTCTTTC", + "[TTTC]12", + ), + ( + "DYS19", + "TCTGGGTTAAGGAGAGTGTCACTATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTACCTAT" + "CTATCTATCTAAAACACTATATATATATAACACTATATATATAATACTATATATATATTA", + "TCTATCTATCTATCTATCTA" "TCTATCTATCTATCTATCTATCTACCTATCTATCTATCTA", + "[TCTA]11 CCTA [TCTA]3", + ), ], ) def test_custom_ranges(locus, sequence, cust_seq, bracketed): diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 34cbc6a..c499a38 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -237,16 +237,30 @@ def remove_columns(column_list, remove_list): return column_list +def create_custom_outputtable(columns, table): + remove_list = [ + "UAS_Output_Sequence", + "Forward_Strand_Sequence", + "UAS_Output_Bracketed_Notation", + "Forward_Strand_Bracketed_Notation", + ] + custom_columns = remove_columns(columns, remove_list) + custom_table = table[custom_columns] + custom_table_comb = combine_reads(custom_table, custom_columns) + return custom_table_comb + + def main(input, out, kit, software, sex, nocombine, custom): input = str(input) out = str(out) output_name = os.path.splitext(out)[0] input_name = os.path.splitext(input)[0] + full_table_name = re.sub(r"_custom_range", "", output_name) autosomal_final_table, autosomal_flank_table, columns = format_table( input, software, kit, custom ) if sex: - sex_final_table, sex_flank_table, columns = format_table( + sex_final_table, sex_flank_table, sex_columns = format_table( f"{input_name}_sexloci.csv", software, kit, custom ) if software != "uas": @@ -256,8 +270,11 @@ def main(input, out, kit, software, sex, nocombine, custom): sex_final_table.to_csv( f"{output_name}_sexloci_no_combined_reads.txt", index=False ) - sex_final_table = combine_reads(sex_final_table, columns) - sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) + sex_final_table = combine_reads(sex_final_table, sex_columns) + sex_final_table.to_csv(f"{full_table_name}_sexloci.txt", sep="\t", index=False) + if custom: + sex_table_custom = create_custom_outputtable(sex_columns, sex_final_table) + sex_table_custom.to_csv(f"{output_name}.txt", index=False, sep="\t") else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) if software != "uas": @@ -268,18 +285,9 @@ def main(input, out, kit, software, sex, nocombine, custom): f"{output_name}_no_combined_reads.txt", sep="\t", index=False ) autosomal_final_table = combine_reads(autosomal_final_table, columns) - full_table_name = re.sub(r"_custom_range", "", output_name) autosomal_final_table.to_csv(f"{full_table_name}.txt", sep="\t", index=False) if custom: - remove_list = [ - "UAS_Output_Sequence", - "Forward_Strand_Sequence", - "UAS_Output_Bracketed_Notation", - "Forward_Strand_Bracketed_Notation", - ] - custom_columns = remove_columns(columns, remove_list) - custom_table = autosomal_final_table[custom_columns] - custom_table_comb = combine_reads(custom_table, custom_columns) + custom_table_comb = create_custom_outputtable(columns, autosomal_final_table) custom_table_comb.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False)