From 4ad6fc68b2033fb553c2703ebce8272580ed9a86 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 5 Feb 2024 12:57:57 -0500 Subject: [PATCH 01/87] Update run_param_config.py Adding MSK/CMO Bait/Target set to table --- scripts/run_param_config.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 90799e3..8351ca9 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -651,6 +651,18 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, + "MSK-CH": { + BAITS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.baits", + TARGETS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.targets", + MSKQ: "no", + MD: "yes" + }, + "CMO-CH": { + BAITS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.baits", + TARGETS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.targets", + MSKQ: "no", + MD: "yes" + }, "HumanWholeGenome": { MSKQ: "no", MD: "yes", From 843424dd7be553b54291f8f1f1d9377574c13e8e Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 12 Feb 2024 16:31:40 -0500 Subject: [PATCH 02/87] Update LaunchMetrics.py increase the bin memory for ChIPSeq and HiC data, or for DNA recipes that generate a lot of mark dup data. --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 533e1b6..59a6ee7 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -167,7 +167,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) - launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) + launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 7309927708b52ee97043b2e8ef9e71bbc0d6c5d0 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 16 Feb 2024 09:58:34 -0500 Subject: [PATCH 03/87] send_json_data.sh added the send_json_data bash script to post cellranger GEX and VDJ data to RUN QC. the /home/igo/Scripts where the script was previously stored was deleted, therefore causing an error when trying to post GEX and VDJ data to RUN QC --- scripts/cellranger.py | 2 +- scripts/send_json_data.sh | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 scripts/send_json_data.sh diff --git a/scripts/cellranger.py b/scripts/cellranger.py index e2268f8..f4f1887 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -191,7 +191,7 @@ def create_json(send_json, sequencer_and_run, project, tag, work_area): with open(json_data_file, "w") as jfile: json.dump(send_json, jfile) - bsub_json = "bsub -J create_json___{} -o create_json___{}.log -w \"done({}*)\" sh /home/igo/Scripts/PicardScripts/send_json_data.sh {} {}".format(job_id, job_id, job_id, work_area, json_data_file) + bsub_json = "bsub -J create_json___{} -o create_json___{}.log -w \"done({}*)\" sh /igo/work/igo/igo-demux/scripts/send_json_data.sh {} {}".format(job_id, job_id, job_id, work_area, json_data_file) print(bsub_json) subprocess.run(bsub_json, shell = True) diff --git a/scripts/send_json_data.sh b/scripts/send_json_data.sh new file mode 100644 index 0000000..8fb3aa2 --- /dev/null +++ b/scripts/send_json_data.sh @@ -0,0 +1,18 @@ +#!/bin/bash + + +##Usage sh send_json_data.sh patth_to_json json_file +args=$@ + +path_to_json=$1 +json_file=$2 + +cd $path_to_json + +echo $path_to_json +json_data=$(cat $json_file) +echo $json_data + + +curl -d "$json_data" -H "Content-Type: application/json" -X POST "http://igodb.mskcc.org:8080/ngs-stats/saveCellRangerSample" + From f7e4d15c88fb218ae93a361d0f8e9d2d0c005e69 Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 28 Feb 2024 16:31:49 -0500 Subject: [PATCH 04/87] add json option for visium --- scripts/cellranger.py | 4 ++++ scripts/cellranger_spatial.py | 33 ++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index f4f1887..0778a12 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -332,6 +332,10 @@ def launch_cellranger(sample_sheet, sequencer_and_run): probe = config_dict[tag]["probe"][sample_genome_dict[sample]] cmd = cmd + " --probe-set={}".format(probe) + # if there is manual alignment json file availabe, add that to the cmd + if sample_info.json != "EMPTY": + cmd = cmd + " --loupe-alignment={}".format(sample_info.json) + bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, OPTIONS) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) diff --git a/scripts/cellranger_spatial.py b/scripts/cellranger_spatial.py index a6af142..70b27a3 100644 --- a/scripts/cellranger_spatial.py +++ b/scripts/cellranger_spatial.py @@ -22,8 +22,10 @@ def __init__(self, sample, project_id): self.chip_id = "EMPTY" self.preservation = "EMPTY" self.tiff_image = "EMPTY" + self.json = "EMPTY" self.get_info_from_LIMS() self.copy_tiff(project_id) + self.copy_json(project_id) def get_info_from_LIMS(self): response = requests.get(ENDPOINT + self.IGO_ID , auth = ("pms", "tiagostarbuckslightbike"), verify = False) @@ -42,12 +44,29 @@ def copy_tiff(self, project_id): if not os.path.exists(destination_loc): os.makedirs(destination_loc) - # copy all the image files using rsync? - original_tiff_image = glob.glob(source_loc_dir + "/" + self.sample_name + "*") - if len(original_tiff_image) != 1 or ".tif" not in original_tiff_image[0]: + # copy image file per sample + original_tiff_image = source_loc_dir + "/" + self.sample_name + ".tif" + if os.path.isfile(original_tiff_image): + shutil.copy(original_tiff_image, destination_file) + self.tiff_image = destination_file + print("copy {} to {}".format(original_tiff_image, destination_file)) + else: print("tif file is not in proper format for sample {}, please check".format(self.IGO_ID)) + + # copy json file if exists + def copy_json(self, project_id): + # project_id format as Project_12345 + source_loc = original_tiff_images_directory + project_id + "/json/" + self.sample_name + ".json" + destination_loc = tiff_images_directory + project_id + destination_file = destination_loc + "/" + self.sample_name + ".json" + + # create director if not exists + if not os.path.exists(destination_loc): + os.makedirs(destination_loc) + + if os.path.isfile(source_loc): + shutil.copy(source_loc, destination_file) + self.json = destination_file + print("copy {} to {}".format(source_loc, destination_file)) else: - shutil.copy(original_tiff_image[0], destination_file) - self.tiff_image = destination_file - print("copy {} to {}".format(original_tiff_image[0], destination_file)) - \ No newline at end of file + print("json file does not exist for {}".format(self.sample_name)) From 2fa413c2497150328a00d63876b152ccf7f0e28e Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 29 Feb 2024 09:16:33 -0500 Subject: [PATCH 05/87] Update cellranger.py --- scripts/cellranger.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index 0778a12..cffe807 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -426,6 +426,10 @@ def lanuch_by_project(project_directory, recipe, species): probe = config_dict[tag]["probe"][species] cmd = cmd + " --probe-set={}".format(probe) + # if there is manual alignment json file availabe, add that to the cmd + if sample_info.json != "EMPTY": + cmd = cmd + " --loupe-alignment={}".format(sample_info.json) + bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, OPTIONS) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) From 1725d7bc53b8301e95466f79152191c162025e2e Mon Sep 17 00:00:00 2001 From: luc <44953736+CuijieLu@users.noreply.github.com> Date: Thu, 29 Feb 2024 09:45:37 -0500 Subject: [PATCH 06/87] Update test_SampleSheet.py --- test_SampleSheet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_SampleSheet.py b/test_SampleSheet.py index d1a2af7..f6a40b9 100644 --- a/test_SampleSheet.py +++ b/test_SampleSheet.py @@ -2,7 +2,7 @@ import pytest def test_mixed_10X_barcodes(): - x = SampleSheet("test/MICHELLE_0543_10X_MIXED.csv") + x = SampleSheet("./test/MICHELLE_0543_10X_MIXED.csv") ss_list = x.split_sample_sheet() if "OverrideCycles" in ss_list[1].df_ss_header.astype(str): @@ -10,7 +10,7 @@ def test_mixed_10X_barcodes(): print(ss_list[2].df_ss_header) def test_only_10XSI_barcodes(): - x = SampleSheet("test/SampleSheet_10X_SI.csv") + x = SampleSheet("./test/SampleSheet_10X_SI.csv") print("Calling split sample sheet.") ss_list = x.split_sample_sheet() print("After split sample sheet.") @@ -72,4 +72,4 @@ def test_only_DLP_split(): x = SampleSheet("test/MICHELLE_420_ONLY_DLP.csv") ss_list = x.split_sample_sheet() assert(len(ss_list) == 1) - assert("Lane" in ss_list[0].df_ss_data.columns) \ No newline at end of file + assert("Lane" in ss_list[0].df_ss_data.columns) From 9951110e76640a8fade93f2a388761dba98a6ad9 Mon Sep 17 00:00:00 2001 From: luc <44953736+CuijieLu@users.noreply.github.com> Date: Thu, 29 Feb 2024 09:52:11 -0500 Subject: [PATCH 07/87] Update test_SampleSheet.py --- test_SampleSheet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_SampleSheet.py b/test_SampleSheet.py index f6a40b9..24b5484 100644 --- a/test_SampleSheet.py +++ b/test_SampleSheet.py @@ -2,7 +2,7 @@ import pytest def test_mixed_10X_barcodes(): - x = SampleSheet("./test/MICHELLE_0543_10X_MIXED.csv") + x = SampleSheet("igo-demux/test/MICHELLE_0543_10X_MIXED.csv") ss_list = x.split_sample_sheet() if "OverrideCycles" in ss_list[1].df_ss_header.astype(str): @@ -10,7 +10,7 @@ def test_mixed_10X_barcodes(): print(ss_list[2].df_ss_header) def test_only_10XSI_barcodes(): - x = SampleSheet("./test/SampleSheet_10X_SI.csv") + x = SampleSheet("test/SampleSheet_10X_SI.csv") print("Calling split sample sheet.") ss_list = x.split_sample_sheet() print("After split sample sheet.") From 1efc78676f8146c44fdd4f7411ff58bd749a441f Mon Sep 17 00:00:00 2001 From: luc <44953736+CuijieLu@users.noreply.github.com> Date: Thu, 29 Feb 2024 10:26:04 -0500 Subject: [PATCH 08/87] Update test_SampleSheet.py --- test_SampleSheet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_SampleSheet.py b/test_SampleSheet.py index 24b5484..2eed0cc 100644 --- a/test_SampleSheet.py +++ b/test_SampleSheet.py @@ -2,7 +2,7 @@ import pytest def test_mixed_10X_barcodes(): - x = SampleSheet("igo-demux/test/MICHELLE_0543_10X_MIXED.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/MICHELLE_0543_10X_MIXED.csv") ss_list = x.split_sample_sheet() if "OverrideCycles" in ss_list[1].df_ss_header.astype(str): From d8ea275524608d70ea46dcbd93e565360c5df1e5 Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 29 Feb 2024 10:36:58 -0500 Subject: [PATCH 09/87] Update test_SampleSheet.py --- test_SampleSheet.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test_SampleSheet.py b/test_SampleSheet.py index 2eed0cc..970d7de 100644 --- a/test_SampleSheet.py +++ b/test_SampleSheet.py @@ -10,7 +10,7 @@ def test_mixed_10X_barcodes(): print(ss_list[2].df_ss_header) def test_only_10XSI_barcodes(): - x = SampleSheet("test/SampleSheet_10X_SI.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_10X_SI.csv") print("Calling split sample sheet.") ss_list = x.split_sample_sheet() print("After split sample sheet.") @@ -18,30 +18,30 @@ def test_only_10XSI_barcodes(): assert(len(ss_list) == 1) def test_read_10X_sample_sheet(): - samplesheet = SampleSheet("test/SampleSheet_10X_SI.csv") + samplesheet = SampleSheet("./test/SampleSheet_10X_SI.csv") corrected = convert_SI_barcodes(samplesheet) print(corrected.df_ss_data.to_string()) assert(len(corrected.df_ss_data) == 16) def test_read_empty_sample_sheet(): - x = SampleSheet("test/empty_sample_sheet.csv") + x = SampleSheet("/test/empty_sample_sheet.csv") print("Success") def test_read_blank_sample_sheet(): with pytest.raises(Exception): - x = SampleSheet("test/blank_sample_sheet.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/blank_sample_sheet.csv") def test_read_SE_sample_sheet(): - x = SampleSheet("test/SampleSheet_PEPE.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_PEPE.csv") print("Success") def test_WGS_only_not_split(): - x = SampleSheet("test/DIANA_0434.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/DIANA_0434.csv") ss_list = x.split_sample_sheet() assert(len(ss_list) == 1) def test_barcode_read_lengths(): - x = SampleSheet("test/SampleSheet.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet.csv") assert (x.read_lengths[0] == 151) assert (x.read_lengths[1] == 151) From ffc9fa34cc059dbdc236a4398e02e763ebdf6c77 Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 29 Feb 2024 10:46:56 -0500 Subject: [PATCH 10/87] update test file path --- test_SampleSheet.py | 12 ++++++------ test_demux_run_dag.py | 4 ++-- test_scripts.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test_SampleSheet.py b/test_SampleSheet.py index 970d7de..539ef65 100644 --- a/test_SampleSheet.py +++ b/test_SampleSheet.py @@ -18,13 +18,13 @@ def test_only_10XSI_barcodes(): assert(len(ss_list) == 1) def test_read_10X_sample_sheet(): - samplesheet = SampleSheet("./test/SampleSheet_10X_SI.csv") + samplesheet = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_10X_SI.csv") corrected = convert_SI_barcodes(samplesheet) print(corrected.df_ss_data.to_string()) assert(len(corrected.df_ss_data) == 16) def test_read_empty_sample_sheet(): - x = SampleSheet("/test/empty_sample_sheet.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/empty_sample_sheet.csv") print("Success") def test_read_blank_sample_sheet(): @@ -46,15 +46,15 @@ def test_barcode_read_lengths(): assert (x.read_lengths[1] == 151) def test_recipe_set(): - x = SampleSheet("test/SampleSheet.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet.csv") assert ("DLP" in x.recipe_set) def test_barcode_list(): - x = SampleSheet("test/SampleSheet.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet.csv") assert ("AAGGACATAACCCCGT" in x.barcode_list) def test_split(): - x = SampleSheet("test/SampleSheet_DLP.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_DLP.csv") ss_list = x.split_sample_sheet() path0 = ss_list[0].path path1 = ss_list[1].path @@ -69,7 +69,7 @@ def test_split(): # Test when a sample sheet is only DLP lane information is removed and it is demuxed with "NoLaneSplitting" option in the sample sheet def test_only_DLP_split(): - x = SampleSheet("test/MICHELLE_420_ONLY_DLP.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/MICHELLE_420_ONLY_DLP.csv") ss_list = x.split_sample_sheet() assert(len(ss_list) == 1) assert("Lane" in ss_list[0].df_ss_data.columns) diff --git a/test_demux_run_dag.py b/test_demux_run_dag.py index 80323c9..8d88612 100644 --- a/test_demux_run_dag.py +++ b/test_demux_run_dag.py @@ -2,14 +2,14 @@ import demux_run_dag def test_WGS_only_not_split(): - x = SampleSheet("test/SampleSheet_220304_MICHELLE_0485_BHFN7NDSX3.csv") + x = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_220304_MICHELLE_0485_BHFN7NDSX3.csv") cmd_set = demux_run_dag.build_dragen_cmds(x, "MICHELLE_0485_BHFN7NDSX3") assert(len(cmd_set) == 7) def test_get_dlp_chip(): # Test that the DLP chip returned is correct even when the run has multiple DLP projects with different chip IDs - sample_sheet = SampleSheet("test/SampleSheet_220412_MICHELLE_0501_BHFNH5DSX3_DLP.csv") + sample_sheet = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_220412_MICHELLE_0501_BHFNH5DSX3_DLP.csv") for project in sample_sheet.project_set: chip_id = demux_run_dag.get_dlp_chip(sample_sheet, project) if project == "13098": diff --git a/test_scripts.py b/test_scripts.py index 2376554..e14975c 100644 --- a/test_scripts.py +++ b/test_scripts.py @@ -57,8 +57,8 @@ def testGettotalreads(): assert(total_reads_dict["PDX_WD0010_P1_1850_IGO_12754_E_2"] == 602357556) def testGettotalreadsDLP(): - sample_sheet = SampleSheet("test/SampleSheet_DLP_multiprojects.csv") - total_reads_dict = scripts.get_total_reads_from_demux.get_total_reads_DLP(sample_sheet, "test/Demultiplex_Stats_DLP.csv" ) + sample_sheet = SampleSheet("/home/runner/work/igo-demux/igo-demux/test/SampleSheet_DLP_multiprojects.csv") + total_reads_dict = scripts.get_total_reads_from_demux.get_total_reads_DLP(sample_sheet, "/home/runner/work/igo-demux/igo-demux/test/Demultiplex_Stats_DLP.csv" ) print(total_reads_dict) assert(total_reads_dict["Project_11113_L"]["samples"][1] == 3802998466) assert(total_reads_dict["Project_11113_L"]["pos_control"][1] == 654555718) From cfb6100aa2242ad58bcf1dda6adb8d5a8bebbc7e Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 29 Feb 2024 10:50:17 -0500 Subject: [PATCH 11/87] Update test_scripts.py --- test_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_scripts.py b/test_scripts.py index e14975c..e9174b0 100644 --- a/test_scripts.py +++ b/test_scripts.py @@ -51,7 +51,7 @@ def testCellranger_get_sequencer_runID(): def testGettotalreads(): sample_list = ["PDX_WD0010_P1_1845_IGO_12754_E_1", "PDX_WD0010_P1_1850_IGO_12754_E_2"] - total_reads_dict = scripts.get_total_reads_from_demux.get_total_reads(sample_list, "test/Demultiplex_Stats.csv") + total_reads_dict = scripts.get_total_reads_from_demux.get_total_reads(sample_list, "/home/runner/work/igo-demux/igo-demux/test/Demultiplex_Stats.csv") print(total_reads_dict) assert(total_reads_dict["PDX_WD0010_P1_1845_IGO_12754_E_1"] == 770373032) assert(total_reads_dict["PDX_WD0010_P1_1850_IGO_12754_E_2"] == 602357556) From 4ac47f49c291762d1aa466e324a5fd3bab9d8e90 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 7 Mar 2024 07:45:29 -0500 Subject: [PATCH 12/87] Update LaunchMetrics.py temporary update to test SAMPLES on DRAGEN 4.2 --- scripts/LaunchMetrics.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 59a6ee7..0445334 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -125,7 +125,8 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) @@ -133,7 +134,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -162,13 +163,14 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From a53ec4c0d6ab095c88742ae8b627966aa0797012 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 7 Mar 2024 09:35:58 -0500 Subject: [PATCH 13/87] Update LaunchMetrics.py Finished running stats for DRAGEN 4.2. changing back to original code --- scripts/LaunchMetrics.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 0445334..59a6ee7 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -125,8 +125,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" else: rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) @@ -134,7 +133,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -163,14 +162,13 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" else: dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From e239ec078b9ab5cf59757c8dcccd1a08ac3fcd80 Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 14 Mar 2024 10:39:38 -0400 Subject: [PATCH 14/87] add function for pooled ONT samples --- scripts/ont_stats.py | 64 ++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py index f23c825..291ae89 100644 --- a/scripts/ont_stats.py +++ b/scripts/ont_stats.py @@ -5,22 +5,46 @@ import os from collections import OrderedDict -# TODO check for multiple run -def get_read_length_and_summary(file_path): - summary_metrix = pd.read_csv(file_path, delimiter = "\t") - read_length = summary_metrix[summary_metrix["passes_filtering"]]["sequence_length_template"].tolist() - read_length.sort(reverse = True) - median = statistics.median(read_length) - N50_value = sum(read_length) / 2 - total = 0 - for item in read_length: - total += item - if total >= N50_value: - N50 = item - break - +# TODO get barcode info from lims +# check if the run is pooled +def if_pooled(sequencing_summary_df): + pooled = False + if "barcode_kit" in sequencing_summary_df.columns: + pooled = True + return pooled + +# get stats metric if the run is not pooled +def get_read_length_and_summary(sequencing_summary_df): + read_length = sequencing_summary_df[sequencing_summary_df["passes_filtering"]]["sequence_length_template"].tolist() + if len(read_length) != 0: + read_length.sort(reverse = True) + median = statistics.median(read_length) + N50_value = sum(read_length) / 2 + total = 0 + for item in read_length: + total += item + if total >= N50_value: + N50 = item + break + else: + median = 0 + N50_value = 0 + N50 = 0 return(len(read_length), N50_value * 2 / 1000000000, N50, median) +# get stats metric if the run is pooled +def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name): + sample_dict = {} + samples = sequencing_summary_df["barcode_arrangement"].unique() + for sample in samples: + sample_df = sequencing_summary_df.loc[sequencing_summary_df['barcode_arrangement'] == sample] + sample_sub = sample_name + "_" + sample + stats = get_read_length_and_summary(sample_df) + # only record barcodes with more than 10000 reads + if stats[0] > 10000: + sample_dict[sample_sub] = get_read_length_and_summary(sample_df) + return sample_dict + def write_to_csv(sample_dict): file_name = "summary.csv" with open(file_name,'w') as file: @@ -35,12 +59,18 @@ def write_to_csv(sample_dict): project_directory = sys.argv[1] os.chdir(project_directory) sample_list = next(os.walk("."))[1] - sample_dict = OrderedDict() + sample_dict = {} sample_list.sort() for sample in sample_list: destination = project_directory + "/" + sample file = glob.glob(destination + "/*/sequencing_summary_*") if len(file) != 0: - sample_dict[sample] = get_read_length_and_summary(file[0]) - + summary_metrix = pd.read_csv(file[0], delimiter = "\t") + pooled = if_pooled(summary_metrix) + if pooled: + sample_dict_sub = get_read_length_and_summary_pooled(summary_metrix, sample) + sample_dict.update(sample_dict_sub) + else: + sample_dict[sample] = get_read_length_and_summary(summary_metrix) + write_to_csv(sample_dict) From 30b01ec8f78cc84fb946dae14885f10911dda1ba Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 18 Mar 2024 14:11:30 -0400 Subject: [PATCH 15/87] Update run_param_config.py adding "SMARTSeq" recipe to run_param_config script to run RNA stats on samples with the aforementioned recipe --- scripts/run_param_config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 8351ca9..f74814a 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -74,6 +74,8 @@ def get_ordered_dic(unordered_dic): ".*SMARTer.*": { TYPE: "RNA" }, "FusionDiscoverySeq": { TYPE: "RNA" }, ".*Ribo.*": { TYPE: "RNA" }, + "SMART-Seq": { TYPE: "RNA" }, + "SMARTSeq": { TYPE: "RNA" }, ".*CDH1_RNA.*": { TYPE: "CAPTURE" }, # FOR NEW ENTRIES # "{regex}": { TYPE: type } From 649b17dac66602191f29b444a7a3b6acd57a710a Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Tue, 19 Mar 2024 12:29:17 -0400 Subject: [PATCH 16/87] Update LaunchMetrics.py taking id02 out of production. License expired --- scripts/LaunchMetrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 59a6ee7..d14b790 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -133,7 +133,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -168,7 +168,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) @@ -211,7 +211,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 0b50c75db5a313d2f661ba3316eb8a29a3fbeb5f Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 21 Mar 2024 09:28:13 -0400 Subject: [PATCH 17/87] Update LaunchMetrics.py temporary change to generate bams using dragen 4.2 --- scripts/LaunchMetrics.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index d14b790..9390184 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -125,15 +125,16 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) + rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run) launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -162,13 +163,14 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 73c96588c8dc72260b94e6425fcfb9d2b799cc6d Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 21 Mar 2024 09:34:11 -0400 Subject: [PATCH 18/87] Update LaunchMetrics.py add special directory for DRAGEN 4.2 testing --- scripts/LaunchMetrics.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 9390184..969afc1 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -33,6 +33,9 @@ def __init__(self): def launch_metrics(self, all_samples, run, project_directory): # + # special run + run = "FAUCI_0121_B222WMMLT4_special" + # create output directories parent_directory = "/igo/staging/stats" work_directory = "{}/{}/".format(parent_directory, run) From c06f662bb81be393628f97809aa2e6d3eb2e4762 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 21 Mar 2024 09:47:17 -0400 Subject: [PATCH 19/87] Update LaunchMetrics.py taking out the special run --- scripts/LaunchMetrics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 969afc1..9390184 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -33,9 +33,6 @@ def __init__(self): def launch_metrics(self, all_samples, run, project_directory): # - # special run - run = "FAUCI_0121_B222WMMLT4_special" - # create output directories parent_directory = "/igo/staging/stats" work_directory = "{}/{}/".format(parent_directory, run) From f0ca0628d0326db557afc95880ee5351dca7a733 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 22 Mar 2024 17:55:53 -0400 Subject: [PATCH 20/87] Update LaunchMetrics.py putting id03 back into production --- scripts/LaunchMetrics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 9390184..6b72816 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -125,16 +125,16 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) + rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run) launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -163,14 +163,14 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + # dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) + dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 4908efed8b68a5bb44211f58e877adf3acb194c8 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 07:44:01 -0400 Subject: [PATCH 21/87] Update LaunchMetrics.py taking ID03 DRAGEN server out of production. License quota exceeded --- scripts/LaunchMetrics.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 6b72816..9390184 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -125,16 +125,16 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - # rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - rna_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) + rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run) launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -163,14 +163,14 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" - # dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" + # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: - dragen_path = "/igo/work/igo/dragen_hash_tables/{}".format(sample_parameters["GTAG"]) + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From f1b3e25d2f1057b53ef2a379b8ef451ae8e76c15 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 16:28:42 -0400 Subject: [PATCH 22/87] Update LaunchMetrics.py adding demux only routine to the script --- scripts/LaunchMetrics.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 9390184..cf7323b 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -10,12 +10,16 @@ import shutil import pathlib import scripts.generate_run_params +import scripts.get_total_reads_from_demux # Global Variable : we do not want to process these experiments in this script -DO_NOT_PROCESS = ["10X_Genomics", "DLP"] +DO_NOT_PROCESS = ["DLP"] # These recipes will be evaluated using DRAGEN because of their larger size of fastqs RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"] +# these projects willl only need demux stats +DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] + # Organisms to have DRAGEN BAMS DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"] # this list contains the headers of the columns. we will access the data using these listings @@ -38,6 +42,7 @@ def launch_metrics(self, all_samples, run, project_directory): work_directory = "{}/{}/".format(parent_directory, run) rna_directory = "{}RNA/".format(work_directory) dragen_directory = "{}DRAGEN/".format(work_directory) + stats_done_directory = "/igo/stats/DONE/{}".format(run.split("_")[0]) # create work directory pathlib.Path(work_directory).mkdir(parents = True, exist_ok = True) @@ -59,6 +64,12 @@ def launch_metrics(self, all_samples, run, project_directory): # test to see if there are some samples that this script will not process if any(s in sample.recipe for s in DO_NOT_PROCESS): continue + + if any(s in sample.recipe for s in DEMUX_ONLY): + demux_report_file = "/igo/staging/FASTQ/{}/Reports/Demultiplex_Stats.csv".format(run) + demux_reads_per_sample = scripts.get_total_reads_from_demux.get_total_reads([sample.sample_id], demux_report_file) + scripts.get_total_reads_from_demux.write_to_am_txt(run, sample.sample_id, demux_reads_per_sample[sample.sample_id], stats_done_directory) + # grab the sample parameters (bait set, type, gtag, etc) sample_parameters = self.get_parameters(sample.genome, sample.recipe) # process the RNA data seperately From 7d26837ad8fd1022c80f6ebc6962bc98b6ca1d51 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 25 Mar 2024 16:30:05 -0400 Subject: [PATCH 23/87] change 10x run to be checked by run length --- demux_run_dag.py | 3 ++- scripts/get_sequencing_read_data.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index f364ab0..ba9ee55 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -170,7 +170,8 @@ def stats(ds, **kwargs): return "DLP stats posted and yaml file generated" - if any("10X_" in s for s in sample_sheet.recipe_set): + atac, use_bases_mask = scripts.get_sequencing_read_data.main(sequencer_path) + if use_bases_mask == [29, 89]: # if is atac run, demux is using cellranger mkfastq if scripts.get_sequencing_read_data.main(sequencer_path)[0]: scripts.get_total_reads_from_demux.by_json(sequencer_and_run) diff --git a/scripts/get_sequencing_read_data.py b/scripts/get_sequencing_read_data.py index 8021ba3..bdcab04 100755 --- a/scripts/get_sequencing_read_data.py +++ b/scripts/get_sequencing_read_data.py @@ -36,7 +36,7 @@ def get_sequencing_read_data(sequencer_path): use_bases_mask = "Y" + str(reads_tag[0][1]) + ",I" + str(reads_tag[1][1]) + ",Y" + str(reads_tag[2][1]) + ",Y" + str(reads_tag[3][1]) else: atac = False - use_bases_mask = "" + use_bases_mask = [reads_tag[0][1], reads_tag[3][1]] return(atac, use_bases_mask) From b0a60f1fb0b647243fa0f41436838664fdb84dc2 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 16:37:44 -0400 Subject: [PATCH 24/87] Update LaunchMetrics.py need the continue statement to get out of the loop --- scripts/LaunchMetrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index cf7323b..1cec80d 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -69,6 +69,7 @@ def launch_metrics(self, all_samples, run, project_directory): demux_report_file = "/igo/staging/FASTQ/{}/Reports/Demultiplex_Stats.csv".format(run) demux_reads_per_sample = scripts.get_total_reads_from_demux.get_total_reads([sample.sample_id], demux_report_file) scripts.get_total_reads_from_demux.write_to_am_txt(run, sample.sample_id, demux_reads_per_sample[sample.sample_id], stats_done_directory) + continue # grab the sample parameters (bait set, type, gtag, etc) sample_parameters = self.get_parameters(sample.genome, sample.recipe) From 90607e8467ad3b172658b1ea9ced3f59b7421d6d Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 16:40:23 -0400 Subject: [PATCH 25/87] Update LaunchMetrics.py --- scripts/LaunchMetrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 1cec80d..f48fc26 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -68,6 +68,7 @@ def launch_metrics(self, all_samples, run, project_directory): if any(s in sample.recipe for s in DEMUX_ONLY): demux_report_file = "/igo/staging/FASTQ/{}/Reports/Demultiplex_Stats.csv".format(run) demux_reads_per_sample = scripts.get_total_reads_from_demux.get_total_reads([sample.sample_id], demux_report_file) + print(demux_reads_per_sample) scripts.get_total_reads_from_demux.write_to_am_txt(run, sample.sample_id, demux_reads_per_sample[sample.sample_id], stats_done_directory) continue From d6488f151f040707d6e7b3f23b2ee22c6bae2c36 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 16:44:12 -0400 Subject: [PATCH 26/87] Update LaunchMetrics.py --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index f48fc26..6abe131 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -43,7 +43,7 @@ def launch_metrics(self, all_samples, run, project_directory): rna_directory = "{}RNA/".format(work_directory) dragen_directory = "{}DRAGEN/".format(work_directory) stats_done_directory = "/igo/stats/DONE/{}".format(run.split("_")[0]) - + print(stats_done_directory) # create work directory pathlib.Path(work_directory).mkdir(parents = True, exist_ok = True) From 6ec1dd7caf0a98dffe123724395c3407d36a1aa0 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 25 Mar 2024 16:48:14 -0400 Subject: [PATCH 27/87] Update LaunchMetrics.py --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 6abe131..db0fd1c 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -42,7 +42,7 @@ def launch_metrics(self, all_samples, run, project_directory): work_directory = "{}/{}/".format(parent_directory, run) rna_directory = "{}RNA/".format(work_directory) dragen_directory = "{}DRAGEN/".format(work_directory) - stats_done_directory = "/igo/stats/DONE/{}".format(run.split("_")[0]) + stats_done_directory = "/igo/stats/DONE/{}/".format(run.split("_")[0]) print(stats_done_directory) # create work directory pathlib.Path(work_directory).mkdir(parents = True, exist_ok = True) From 26484bf61f702d9237d80c01d1780b3ab753f527 Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 27 Mar 2024 14:34:25 -0400 Subject: [PATCH 28/87] add atac to 10X situation --- demux_run_dag.py | 6 +- test/Top_Unknown_Barcodes.csv | 1001 +++++++++++++++++++++++++++++++++ 2 files changed, 1004 insertions(+), 3 deletions(-) create mode 100644 test/Top_Unknown_Barcodes.csv diff --git a/demux_run_dag.py b/demux_run_dag.py index ba9ee55..6c85c45 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -3,7 +3,6 @@ import subprocess from datetime import datetime, timedelta -from numpy import equal import pandas from SampleSheet import SampleSheet import scripts.organise_fastq_split_by_lane @@ -170,10 +169,11 @@ def stats(ds, **kwargs): return "DLP stats posted and yaml file generated" + # check if the run is 10X by read length atac, use_bases_mask = scripts.get_sequencing_read_data.main(sequencer_path) - if use_bases_mask == [29, 89]: + if use_bases_mask == [29, 89] or atac: # if is atac run, demux is using cellranger mkfastq - if scripts.get_sequencing_read_data.main(sequencer_path)[0]: + if atac: scripts.get_total_reads_from_demux.by_json(sequencer_and_run) scripts.upload_stats.upload_stats(sequencer_and_run) diff --git a/test/Top_Unknown_Barcodes.csv b/test/Top_Unknown_Barcodes.csv new file mode 100644 index 0000000..76acc74 --- /dev/null +++ b/test/Top_Unknown_Barcodes.csv @@ -0,0 +1,1001 @@ +Lane,index,index2,# Reads,% of Unknown Barcodes,% of All Reads +1,AAAAAAAA,,2219166,0.016658,0.016654 +1,CCCCCCCC,,1271809,0.009547,0.009545 +1,TTTTTTTT,,813459,0.006106,0.006105 +1,TCCCCCCC,,306154,0.002298,0.002298 +1,CCCCCCCT,,283122,0.002125,0.002125 +1,CCCCCCTC,,259274,0.001946,0.001946 +1,CCCCCTCC,,253669,0.001904,0.001904 +1,CCCCTCCC,,252278,0.001894,0.001893 +1,CCCTCCCC,,251165,0.001885,0.001885 +1,CTCCCCCC,,246202,0.001848,0.001848 +1,CCTCCCCC,,245390,0.001842,0.001842 +1,CTTTTTTT,,215947,0.001621,0.001621 +1,TTTTTTTC,,208944,0.001568,0.001568 +1,TCTTTTTT,,203072,0.001524,0.001524 +1,TTTTTTCT,,192106,0.001442,0.001442 +1,TTTTTCTT,,191081,0.001434,0.001434 +1,GCCCCCCC,,188080,0.001412,0.001412 +1,TTCTTTTT,,187573,0.001408,0.001408 +1,TTTTCTTT,,180659,0.001356,0.001356 +1,TTTCTTTT,,180120,0.001352,0.001352 +1,CCCCCCCA,,170377,0.001279,0.001279 +1,CCCCCCCG,,155423,0.001167,0.001166 +1,TTTTTTTA,,151436,0.001137,0.001136 +1,CCCCCCAC,,150223,0.001128,0.001127 +1,CCCCCACC,,150057,0.001126,0.001126 +1,CCCCCCGC,,148185,0.001112,0.001112 +1,CCCACCCC,,147185,0.001105,0.001105 +1,CCCCACCC,,144821,0.001087,0.001087 +1,CGCCCCCC,,140909,0.001058,0.001057 +1,CCGCCCCC,,140841,0.001057,0.001057 +1,CCCCGCCC,,140507,0.001055,0.001054 +1,CCACCCCC,,138745,0.001041,0.001041 +1,AAAAAAAT,,135372,0.001016,0.001016 +1,CACCCCCC,,134748,0.001011,0.001011 +1,CCCGCCCC,,134499,0.001010,0.001009 +1,CCCCCGCC,,134076,0.001006,0.001006 +1,AAAAAATA,,132096,0.000992,0.000991 +1,TTTTTTAT,,131869,0.000990,0.000990 +1,TTTTTATT,,130836,0.000982,0.000982 +1,AAAAATAA,,125043,0.000939,0.000938 +1,TTTATTTT,,124636,0.000936,0.000935 +1,AAAATAAA,,124514,0.000935,0.000934 +1,TATTTTTT,,124251,0.000933,0.000932 +1,AATAAAAA,,124238,0.000933,0.000932 +1,TTTTATTT,,124113,0.000932,0.000931 +1,AAAAAAAC,,122686,0.000921,0.000921 +1,AAAAAACA,,122236,0.000918,0.000917 +1,AAATAAAA,,119977,0.000901,0.000900 +1,TTATTTTT,,119866,0.000900,0.000900 +1,ATAAAAAA,,119001,0.000893,0.000893 +1,ACAAAAAA,,115857,0.000870,0.000869 +1,AAAAACAA,,115104,0.000864,0.000864 +1,ACCCCCCC,,115060,0.000864,0.000864 +1,AACAAAAA,,114874,0.000862,0.000862 +1,CCCCCCTT,,110146,0.000827,0.000827 +1,AAAACAAA,,110106,0.000827,0.000826 +1,TTCCCCCC,,109518,0.000822,0.000822 +1,AAACAAAA,,107415,0.000806,0.000806 +1,TCTCCCCC,,102118,0.000767,0.000766 +1,TCCCCCCT,,100358,0.000753,0.000753 +1,TCCTCCCC,,98745,0.000741,0.000741 +1,CCCCCTCT,,97053,0.000729,0.000728 +1,ATTTTTTT,,96842,0.000727,0.000727 +1,GTTTTTTT,,96581,0.000725,0.000725 +1,CCCCCTTC,,96442,0.000724,0.000724 +1,CCTTTTTT,,96200,0.000722,0.000722 +1,TCCCTCCC,,95765,0.000719,0.000719 +1,TCCCCTCC,,94553,0.000710,0.000710 +1,TCCCCCTC,,94264,0.000708,0.000707 +1,TAAAAAAA,,93519,0.000702,0.000702 +1,CCCCTCCT,,92799,0.000697,0.000696 +1,CCCTTCCC,,91881,0.000690,0.000690 +1,CCCCTTCC,,91417,0.000686,0.000686 +1,TTTTTTCC,,90280,0.000678,0.000678 +1,CCCTCCCT,,90236,0.000677,0.000677 +1,CCTTCCCC,,89372,0.000671,0.000671 +1,CTTCCCCC,,89369,0.000671,0.000671 +1,CCCCTCTC,,88027,0.000661,0.000661 +1,CCTCCCCT,,85741,0.000644,0.000643 +1,CCTCTCCC,,85012,0.000638,0.000638 +1,CCCTCTCC,,84245,0.000632,0.000632 +1,CCCTCCTC,,84241,0.000632,0.000632 +1,CTCCCCCT,,84228,0.000632,0.000632 +1,TCCTTTTT,,84185,0.000632,0.000632 +1,CTCTCCCC,,84087,0.000631,0.000631 +1,CTCTTTTT,,82104,0.000616,0.000616 +1,TTTTTCTC,,81509,0.000612,0.000612 +1,TTTTTTTG,,81420,0.000611,0.000611 +1,CCTCCCTC,,81349,0.000611,0.000611 +1,CTCCTCCC,,80850,0.000607,0.000607 +1,CCTCCTCC,,80652,0.000605,0.000605 +1,TTTTTCCT,,80447,0.000604,0.000604 +1,CAAAAAAA,,80349,0.000603,0.000603 +1,CTTTTTTC,,79913,0.000600,0.000600 +1,CTCCCCTC,,79258,0.000595,0.000595 +1,CTCCCTCC,,78896,0.000592,0.000592 +1,TTGTTTTT,,78203,0.000587,0.000587 +1,TTTTTTGT,,78051,0.000586,0.000586 +1,TCTTTTTC,,77259,0.000580,0.000580 +1,CTTTTCTT,,76328,0.000573,0.000573 +1,TGTTTTTT,,75814,0.000569,0.000569 +1,CTTCTTTT,,75757,0.000569,0.000569 +1,CTTTTTCT,,74858,0.000562,0.000562 +1,TCTCTTTT,,74712,0.000561,0.000561 +1,TTTTCCTT,,74679,0.000561,0.000560 +1,TTTTCTTC,,74531,0.000559,0.000559 +1,TCTTTCTT,,74188,0.000557,0.000557 +1,CTTTCTTT,,73672,0.000553,0.000553 +1,TTCCTTTT,,73549,0.000552,0.000552 +1,TTTTGTTT,,73526,0.000552,0.000552 +1,TCTTTTCT,,73427,0.000551,0.000551 +1,TTTTTGTT,,73199,0.000549,0.000549 +1,TTCTTTTC,,72829,0.000547,0.000547 +1,TTTCTTTC,,72691,0.000546,0.000546 +1,TCTTCTTT,,72676,0.000546,0.000545 +1,TTTGTTTT,,71798,0.000539,0.000539 +1,TTTTCTCT,,71409,0.000536,0.000536 +1,TTTCCTTT,,70521,0.000529,0.000529 +1,TTCTTCTT,,69864,0.000524,0.000524 +1,TTCTTTCT,,69383,0.000521,0.000521 +1,TTCTCTTT,,69225,0.000520,0.000520 +1,TTTCTCTT,,68854,0.000517,0.000517 +1,TTTCTTCT,,68349,0.000513,0.000513 +1,CCCCCTTT,,63694,0.000478,0.000478 +1,TTTCCCCC,,62272,0.000467,0.000467 +1,CCCTTTTT,,60686,0.000456,0.000455 +1,TTTTTCCC,,58423,0.000439,0.000438 +1,CCCCTCTT,,56182,0.000422,0.000422 +1,TTCTCCCC,,55426,0.000416,0.000416 +1,TCTTCCCC,,55346,0.000415,0.000415 +1,TCCCCCTT,,55149,0.000414,0.000414 +1,CCCTCCTT,,53582,0.000402,0.000402 +1,CCCCTTTC,,53488,0.000402,0.000401 +1,CCCCTTCT,,53125,0.000399,0.000399 +1,CCCCTTTT,,52374,0.000393,0.000393 +1,CCTCTTTT,,52201,0.000392,0.000392 +1,TCCTTCCC,,51894,0.000390,0.000389 +1,TTCCTCCC,,51753,0.000388,0.000388 +1,TTCCCCCT,,51433,0.000386,0.000386 +1,CCTTTCCC,,51126,0.000384,0.000384 +1,TCCCCCCA,,51033,0.000383,0.000383 +1,TTTTCCCC,,50816,0.000381,0.000381 +1,CTTTCCCC,,50683,0.000380,0.000380 +1,TCCCTTTT,,50631,0.000380,0.000380 +1,CCCTTTCC,,50491,0.000379,0.000379 +1,CCTCCCTT,,50099,0.000376,0.000376 +1,CCCTTCCT,,49979,0.000375,0.000375 +1,TTTTTTAA,,49958,0.000375,0.000375 +1,TCCCCTTC,,49952,0.000375,0.000375 +1,TCTCTCCC,,49897,0.000375,0.000374 +1,TTCCCTCC,,49852,0.000374,0.000374 +1,TTTTCTCC,,49800,0.000374,0.000374 +1,CCTTTTTC,,49747,0.000373,0.000373 +1,TCCCCTCT,,49700,0.000373,0.000373 +1,TTCCCCTC,,49389,0.000371,0.000371 +1,TCCTCCCT,,49326,0.000370,0.000370 +1,TCCCTTCC,,49189,0.000369,0.000369 +1,TCTCCCCT,,48986,0.000368,0.000368 +1,CCCCCCTA,,48787,0.000366,0.000366 +1,TCCCTCCT,,48691,0.000365,0.000365 +1,CCTTCTTT,,48590,0.000365,0.000365 +1,CCCTTCTC,,48439,0.000364,0.000364 +1,CCTTTCTT,,48400,0.000363,0.000363 +1,CCCTCTCT,,48259,0.000362,0.000362 +1,TTTTCCCT,,48235,0.000362,0.000362 +1,CCCTCTTC,,48214,0.000362,0.000362 +1,GTCCCCCC,,48155,0.000361,0.000361 +1,GCCCCCCT,,47953,0.000360,0.000360 +1,CTCCCCTT,,47864,0.000359,0.000359 +1,TCTTTTCC,,47814,0.000359,0.000359 +1,TTTTCCTC,,47784,0.000359,0.000359 +1,TCTCCTCC,,47580,0.000357,0.000357 +1,CTCCTTTT,,47565,0.000357,0.000357 +1,CTTTTTCC,,47487,0.000356,0.000356 +1,TCCTCTCC,,47470,0.000356,0.000356 +1,CCTTTTCT,,47379,0.000356,0.000356 +1,CCCCCCAA,,47348,0.000355,0.000355 +1,TCTCCCTC,,47001,0.000353,0.000353 +1,TTTCTTCC,,46980,0.000353,0.000353 +1,TCCCTCTC,,46805,0.000351,0.000351 +1,TCCTCCTC,,46594,0.000350,0.000350 +1,CCTTCCCT,,46555,0.000349,0.000349 +1,GCTCCCCC,,46541,0.000349,0.000349 +1,CCCTCTTT,,46327,0.000348,0.000348 +1,TCCCCACC,,46174,0.000347,0.000347 +1,GCCTCCCC,,46024,0.000345,0.000345 +1,CTTCTCCC,,46001,0.000345,0.000345 +1,CTCTTCCC,,45953,0.000345,0.000345 +1,CCTCTTCC,,45924,0.000345,0.000345 +1,TTCTTTCC,,45923,0.000345,0.000345 +1,TCCTCTTT,,45837,0.000344,0.000344 +1,GCCCTCCC,,45774,0.000344,0.000344 +1,CCCCCTCA,,45712,0.000343,0.000343 +1,CCCCCCAT,,45664,0.000343,0.000343 +1,TCCCCCAC,,45549,0.000342,0.000342 +1,TGCCCCCC,,45492,0.000341,0.000341 +1,TCCTTTTC,,45411,0.000341,0.000341 +1,TCCACCCC,,45270,0.000340,0.000340 +1,CCTTCTCC,,45194,0.000339,0.000339 +1,TTTTTATA,,45089,0.000338,0.000338 +1,CTTTTTTA,,45020,0.000338,0.000338 +1,CCTCCTCT,,45013,0.000338,0.000338 +1,TCCTTCTT,,45006,0.000338,0.000338 +1,CCTCCTTC,,44932,0.000337,0.000337 +1,TTTCTCCC,,44910,0.000337,0.000337 +1,CCCCCCGT,,44839,0.000337,0.000337 +1,CCTTCCTC,,44818,0.000336,0.000336 +1,TTTTTTCA,,44768,0.000336,0.000336 +1,CCTCTCCT,,44718,0.000336,0.000336 +1,TTCCCTTT,,44543,0.000334,0.000334 +1,GCCCCTCC,,44477,0.000334,0.000334 +1,GGCCCCCC,,44452,0.000334,0.000334 +1,CTTCCCCT,,44383,0.000333,0.000333 +1,TCTTTCCT,,44329,0.000333,0.000333 +1,TCCCCCCG,,44294,0.000332,0.000332 +1,CCCTTCTT,,44251,0.000332,0.000332 +1,CTTTTCTC,,44152,0.000331,0.000331 +1,TCCCACCC,,44148,0.000331,0.000331 +1,GCCCCCTC,,44124,0.000331,0.000331 +1,TCCCCTTT,,44062,0.000331,0.000331 +1,TTTCCCTT,,44037,0.000331,0.000330 +1,CCCCTCCA,,44008,0.000330,0.000330 +1,TCTTTCTC,,43964,0.000330,0.000330 +1,TCTTTCCC,,43924,0.000330,0.000330 +1,CTTTTCCT,,43813,0.000329,0.000329 +1,TCTCCTTT,,43749,0.000328,0.000328 +1,TCCTTTCT,,43747,0.000328,0.000328 +1,CCTCTCTC,,43645,0.000328,0.000328 +1,CTTCCTCC,,43511,0.000327,0.000327 +1,CCCTCCCA,,43493,0.000326,0.000326 +1,CTCCCTTC,,43410,0.000326,0.000326 +1,TCTTCCTT,,43331,0.000325,0.000325 +1,CCCCCACT,,43324,0.000325,0.000325 +1,TCGCCCCC,,43267,0.000325,0.000325 +1,CCCCCCTG,,43193,0.000324,0.000324 +1,CCCTTTTC,,43190,0.000324,0.000324 +1,CTCTTTTC,,43185,0.000324,0.000324 +1,TACCCCCC,,43161,0.000324,0.000324 +1,CTCCCTCT,,43126,0.000324,0.000324 +1,CTCTCCCT,,43094,0.000323,0.000323 +1,CTCTCTTT,,43069,0.000323,0.000323 +1,CCCCCACA,,43068,0.000323,0.000323 +1,TTTCCTTC,,42975,0.000323,0.000323 +1,CTCTTCTT,,42906,0.000322,0.000322 +1,TTTTTAAT,,42901,0.000322,0.000322 +1,TCACCCCC,,42896,0.000322,0.000322 +1,TTTCTCTC,,42845,0.000322,0.000322 +1,CTTCCCTC,,42823,0.000321,0.000321 +1,CTCCTTCC,,42790,0.000321,0.000321 +1,CTTCCTTT,,42699,0.000321,0.000320 +1,CTCCCCCA,,42658,0.000320,0.000320 +1,CCTCCCCA,,42576,0.000320,0.000320 +1,TCTCTTTC,,42546,0.000319,0.000319 +1,CCCTTTCT,,42515,0.000319,0.000319 +1,TTCTTCCC,,42511,0.000319,0.000319 +1,TCTTTTTA,,42485,0.000319,0.000319 +1,CTCCTCCT,,42401,0.000318,0.000318 +1,TTTTTTAC,,42393,0.000318,0.000318 +1,TTCTTCCT,,42291,0.000317,0.000317 +1,TTTCTCCT,,42270,0.000317,0.000317 +1,CCTCCTTT,,42229,0.000317,0.000317 +1,CTTTCCTT,,42216,0.000317,0.000317 +1,CTTTTCCC,,42145,0.000316,0.000316 +1,TTCTTCTC,,42113,0.000316,0.000316 +1,TTCTCCTT,,42049,0.000316,0.000316 +1,TCCCCCGC,,41991,0.000315,0.000315 +1,CCCACCCT,,41984,0.000315,0.000315 +1,CCCCACCT,,41863,0.000314,0.000314 +1,TTTTTCTA,,41826,0.000314,0.000314 +1,TTCCTTTC,,41801,0.000314,0.000314 +1,TTCCTCTT,,41644,0.000313,0.000313 +1,TCTTCTTC,,41466,0.000311,0.000311 +1,CTCTCTCC,,41433,0.000311,0.000311 +1,TCTCTCTT,,41349,0.000310,0.000310 +1,CTTCTTTC,,41335,0.000310,0.000310 +1,TTTCCTCC,,41334,0.000310,0.000310 +1,CTCCTCTC,,41138,0.000309,0.000309 +1,CCCCCATC,,41136,0.000309,0.000309 +1,TTTTATTA,,41110,0.000309,0.000309 +1,CTCTTTCT,,41105,0.000309,0.000308 +1,TTTCCTCT,,41101,0.000309,0.000308 +1,TTTCCCCT,,41073,0.000308,0.000308 +1,CCTTTTCC,,41069,0.000308,0.000308 +1,CCCCCTAC,,41030,0.000308,0.000308 +1,CTCTCCTC,,41023,0.000308,0.000308 +1,CCCCCAAC,,40998,0.000308,0.000308 +1,CTTTCTTC,,40979,0.000308,0.000308 +1,TCCCGCCC,,40647,0.000305,0.000305 +1,CTTCTCTT,,40644,0.000305,0.000305 +1,TTCTCTTC,,40565,0.000305,0.000304 +1,TCTTCTCT,,40551,0.000304,0.000304 +1,TTTATTTA,,40487,0.000304,0.000304 +1,TCTCTTCT,,40199,0.000302,0.000302 +1,TCCGCCCC,,40093,0.000301,0.000301 +1,CCCCACCA,,40081,0.000301,0.000301 +1,TTTTAATT,,39992,0.000300,0.000300 +1,TTCCTTCT,,39990,0.000300,0.000300 +1,CCCCTACC,,39968,0.000300,0.000300 +1,CCCCCTCG,,39899,0.000300,0.000299 +1,CCCACCCA,,39796,0.000299,0.000299 +1,CATTTTTT,,39784,0.000299,0.000299 +1,CCCCTCAC,,39763,0.000298,0.000298 +1,TTTCCCTC,,39751,0.000298,0.000298 +1,CCCCCTGC,,39680,0.000298,0.000298 +1,TCCCTCTT,,39640,0.000298,0.000297 +1,CTTTCTCT,,39614,0.000297,0.000297 +1,TTTTTATC,,39596,0.000297,0.000297 +1,CTTTTTAT,,39595,0.000297,0.000297 +1,CTCCCTTT,,39582,0.000297,0.000297 +1,TTTTCTTA,,39581,0.000297,0.000297 +1,GCGCCCCC,,39546,0.000297,0.000297 +1,TCCTTTCC,,39498,0.000296,0.000296 +1,TTTCTTTA,,39432,0.000296,0.000296 +1,TTCTTTTA,,39344,0.000295,0.000295 +1,TTCCCCTT,,39337,0.000295,0.000295 +1,TCCCTTTC,,39326,0.000295,0.000295 +1,CCTTTCCT,,39307,0.000295,0.000295 +1,CTTCTTCT,,39278,0.000295,0.000295 +1,TTCTCTCT,,39255,0.000295,0.000295 +1,CCTTCCTT,,39219,0.000294,0.000294 +1,CTTTTATT,,39204,0.000294,0.000294 +1,CCTCTCTT,,39114,0.000294,0.000294 +1,CCCCTCCG,,39103,0.000294,0.000293 +1,AAAAAATT,,39010,0.000293,0.000293 +1,TCCTCCTT,,38987,0.000293,0.000293 +1,CCCCAACC,,38981,0.000293,0.000293 +1,CCCATCCC,,38881,0.000292,0.000292 +1,CCACCCCT,,38851,0.000292,0.000292 +1,TCCCCGCC,,38821,0.000291,0.000291 +1,CCCCCGCT,,38776,0.000291,0.000291 +1,CCCCACTC,,38775,0.000291,0.000291 +1,TCTTCCCT,,38774,0.000291,0.000291 +1,TCTTCTCC,,38715,0.000291,0.000291 +1,CCCTCACC,,38671,0.000290,0.000290 +1,CCCACCTC,,38669,0.000290,0.000290 +1,CCCCGCCT,,38566,0.000289,0.000289 +1,CCCTCCAC,,38545,0.000289,0.000289 +1,CCCTACCC,,38509,0.000289,0.000289 +1,CCCCATCC,,38466,0.000289,0.000289 +1,TCTCCCTT,,38292,0.000287,0.000287 +1,CTTATTTT,,38288,0.000287,0.000287 +1,CCTACCCC,,38279,0.000287,0.000287 +1,CCTCTTTC,,38262,0.000287,0.000287 +1,CCTTTCTC,,38259,0.000287,0.000287 +1,TAATTTTT,,38235,0.000287,0.000287 +1,TTTTTCAT,,38210,0.000287,0.000287 +1,TTCCTTCC,,38190,0.000287,0.000287 +1,CCCAACCC,,38146,0.000286,0.000286 +1,CCTCCCAC,,38084,0.000286,0.000286 +1,TCCCTTCT,,38082,0.000286,0.000286 +1,CCTCCACC,,38077,0.000286,0.000286 +1,TTCTCCCT,,38072,0.000286,0.000286 +1,TTCTCTCC,,38061,0.000286,0.000286 +1,TATTTTTA,,38015,0.000285,0.000285 +1,CTCCCACC,,38015,0.000285,0.000285 +1,CCGCCCCT,,38008,0.000285,0.000285 +1,CTTTATTT,,37908,0.000285,0.000284 +1,TTATTTTA,,37893,0.000284,0.000284 +1,CCCTCCCG,,37893,0.000284,0.000284 +1,TTTAATTT,,37854,0.000284,0.000284 +1,CTCCCCAC,,37824,0.000284,0.000284 +1,TTTTTACT,,37816,0.000284,0.000284 +1,CCCCCCGG,,37796,0.000284,0.000284 +1,TCTTTTAT,,37689,0.000283,0.000283 +1,GGGGGGGG,,37684,0.000283,0.000283 +1,TCCTTCCT,,37630,0.000282,0.000282 +1,CCCACTCC,,37625,0.000282,0.000282 +1,CTATTTTT,,37622,0.000282,0.000282 +1,TCTCTTCC,,37592,0.000282,0.000282 +1,CTCACCCC,,37584,0.000282,0.000282 +1,CCCCCGTC,,37499,0.000281,0.000281 +1,TCTTTATT,,37487,0.000281,0.000281 +1,CCGTCCCC,,37482,0.000281,0.000281 +1,TTTTATAT,,37447,0.000281,0.000281 +1,CCTCACCC,,37345,0.000280,0.000280 +1,CCCCTCGC,,37330,0.000280,0.000280 +1,CACCCCCT,,37328,0.000280,0.000280 +1,CGCCCCCT,,37316,0.000280,0.000280 +1,TCTTCCTC,,37211,0.000279,0.000279 +1,TCCTTCTC,,37208,0.000279,0.000279 +1,CCCCGTCC,,37131,0.000279,0.000279 +1,TTCCCTTC,,37130,0.000279,0.000279 +1,TTAATTTT,,37118,0.000279,0.000279 +1,CCCGCCCT,,37068,0.000278,0.000278 +1,CCTCTTCT,,37045,0.000278,0.000278 +1,CCAACCCC,,37035,0.000278,0.000278 +1,CTTTCCCT,,36922,0.000277,0.000277 +1,CCCCACAC,,36899,0.000277,0.000277 +1,TTCTCCTC,,36840,0.000277,0.000276 +1,TTTATTTC,,36770,0.000276,0.000276 +1,CTTTCTCC,,36729,0.000276,0.000276 +1,CTCCTCTT,,36640,0.000275,0.000275 +1,CTGCCCCC,,36632,0.000275,0.000275 +1,CCTTCTTC,,36609,0.000275,0.000275 +1,CCCACACC,,36593,0.000275,0.000275 +1,TTTATATT,,36588,0.000275,0.000275 +1,CCCCGCTC,,36563,0.000274,0.000274 +1,GCCCCCCG,,36562,0.000274,0.000274 +1,CCCTCCGC,,36491,0.000274,0.000274 +1,CTCCACCC,,36469,0.000274,0.000274 +1,CCCTGCCC,,36460,0.000274,0.000274 +1,TTAAAAAA,,36427,0.000273,0.000273 +1,CCACCCCA,,36417,0.000273,0.000273 +1,TCCTCTTC,,36413,0.000273,0.000273 +1,TTTATTAT,,36394,0.000273,0.000273 +1,TTTTATTC,,36392,0.000273,0.000273 +1,CTACCCCC,,36334,0.000273,0.000273 +1,TCATTTTT,,36328,0.000273,0.000273 +1,CTCCCCCG,,36303,0.000273,0.000272 +1,CCTCCCCG,,36283,0.000272,0.000272 +1,TCCTCTCT,,36271,0.000272,0.000272 +1,TTCCTCCT,,36196,0.000272,0.000272 +1,TTCCCTCT,,36140,0.000271,0.000271 +1,TATATTTT,,36119,0.000271,0.000271 +1,TCTATTTT,,36118,0.000271,0.000271 +1,CTCTTTCC,,36099,0.000271,0.000271 +1,CCGCTCCC,,36098,0.000271,0.000271 +1,GCCCGCCC,,36068,0.000271,0.000271 +1,CCATCCCC,,36000,0.000270,0.000270 +1,CCGCCCTC,,35980,0.000270,0.000270 +1,CCTTCTCT,,35971,0.000270,0.000270 +1,TCTCCTTC,,35961,0.000270,0.000270 +1,GCCGCCCC,,35958,0.000270,0.000270 +1,CCACCCTC,,35950,0.000270,0.000270 +1,CCCCTGCC,,35917,0.000270,0.000270 +1,CCCACCAC,,35870,0.000269,0.000269 +1,CTTCCCTT,,35854,0.000269,0.000269 +1,TCTTATTT,,35852,0.000269,0.000269 +1,CGTCCCCC,,35847,0.000269,0.000269 +1,TCTCTCCT,,35822,0.000269,0.000269 +1,TTCCTCTC,,35749,0.000268,0.000268 +1,CTCCTTTC,,35715,0.000268,0.000268 +1,CCCGTCCC,,35693,0.000268,0.000268 +1,ATTAAAAA,,35687,0.000268,0.000268 +1,CTTCTTCC,,35667,0.000268,0.000268 +1,AAAAATTA,,35605,0.000267,0.000267 +1,CCACTCCC,,35603,0.000267,0.000267 +1,CCTCCCGC,,35571,0.000267,0.000267 +1,CTCTCCTT,,35561,0.000267,0.000267 +1,CCGCCTCC,,35494,0.000266,0.000266 +1,TTTTCATT,,35436,0.000266,0.000266 +1,CTTTCCTC,,35356,0.000265,0.000265 +1,GCCCCCGC,,35317,0.000265,0.000265 +1,CCACCTCC,,35286,0.000265,0.000265 +1,CAACCCCC,,35282,0.000265,0.000265 +1,TATTTTTC,,35252,0.000265,0.000265 +1,CGCTCCCC,,35233,0.000264,0.000264 +1,CTCCCCGC,,35194,0.000264,0.000264 +1,TCTCCTCT,,35179,0.000264,0.000264 +1,TTTTACTT,,35135,0.000264,0.000264 +1,TTCTTTAT,,35096,0.000263,0.000263 +1,CACCCCTC,,35052,0.000263,0.000263 +1,CACCCCCA,,35035,0.000263,0.000263 +1,TATTTATT,,35011,0.000263,0.000263 +1,TCTCTCTC,,34962,0.000262,0.000262 +1,CCTGCCCC,,34940,0.000262,0.000262 +1,CATCCCCC,,34929,0.000262,0.000262 +1,CTCTTCCT,,34836,0.000261,0.000261 +1,TTATTATT,,34779,0.000261,0.000261 +1,TTTTCTAT,,34767,0.000261,0.000261 +1,TTCATTTT,,34663,0.000260,0.000260 +1,TTCTTATT,,34607,0.000260,0.000260 +1,CCCGCCTC,,34598,0.000260,0.000260 +1,TTATATTT,,34488,0.000259,0.000259 +1,TATTATTT,,34437,0.000259,0.000258 +1,CGCCTCCC,,34397,0.000258,0.000258 +1,CTCCTTCT,,34391,0.000258,0.000258 +1,TATTTTAT,,34352,0.000258,0.000258 +1,CACCCTCC,,34341,0.000258,0.000258 +1,CTCTTCTC,,34295,0.000257,0.000257 +1,TTTTATCT,,34276,0.000257,0.000257 +1,CCTCGCCC,,34271,0.000257,0.000257 +1,CGCCCCTC,,34247,0.000257,0.000257 +1,CGCCCTCC,,34234,0.000257,0.000257 +1,CACACCCC,,34158,0.000256,0.000256 +1,TACTTTTT,,34141,0.000256,0.000256 +1,TTATTTTC,,34122,0.000256,0.000256 +1,CCACACCC,,34106,0.000256,0.000256 +1,GCTTTTTT,,34085,0.000256,0.000256 +1,TTTCTATT,,34082,0.000256,0.000256 +1,TTTCTTAT,,34020,0.000255,0.000255 +1,CCCTCGCC,,33905,0.000255,0.000254 +1,TTCTATTT,,33888,0.000254,0.000254 +1,CTCCGCCC,,33887,0.000254,0.000254 +1,AAAAAACC,,33869,0.000254,0.000254 +1,CCCGCTCC,,33839,0.000254,0.000254 +1,CACTCCCC,,33794,0.000254,0.000254 +1,TTATTTAT,,33778,0.000254,0.000253 +1,CCACCACC,,33759,0.000253,0.000253 +1,TTTATTCT,,33748,0.000253,0.000253 +1,TTTATCTT,,33743,0.000253,0.000253 +1,TTTACTTT,,33671,0.000253,0.000253 +1,CTTCTCTC,,33663,0.000253,0.000253 +1,CACCTCCC,,33660,0.000253,0.000253 +1,CTCGCCCC,,33649,0.000253,0.000253 +1,CTTCCTTC,,33642,0.000253,0.000252 +1,AAAAATAT,,33621,0.000252,0.000252 +1,CTTCTCCT,,33607,0.000252,0.000252 +1,AATTTTTT,,33604,0.000252,0.000252 +1,ACCAAAAA,,33581,0.000252,0.000252 +1,AATTAAAA,,33428,0.000251,0.000251 +1,TTTCATTT,,33406,0.000251,0.000251 +1,CTTCCTCT,,33154,0.000249,0.000249 +1,CCACCCAC,,33085,0.000248,0.000248 +1,CTCTCTTC,,33047,0.000248,0.000248 +1,AAAATTAA,,32861,0.000247,0.000247 +1,GCCCCGCC,,32809,0.000246,0.000246 +1,CCCCCGGC,,32762,0.000246,0.000246 +1,CCTCCGCC,,32736,0.000246,0.000246 +1,CACCCACC,,32682,0.000245,0.000245 +1,CTCTCTCT,,32618,0.000245,0.000245 +1,TATTTTCT,,32515,0.000244,0.000244 +1,AAAATATA,,32472,0.000244,0.000244 +1,AACCCCCC,,32467,0.000244,0.000244 +1,TATTTCTT,,32419,0.000243,0.000243 +1,CACCACCC,,32396,0.000243,0.000243 +1,TATAAAAA,,32316,0.000243,0.000243 +1,GTTTTTTC,,32293,0.000242,0.000242 +1,CTCCCGCC,,32183,0.000242,0.000242 +1,CGGCCCCC,,32170,0.000241,0.000241 +1,TTATTTCT,,32081,0.000241,0.000241 +1,AAATTAAA,,32067,0.000241,0.000241 +1,AAAATAAT,,32037,0.000240,0.000240 +1,ACCCCCCA,,31904,0.000239,0.000239 +1,TTATTCTT,,31863,0.000239,0.000239 +1,TTACTTTT,,31844,0.000239,0.000239 +1,TATCTTTT,,31780,0.000239,0.000239 +1,CCAAAAAA,,31746,0.000238,0.000238 +1,CACCCCAC,,31570,0.000237,0.000237 +1,AATATAAA,,31457,0.000236,0.000236 +1,CCCCCGCG,,31323,0.000235,0.000235 +1,AAAAACCA,,31298,0.000235,0.000235 +1,CCCCGCGC,,31290,0.000235,0.000235 +1,CCCCGCCG,,31259,0.000235,0.000235 +1,ATTTTTTA,,31192,0.000234,0.000234 +1,TATTCTTT,,31142,0.000234,0.000234 +1,TTATCTTT,,30891,0.000232,0.000232 +1,ACCCCCCT,,30778,0.000231,0.000231 +1,AATAAAAT,,30778,0.000231,0.000231 +1,ATATAAAA,,30733,0.000231,0.000231 +1,AATAAATA,,30717,0.000231,0.000231 +1,CCGGCCCC,,30641,0.000230,0.000230 +1,GTCTTTTT,,30605,0.000230,0.000230 +1,CCCGGCCC,,30521,0.000229,0.000229 +1,AAATAAAT,,30453,0.000229,0.000229 +1,AAATAATA,,30183,0.000227,0.000227 +1,GTTTTTCT,,30175,0.000227,0.000226 +1,CCCCGGCC,,30115,0.000226,0.000226 +1,ACACCCCC,,30084,0.000226,0.000226 +1,AATAATAA,,29842,0.000224,0.000224 +1,AAATATAA,,29800,0.000224,0.000224 +1,ATAAAATA,,29796,0.000224,0.000224 +1,ACCACCCC,,29778,0.000224,0.000223 +1,GTTTTCTT,,29741,0.000223,0.000223 +1,ATAAAAAT,,29678,0.000223,0.000223 +1,ATATTTTT,,29635,0.000222,0.000222 +1,CCGCGCCC,,29563,0.000222,0.000222 +1,ATAATAAA,,29505,0.000221,0.000221 +1,ATAAATAA,,29365,0.000220,0.000220 +1,TAATAAAA,,29348,0.000220,0.000220 +1,TAAAAAAT,,29325,0.000220,0.000220 +1,TTTTTTGC,,29237,0.000219,0.000219 +1,AAAAACAC,,29234,0.000219,0.000219 +1,ATCCCCCC,,29202,0.000219,0.000219 +1,TAAAAATA,,29182,0.000219,0.000219 +1,ATTATTTT,,29179,0.000219,0.000219 +1,ACCCCACC,,28962,0.000217,0.000217 +1,CCCGCCCG,,28863,0.000217,0.000217 +1,TAAATAAA,,28712,0.000216,0.000215 +1,GTTCTTTT,,28688,0.000215,0.000215 +1,ACCCCCTC,,28675,0.000215,0.000215 +1,ACCCACCC,,28612,0.000215,0.000215 +1,CGTTTTTT,,28527,0.000214,0.000214 +1,ATTTTATT,,28508,0.000214,0.000214 +1,CCGCCCCG,,28488,0.000214,0.000214 +1,CTGTTTTT,,28474,0.000214,0.000214 +1,AAAACACA,,28423,0.000213,0.000213 +1,GTTTCTTT,,28381,0.000213,0.000213 +1,ACCCCTCC,,28352,0.000213,0.000213 +1,ACTTTTTT,,28322,0.000213,0.000213 +1,AAAACCAA,,28319,0.000213,0.000213 +1,CCCGCCGC,,28312,0.000213,0.000212 +1,ACTCCCCC,,28309,0.000213,0.000212 +1,AACCAAAA,,28281,0.000212,0.000212 +1,ATTTTTAT,,28222,0.000212,0.000212 +1,CGCGCCCC,,28220,0.000212,0.000212 +1,TCGTTTTT,,28155,0.000211,0.000211 +1,CCGCCCGC,,28149,0.000211,0.000211 +1,ACCCCCAC,,28140,0.000211,0.000211 +1,TAAAATAA,,28136,0.000211,0.000211 +1,ACCCTCCC,,28069,0.000211,0.000211 +1,TTTTTTCG,,28066,0.000211,0.000211 +1,ACAAAAAC,,28048,0.000211,0.000210 +1,ACCTCCCC,,28036,0.000210,0.000210 +1,CGCCCCCG,,27918,0.000210,0.000210 +1,CGCCGCCC,,27811,0.000209,0.000209 +1,CTTTTTTG,,27777,0.000209,0.000208 +1,ACAAAACA,,27743,0.000208,0.000208 +1,AAAACAAC,,27648,0.000208,0.000207 +1,AACAAAAC,,27489,0.000206,0.000206 +1,CTTTTTGT,,27429,0.000206,0.000206 +1,AAACCAAA,,27426,0.000206,0.000206 +1,TCTTTTTG,,27379,0.000206,0.000205 +1,AACAAACA,,27283,0.000205,0.000205 +1,ACACAAAA,,27278,0.000205,0.000205 +1,ATTTATTT,,27235,0.000204,0.000204 +1,TTTTTAAA,,27051,0.000203,0.000203 +1,AACACAAA,,26996,0.000203,0.000203 +1,TTGTTTTC,,26989,0.000203,0.000203 +1,ACAACAAA,,26971,0.000202,0.000202 +1,CGCCCCGC,,26944,0.000202,0.000202 +1,ATTTTTTC,,26936,0.000202,0.000202 +1,CACAAAAA,,26935,0.000202,0.000202 +1,ACAAACAA,,26821,0.000201,0.000201 +1,AACAACAA,,26811,0.000201,0.000201 +1,TTTTTCTG,,26779,0.000201,0.000201 +1,TCTTTTGT,,26425,0.000198,0.000198 +1,AAACAACA,,26411,0.000198,0.000198 +1,TTTTTCGT,,26410,0.000198,0.000198 +1,CCGCCGCC,,26353,0.000198,0.000198 +1,TTTTTGTC,,26261,0.000197,0.000197 +1,CTTTGTTT,,26143,0.000196,0.000196 +1,AAACAAAC,,26110,0.000196,0.000196 +1,CTTGTTTT,,26085,0.000196,0.000196 +1,TTTTGTTC,,26074,0.000196,0.000196 +1,CCCGCGCC,,26054,0.000196,0.000196 +1,CTTTTGTT,,26048,0.000196,0.000195 +1,AAACACAA,,25998,0.000195,0.000195 +1,TGTTTTTC,,25923,0.000195,0.000195 +1,CGCCCGCC,,25349,0.000190,0.000190 +1,TCTTGTTT,,25296,0.000190,0.000190 +1,TTTTTGCT,,25256,0.000190,0.000190 +1,TGCTTTTT,,25242,0.000189,0.000189 +1,ATCTTTTT,,25219,0.000189,0.000189 +1,TCTTTGTT,,25155,0.000189,0.000189 +1,TTCTTTTG,,25141,0.000189,0.000189 +1,TCTGTTTT,,25036,0.000188,0.000188 +1,ATTTTTCT,,24932,0.000187,0.000187 +1,TTGTTCTT,,24893,0.000187,0.000187 +1,TTTTCTTG,,24883,0.000187,0.000187 +1,TTTCTTTG,,24881,0.000187,0.000187 +1,TTTGTTTC,,24745,0.000186,0.000186 +1,TTGTTTCT,,24673,0.000185,0.000185 +1,TTTTCTGT,,24576,0.000184,0.000184 +1,ATTTTCTT,,24553,0.000184,0.000184 +1,TTGCTTTT,,24521,0.000184,0.000184 +1,TTGTCTTT,,24302,0.000182,0.000182 +1,TTTTGTCT,,24224,0.000182,0.000182 +1,TGTTTTCT,,24199,0.000182,0.000182 +1,TTCTTTGT,,24168,0.000181,0.000181 +1,TTTTGCTT,,24083,0.000181,0.000181 +1,TGTTTCTT,,23987,0.000180,0.000180 +1,CAACAAAA,,23964,0.000180,0.000180 +1,CAAAAAAC,,23933,0.000180,0.000180 +1,CAAAAACA,,23924,0.000180,0.000180 +1,ATTCTTTT,,23857,0.000179,0.000179 +1,TTTCTTGT,,23798,0.000179,0.000179 +1,TTCTGTTT,,23778,0.000178,0.000178 +1,ATTTCTTT,,23748,0.000178,0.000178 +1,ACTAAAAA,,23747,0.000178,0.000178 +1,TGTCTTTT,,23554,0.000177,0.000177 +1,TTCGTTTT,,23401,0.000176,0.000176 +1,TTTGTCTT,,23379,0.000175,0.000175 +1,TGTTCTTT,,23370,0.000175,0.000175 +1,AAAAAATC,,23320,0.000175,0.000175 +1,TTTGTTCT,,23314,0.000175,0.000175 +1,CCCCCAAA,,23265,0.000175,0.000175 +1,AAAAAACT,,23245,0.000174,0.000174 +1,TTTTCGTT,,23239,0.000174,0.000174 +1,CAAAACAA,,23185,0.000174,0.000174 +1,CCCCCTTA,,23173,0.000174,0.000174 +1,TTTCGTTT,,23168,0.000174,0.000174 +1,CAAACAAA,,23157,0.000174,0.000174 +1,TTCTTGTT,,23155,0.000174,0.000174 +1,TCAAAAAA,,23154,0.000174,0.000174 +1,TTTAAAAA,,23088,0.000173,0.000173 +1,TTTGCTTT,,22765,0.000171,0.000171 +1,TTCCCCCA,,22755,0.000171,0.000171 +1,TTTCTGTT,,22621,0.000170,0.000170 +1,TTTTATAA,,22527,0.000169,0.000169 +1,CTAAAAAA,,22373,0.000168,0.000168 +1,CCTTTTTA,,22307,0.000167,0.000167 +1,TTTTTCCA,,22228,0.000167,0.000167 +1,TTTTAATA,,22107,0.000166,0.000166 +1,CCCCCATT,,22058,0.000166,0.000166 +1,AAAAATCA,,21532,0.000162,0.000162 +1,TTTATTAA,,21531,0.000162,0.000162 +1,GTTCCCCC,,21516,0.000162,0.000161 +1,ACAAAAAT,,21504,0.000161,0.000161 +1,AAAAACTA,,21470,0.000161,0.000161 +1,ATCAAAAA,,21414,0.000161,0.000161 +1,TCTCCCCA,,21354,0.000160,0.000160 +1,AAAAACAT,,21330,0.000160,0.000160 +1,TTTTAAAT,,21309,0.000160,0.000160 +1,TTTTTACC,,21191,0.000159,0.000159 +1,AAAAATAC,,21153,0.000159,0.000159 +1,GCCCCCTT,,21043,0.000158,0.000158 +1,ACAAAATA,,20947,0.000157,0.000157 +1,CCCCTTCA,,20930,0.000157,0.000157 +1,TCCCCCTA,,20902,0.000157,0.000157 +1,AAAATACA,,20870,0.000157,0.000157 +1,CCCCTCTA,,20783,0.000156,0.000156 +1,AACAAAAT,,20757,0.000156,0.000156 +1,CCCCCTAT,,20683,0.000155,0.000155 +1,AATAAAAC,,20661,0.000155,0.000155 +1,TTTTAAAA,,20628,0.000155,0.000155 +1,ACATAAAA,,20579,0.000154,0.000154 +1,ATAAAAAC,,20576,0.000154,0.000154 +1,AAAATAAC,,20555,0.000154,0.000154 +1,CATAAAAA,,20535,0.000154,0.000154 +1,TTCCCCAC,,20517,0.000154,0.000154 +1,AAAACATA,,20474,0.000154,0.000154 +1,TCCCCTCA,,20440,0.000153,0.000153 +1,TCCTCCCA,,20431,0.000153,0.000153 +1,TACAAAAA,,20427,0.000153,0.000153 +1,AATAAACA,,20393,0.000153,0.000153 +1,TTCCCACC,,20378,0.000153,0.000153 +1,AAAACAAT,,20363,0.000153,0.000153 +1,TTCACCCC,,20339,0.000153,0.000153 +1,TTTTTCAC,,20334,0.000153,0.000153 +1,AACTAAAA,,20331,0.000153,0.000153 +1,ACAATAAA,,20291,0.000152,0.000152 +1,CCATTTTT,,20281,0.000152,0.000152 +1,TCCCTCCA,,20255,0.000152,0.000152 +1,TTTATATA,,20242,0.000152,0.000152 +1,GGTTTTTT,,20212,0.000152,0.000152 +1,AACAAATA,,20188,0.000152,0.000152 +1,CCCTTCCA,,20172,0.000151,0.000151 +1,GCTTCCCC,,20128,0.000151,0.000151 +1,CCCACCTT,,20114,0.000151,0.000151 +1,TTACCCCC,,20100,0.000151,0.000151 +1,CAAAAAAT,,20096,0.000151,0.000151 +1,AAAATCAA,,20084,0.000151,0.000151 +1,AAAAATTT,,20060,0.000151,0.000151 +1,ATAAAACA,,20052,0.000151,0.000150 +1,CCCCACAA,,20040,0.000150,0.000150 +1,TAAAAAAC,,20039,0.000150,0.000150 +1,CCCCACTT,,20035,0.000150,0.000150 +1,AATAACAA,,19970,0.000150,0.000150 +1,CCCAAAAA,,19933,0.000150,0.000150 +1,TTCCACCC,,19916,0.000149,0.000149 +1,ACAAATAA,,19904,0.000149,0.000149 +1,CAAAAATA,,19897,0.000149,0.000149 +1,CTTTTTCA,,19886,0.000149,0.000149 +1,CCCTCCTA,,19866,0.000149,0.000149 +1,AACATAAA,,19841,0.000149,0.000149 +1,AAAACTAA,,19813,0.000149,0.000149 +1,CCTTTATT,,19808,0.000149,0.000149 +1,CCCCCTGT,,19808,0.000149,0.000149 +1,AAATAACA,,19769,0.000148,0.000148 +1,CCTATTTT,,19749,0.000148,0.000148 +1,TTTAATTA,,19744,0.000148,0.000148 +1,TTGCCCCC,,19741,0.000148,0.000148 +1,TTATTTAA,,19716,0.000148,0.000148 +1,TAAAAACA,,19706,0.000148,0.000148 +1,TCCTTTTA,,19694,0.000148,0.000148 +1,TTTAAATT,,19673,0.000148,0.000148 +1,AAATAAAC,,19667,0.000148,0.000148 +1,TTTTCCTA,,19666,0.000148,0.000148 +1,CCTTTTAT,,19650,0.000148,0.000147 +1,GCCTTCCC,,19642,0.000147,0.000147 +1,AATCAAAA,,19635,0.000147,0.000147 +1,AAACAATA,,19617,0.000147,0.000147 +1,TCCCCCAT,,19595,0.000147,0.000147 +1,CCCCCTTG,,19582,0.000147,0.000147 +1,CCCCCGTT,,19561,0.000147,0.000147 +1,TCTACCCC,,19511,0.000146,0.000146 +1,AACAATAA,,19510,0.000146,0.000146 +1,TTTTCTCA,,19499,0.000146,0.000146 +1,CCTTCCCA,,19477,0.000146,0.000146 +1,GTCTCCCC,,19449,0.000146,0.000146 +1,TAAATTTT,,19408,0.000146,0.000146 +1,TTATAAAA,,19396,0.000146,0.000146 +1,CTTCCCCA,,19386,0.000146,0.000145 +1,GTGTTTTT,,19381,0.000145,0.000145 +1,AAACTAAA,,19366,0.000145,0.000145 +1,TATTTTAA,,19358,0.000145,0.000145 +1,ATAAACAA,,19327,0.000145,0.000145 +1,TCTCCACC,,19317,0.000145,0.000145 +1,TTTATAAT,,19302,0.000145,0.000145 +1,CCCACCAA,,19284,0.000145,0.000145 +1,AAACAAAT,,19213,0.000144,0.000144 +1,CCTTATTT,,19208,0.000144,0.000144 +1,AAATACAA,,19180,0.000144,0.000144 +1,CCTCCCTA,,19158,0.000144,0.000144 +1,ATACAAAA,,19153,0.000144,0.000144 +1,GCCCCTCT,,19146,0.000144,0.000144 +1,TCTTTTCA,,19140,0.000144,0.000144 +1,CCCCTCAT,,19126,0.000144,0.000144 +1,ATAACAAA,,19123,0.000144,0.000144 +1,AAATCAAA,,19112,0.000143,0.000143 +1,CTCTTTTA,,19105,0.000143,0.000143 +1,CCCCAACA,,19087,0.000143,0.000143 +1,CCCCTTAC,,19084,0.000143,0.000143 +1,TCCCCACT,,19045,0.000143,0.000143 +1,AATACAAA,,19030,0.000143,0.000143 +1,AAATTTTT,,19027,0.000143,0.000143 +1,CAATAAAA,,19021,0.000143,0.000143 +1,TGTCCCCC,,19001,0.000143,0.000143 +1,GCCCCTTC,,18980,0.000142,0.000142 +1,TCTCCCAC,,18930,0.000142,0.000142 +1,CAAAATAA,,18927,0.000142,0.000142 +1,TTTTATCC,,18913,0.000142,0.000142 +1,CCCTCTCA,,18905,0.000142,0.000142 +1,GCCCTTCC,,18864,0.000142,0.000142 +1,TAAACAAA,,18841,0.000141,0.000141 +1,GCCTCCCT,,18841,0.000141,0.000141 +1,CAAATAAA,,18825,0.000141,0.000141 +1,TCTCACCC,,18768,0.000141,0.000141 +1,GTCCCCCT,,18735,0.000141,0.000141 +1,CCCCAAAC,,18683,0.000140,0.000140 +1,CTTTTCTA,,18675,0.000140,0.000140 +1,CCCTTCAC,,18672,0.000140,0.000140 +1,TAACAAAA,,18670,0.000140,0.000140 +1,GCCCCCCA,,18644,0.000140,0.000140 +1,TCCTACCC,,18639,0.000140,0.000140 +1,TAAAACAA,,18638,0.000140,0.000140 +1,GCCCTCCT,,18626,0.000140,0.000140 +1,AAACATAA,,18626,0.000140,0.000140 +1,CACTTTTT,,18603,0.000140,0.000140 +1,CCCCATCT,,18598,0.000140,0.000140 +1,TATCCCCC,,18596,0.000140,0.000140 +1,CCCTCCAT,,18590,0.000140,0.000140 +1,CCCCTACT,,18580,0.000139,0.000139 +1,CCACCCTT,,18569,0.000139,0.000139 +1,CCCCGCTT,,18558,0.000139,0.000139 +1,TCCTCACC,,18543,0.000139,0.000139 +1,TTAAATTT,,18532,0.000139,0.000139 +1,GCTCTCCC,,18524,0.000139,0.000139 +1,TTTCTTCA,,18461,0.000139,0.000139 +1,TCCCCTAC,,18438,0.000138,0.000138 +1,TCTTTCTA,,18433,0.000138,0.000138 +1,TCCTCCAC,,18406,0.000138,0.000138 +1,TCCCTCAC,,18405,0.000138,0.000138 +1,TATTAAAA,,18388,0.000138,0.000138 +1,GTCCTCCC,,18388,0.000138,0.000138 +1,CCCCATTC,,18388,0.000138,0.000138 +1,CCTCTCCA,,18378,0.000138,0.000138 +1,TTTATTCC,,18365,0.000138,0.000138 +1,TTCCTTTA,,18346,0.000138,0.000138 +1,CTCCCCTA,,18346,0.000138,0.000138 +1,CTTCTTTA,,18333,0.000138,0.000138 +1,TTTATAAA,,18314,0.000137,0.000137 +1,TCCCTACC,,18303,0.000137,0.000137 +1,TTAATTTA,,18276,0.000137,0.000137 +1,TCCCCATC,,18273,0.000137,0.000137 +1,TAATTTTA,,18273,0.000137,0.000137 +1,TCTCTTTA,,18272,0.000137,0.000137 +1,TCCACCCT,,18246,0.000137,0.000137 +1,TATTTATA,,18211,0.000137,0.000137 +1,TCCATTTT,,18200,0.000137,0.000137 +1,GCTCCCCT,,18192,0.000137,0.000137 +1,TCCATCCC,,18187,0.000137,0.000136 +1,TTTAATAT,,18147,0.000136,0.000136 +1,CTCTCCCA,,18130,0.000136,0.000136 +1,CTTTTTAC,,18119,0.000136,0.000136 +1,ATTTAAAA,,18117,0.000136,0.000136 +1,TCCCACCT,,18114,0.000136,0.000136 +1,TTATTATA,,18107,0.000136,0.000136 +1,TTTTCTAC,,18090,0.000136,0.000136 +1,TTTCCTTA,,18088,0.000136,0.000136 +1,TTCCCCCG,,18071,0.000136,0.000136 +1,CTTTCTTA,,18061,0.000136,0.000136 +1,CCCAACAC,,18053,0.000136,0.000135 +1,TCTTCTTA,,18048,0.000135,0.000135 +1,CCCACTCT,,18045,0.000135,0.000135 +1,CCCCTATC,,18040,0.000135,0.000135 +1,TCCCCCGT,,18025,0.000135,0.000135 +1,TTCTTTCA,,18020,0.000135,0.000135 +1,CCCTTACC,,18009,0.000135,0.000135 +1,CCCACACA,,18004,0.000135,0.000135 +1,TGCTCCCC,,18002,0.000135,0.000135 +1,GCCTTTTT,,17994,0.000135,0.000135 +1,TCCCCCTG,,17971,0.000135,0.000135 +1,TTAATAAA,,17908,0.000134,0.000134 +1,GTCCCTCC,,17876,0.000134,0.000134 +1,TCGTCCCC,,17860,0.000134,0.000134 +1,TCTTTTAC,,17848,0.000134,0.000134 +1,AAAATTTA,,17841,0.000134,0.000134 +1,AAAATATT,,17824,0.000134,0.000134 +1,TTTTCCAT,,17811,0.000134,0.000134 +1,GCCTCTCC,,17792,0.000134,0.000134 +1,TATAATTT,,17788,0.000134,0.000133 +1,CCTCCTCA,,17785,0.000134,0.000133 +1,TTCGCCCC,,17778,0.000133,0.000133 +1,CCCACTTC,,17778,0.000133,0.000133 +1,CCCCTCGT,,17741,0.000133,0.000133 +1,CCCTCACT,,17712,0.000133,0.000133 +1,GCCCTCTC,,17701,0.000133,0.000133 +1,CCCAAACC,,17693,0.000133,0.000133 +1,TTTCTCTA,,17680,0.000133,0.000133 +1,TATATTTA,,17676,0.000133,0.000133 +1,TCATCCCC,,17675,0.000133,0.000133 +1,TCCTTATT,,17674,0.000133,0.000133 +1,CACCCCTT,,17674,0.000133,0.000133 +1,CCGCCCTT,,17671,0.000133,0.000133 +1,CCTTCACC,,17666,0.000133,0.000133 +1,CCCCTCTG,,17662,0.000133,0.000133 +1,CTTACCCC,,17646,0.000132,0.000132 +1,CCCATCCT,,17646,0.000132,0.000132 +1,TTTTTTGG,,17637,0.000132,0.000132 +1,TTCTTCTA,,17636,0.000132,0.000132 +1,TCCACTCC,,17625,0.000132,0.000132 +1,TTAATATT,,17613,0.000132,0.000132 +1,CCCAACCA,,17609,0.000132,0.000132 +1,CCCGCCTT,,17601,0.000132,0.000132 +1,TTTTACCT,,17590,0.000132,0.000132 +1,CTCATTTT,,17585,0.000132,0.000132 +1,TTATTAAT,,17571,0.000132,0.000132 +1,TCCTTTAT,,17562,0.000132,0.000132 +1,CTCCCTCA,,17550,0.000132,0.000132 +1,CCCCTTCG,,17535,0.000132,0.000132 +1,GCCTCCTC,,17534,0.000132,0.000132 +1,GCTCCTCC,,17528,0.000132,0.000132 +1,CTTTTCAT,,17516,0.000131,0.000131 +1,TTATAATT,,17507,0.000131,0.000131 +1,CCCATTCC,,17506,0.000131,0.000131 +1,CCTTACCC,,17498,0.000131,0.000131 +1,TTCCCCGC,,17491,0.000131,0.000131 +1,CCCTCCGT,,17489,0.000131,0.000131 +1,GTTTTTCC,,17476,0.000131,0.000131 +1,TTATATTA,,17453,0.000131,0.000131 +1,TATTTAAT,,17431,0.000131,0.000131 +1,CCGTTTTT,,17417,0.000131,0.000131 +1,TTTTCATC,,17416,0.000131,0.000131 +1,CCTCCCAT,,17416,0.000131,0.000131 +1,TTAAAAAT,,17415,0.000131,0.000131 +1,TCCCATCC,,17411,0.000131,0.000131 +1,TTCCGCCC,,17408,0.000131,0.000131 +1,TAATATTT,,17378,0.000130,0.000130 +1,TCCCACTC,,17372,0.000130,0.000130 +1,TTAAAATA,,17371,0.000130,0.000130 +1,GTCCCCTC,,17370,0.000130,0.000130 +1,TCCACCTC,,17356,0.000130,0.000130 +1,TCACCCCT,,17351,0.000130,0.000130 +1,TACCCCCT,,17335,0.000130,0.000130 +1,CTCCTCCA,,17317,0.000130,0.000130 +1,TTAAATAA,,17312,0.000130,0.000130 +1,CTCTTTAT,,17301,0.000130,0.000130 +1,CCCTACCT,,17285,0.000130,0.000130 +1,TGCCCCCT,,17282,0.000130,0.000130 +1,CTCTTATT,,17280,0.000130,0.000130 +1,CTTCCCAC,,17267,0.000130,0.000130 +1,CCTTCCAC,,17264,0.000130,0.000130 +1,CCCCTTGC,,17255,0.000130,0.000129 +1,TTTCTTAC,,17235,0.000129,0.000129 +1,TTTTCACT,,17229,0.000129,0.000129 +1,TAAAAATT,,17217,0.000129,0.000129 +1,TACTCCCC,,17212,0.000129,0.000129 +1,TCCTCCCG,,17201,0.000129,0.000129 +1,TATTATTA,,17198,0.000129,0.000129 +1,TTTTACTC,,17185,0.000129,0.000129 +1,CCCTCTAC,,17160,0.000129,0.000129 +1,TCTGCCCC,,17151,0.000129,0.000129 +1,TTCTCTTA,,17135,0.000129,0.000129 +1,CTCCCCAT,,17131,0.000129,0.000129 +1,TCTCCCCG,,17127,0.000129,0.000129 +1,CTTCCACC,,17125,0.000129,0.000129 +1,CCCCGTTC,,17114,0.000128,0.000128 +1,CCCACAAC,,17110,0.000128,0.000128 +1,CCCCAAAA,,17096,0.000128,0.000128 +1,CTTTTATC,,17080,0.000128,0.000128 +1,CCCCGTCT,,17072,0.000128,0.000128 +1,TTTTTCAA,,17071,0.000128,0.000128 +1,GTTGTTTT,,17068,0.000128,0.000128 +1,TATTAATT,,17054,0.000128,0.000128 +1,CCTCCACT,,17044,0.000128,0.000128 +1,CCACCCAA,,17034,0.000128,0.000128 +1,GCTCCCTC,,17033,0.000128,0.000128 +1,AAAAACCC,,17030,0.000128,0.000128 +1,GTTTTTTG,,17009,0.000128,0.000128 +1,TATTTTCC,,16998,0.000128,0.000128 +1,TAATTATT,,16959,0.000127,0.000127 +1,TCCCCTCG,,16950,0.000127,0.000127 +1,TCCTATTT,,16947,0.000127,0.000127 +1,CCAAACCC,,16935,0.000127,0.000127 +1,TTATTTCC,,16910,0.000127,0.000127 +1,TTAATTAT,,16882,0.000127,0.000127 +1,AATATTTT,,16878,0.000127,0.000127 +1,GTTTTTGT,,16875,0.000127,0.000127 +1,CTTCACCC,,16868,0.000127,0.000127 +1,TCTTTCAT,,16862,0.000127,0.000127 +1,CCGTTCCC,,16841,0.000126,0.000126 +1,TCACTCCC,,16833,0.000126,0.000126 +1,TCTTTATC,,16828,0.000126,0.000126 +1,TCCCCTGC,,16827,0.000126,0.000126 +1,TGCCTCCC,,16826,0.000126,0.000126 +1,TCCCCCAA,,16825,0.000126,0.000126 +1,GGGCCCCC,,16815,0.000126,0.000126 +1,TCGCTCCC,,16814,0.000126,0.000126 +1,TCCTGCCC,,16802,0.000126,0.000126 +1,CTCTATTT,,16798,0.000126,0.000126 +1,CCCATCTC,,16798,0.000126,0.000126 +1,TCCCTCCG,,16783,0.000126,0.000126 +1,CTTTTACT,,16779,0.000126,0.000126 +1,CCTATCCC,,16779,0.000126,0.000126 +1,CCCTATCC,,16777,0.000126,0.000126 +1,TTCTTTAC,,16751,0.000126,0.000126 +1,AATTTAAA,,16741,0.000126,0.000126 +1,TATATATT,,16738,0.000126,0.000126 From d3e6ad053b61f421126e17c0f1599bfbd2d28d4f Mon Sep 17 00:00:00 2001 From: luc Date: Fri, 29 Mar 2024 09:36:17 -0400 Subject: [PATCH 29/87] fix getting read length issue for single index --- demux_run_dag.py | 1 + scripts/get_sequencing_read_data.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index 6c85c45..09f010a 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -171,6 +171,7 @@ def stats(ds, **kwargs): # check if the run is 10X by read length atac, use_bases_mask = scripts.get_sequencing_read_data.main(sequencer_path) + print("read length: {}".format(use_bases_mask)) if use_bases_mask == [29, 89] or atac: # if is atac run, demux is using cellranger mkfastq if atac: diff --git a/scripts/get_sequencing_read_data.py b/scripts/get_sequencing_read_data.py index bdcab04..e8e16f7 100755 --- a/scripts/get_sequencing_read_data.py +++ b/scripts/get_sequencing_read_data.py @@ -36,7 +36,7 @@ def get_sequencing_read_data(sequencer_path): use_bases_mask = "Y" + str(reads_tag[0][1]) + ",I" + str(reads_tag[1][1]) + ",Y" + str(reads_tag[2][1]) + ",Y" + str(reads_tag[3][1]) else: atac = False - use_bases_mask = [reads_tag[0][1], reads_tag[3][1]] + use_bases_mask = [reads_tag[0][1], reads_tag[-1][1]] return(atac, use_bases_mask) From a9f0a1e79104199e209e9295b66ef4d3f52f24db Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 29 Mar 2024 11:15:42 -0400 Subject: [PATCH 30/87] Update cellranger.py updating versions of cellranger and spaceranger to have latest versions when 10X database is implemented --- scripts/cellranger.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index cffe807..ca87b32 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -27,14 +27,14 @@ ACCESS = 0o775 config_dict = { "count": { - "tool": " /igo/work/nabors/tools/cellranger-7.0.0/cellranger count ", + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger count ", "genome": { "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A " } }, "vdj": { - "tool": " /igo/work/nabors/tools/cellranger-7.0.0/cellranger vdj ", + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger vdj ", "genome": { "Human": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0 ", "Mouse": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0 " @@ -55,17 +55,17 @@ } }, "multi": { - "tool": " /igo/work/nabors/tools/cellranger-7.0.0/cellranger multi " + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger multi " }, "arc": { - "tool": " /igo/work/bin/cellranger-arc-2.0.0/cellranger-arc count ", + "tool": " /igo/work/bin/cellranger-arc-2.0.2/cellranger-arc count ", "genome": { "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 ", "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-mm10-2020-A-2.0.0 " } }, "spaceranger": { - "tool": " /igo/work/nabors/tools/spaceranger-2.0.0/spaceranger count ", + "tool": " /igo/work/nabors/tools/spaceranger-3.0.0/spaceranger count ", "genome": { "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/spatial_gex/refdata-gex-mm10-2020-A " @@ -73,7 +73,7 @@ "probe": { "Human": "/igo/work/nabors/genomes/10X_Genomics/spatial_gex/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv", "Human_CytAssist": "/igo/work/genomes/10X_Genomics/spaceranger/Visium_Human_Transcriptome_Probe_Set_v2.0_GRCh38-2020-A.csv", - "Mouse": "/igo/work/nabors/tools/spaceranger-2.0.0/external/tenx_feature_references/targeted_panels/Visium_Mouse_Transcriptome_Probe_Set_v1.0_mm10-2020-A.csv" + "Mouse": "/igo/work/nabors/tools/spaceranger-3.0.0/external/tenx_feature_references/targeted_panels/Visium_Mouse_Transcriptome_Probe_Set_v1.0_mm10-2020-A.csv" } } } From 8fbc4b77894c1d4ee087914f100b8879e4a87af7 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 1 Apr 2024 08:46:19 -0400 Subject: [PATCH 31/87] put 10X config parameter into separate file --- scripts/cellranger_config.py | 78 +++++++++++++++++++++++++++++++++++ scripts/cellranger_spatial.py | 19 +++------ 2 files changed, 84 insertions(+), 13 deletions(-) create mode 100644 scripts/cellranger_config.py diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py new file mode 100644 index 0000000..801ea70 --- /dev/null +++ b/scripts/cellranger_config.py @@ -0,0 +1,78 @@ +# work folder +STATS_AREA = "/igo/stats/CELLRANGER/" + +# config info +ACCESS = 0o775 +config_dict = { + "count": { + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger count ", + "genome": { + "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", + "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A " + } + }, + "vdj": { + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger vdj ", + "genome": { + "Human": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0 ", + "Mouse": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0 " + } + }, + "atac_count": { + "tool": " /igo/work/nabors/tools/cellranger-atac-2.1.0/cellranger-atac count ", + "genome": { + "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-GRCh38-1.0.1 ", + "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-mm10-1.1.0 " + } + }, + "cnv": { + "tool": " /igo/work/nabors/tools/cellranger-dna-1.1.0/cellranger-dna cnv ", + "genome": { + "Human": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCh38-1.0.0 ", + "Mouse": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCm38-1.0.0 " + } + }, + "multi": { + "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger multi " + }, + "arc": { + "tool": " /igo/work/bin/cellranger-arc-2.0.2/cellranger-arc count ", + "genome": { + "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 ", + "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-mm10-2020-A-2.0.0 " + } + }, + "spaceranger": { + "tool": " /igo/work/nabors/tools/spaceranger-3.0.0/spaceranger count ", + "genome": { + "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", + "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/spatial_gex/refdata-gex-mm10-2020-A " + }, + "probe": { + "Human": "/igo/work/nabors/genomes/10X_Genomics/spatial_gex/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv", + "Human_CytAssist": "/igo/work/genomes/10X_Genomics/spaceranger/Visium_Human_Transcriptome_Probe_Set_v2.0_GRCh38-2020-A.csv", + "Mouse": "/igo/work/nabors/tools/spaceranger-3.0.0/external/tenx_feature_references/targeted_panels/Visium_Mouse_Transcriptome_Probe_Set_v1.0_mm10-2020-A.csv", + "Mouse_HD": "/igo/work/nabors/tools/spaceranger-3.0.0/external/tenx_feature_references/targeted_panels/Visium_Mouse_Transcriptome_Probe_Set_v2.0_mm10-2020-A.csv" + } + } +} + +# cellranger command line options +OPTIONS = " --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" + +# 10X recipe list for different pipelines +COUNT_FLAVORS = ["10X_Genomics_GeneExpression-3", "10X_Genomics_GeneExpression-5"] +VDJ_FLAVORS = ["10X_Genomics_VDJ"] +ATAC_FLAVORS = ["10X_Genomics_ATAC"] +CNV_FLAVORS = ["10X_Genomics_CNV"] +ARC_FLAVORS = ["10X_Genomics_Multiome", "10X_Genomics_Multiome_ATAC", "10X_Genomics_Multiome_GeneExpression"] +SPATIAL_FLAVORS = ["10X_Genomics_Visium"] + +# we do not want to PROCESS SAIL (15500) or SCRI (12437) projects +SCRI = "12437" +SAIL = "15500" +DO_NOT_PROCESS = [SCRI, SAIL] + +VISIUM_ENDPOINT = "https://igolims.mskcc.org:8443/LimsRest/getConfig?igoId=" +original_tiff_images_directory = "/rtssdc/mohibullahlab/IGO_Pipeline_Results/Single_Cell/10X_Genomics/TIFF_Images/" +tiff_images_directory = "/igo/work/igo/TIFF_Images/" diff --git a/scripts/cellranger_spatial.py b/scripts/cellranger_spatial.py index 70b27a3..f28e9de 100644 --- a/scripts/cellranger_spatial.py +++ b/scripts/cellranger_spatial.py @@ -1,17 +1,10 @@ import pandas as pd -import sys import os import json import os.path import requests import shutil -import glob - - -ENDPOINT = "https://igolims.mskcc.org:8443/LimsRest/getConfig?igoId=" -original_tiff_images_directory = "/rtssdc/mohibullahlab/IGO_Pipeline_Results/Single_Cell/10X_Genomics/TIFF_Images/" -tiff_images_directory = "/igo/work/igo/TIFF_Images/" - +import scripts.cellranger_config as CONFIG # sample_id can be get from sample sheet, will be the part in front of _IGO_ class Spatial_sample: @@ -28,7 +21,7 @@ def __init__(self, sample, project_id): self.copy_json(project_id) def get_info_from_LIMS(self): - response = requests.get(ENDPOINT + self.IGO_ID , auth = ("pms", "tiagostarbuckslightbike"), verify = False) + response = requests.get(CONFIG.VISIUM_ENDPOINT + self.IGO_ID , auth = ("pms", "tiagostarbuckslightbike"), verify = False) response_data = json.loads(response.text.encode("utf8")) self.chip_position = response_data["chipPosition"] self.chip_id = response_data["chipID"] @@ -37,8 +30,8 @@ def get_info_from_LIMS(self): def copy_tiff(self, project_id): # project_id format as Project_12345 - source_loc_dir = original_tiff_images_directory + project_id - destination_loc = tiff_images_directory + project_id + source_loc_dir = CONFIG.original_tiff_images_directory + project_id + destination_loc = CONFIG.tiff_images_directory + project_id destination_file = destination_loc + "/" + self.sample_name + ".tif" # create TIFF_images director if not exists if not os.path.exists(destination_loc): @@ -56,8 +49,8 @@ def copy_tiff(self, project_id): # copy json file if exists def copy_json(self, project_id): # project_id format as Project_12345 - source_loc = original_tiff_images_directory + project_id + "/json/" + self.sample_name + ".json" - destination_loc = tiff_images_directory + project_id + source_loc = CONFIG.original_tiff_images_directory + project_id + "/json/" + self.sample_name + ".json" + destination_loc = CONFIG.tiff_images_directory + project_id destination_file = destination_loc + "/" + self.sample_name + ".json" # create director if not exists From 8c4d027f2512e11ac27fe84ffa8d579f7bdaa988 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 1 Apr 2024 08:52:07 -0400 Subject: [PATCH 32/87] update test reflecting new cellranger version --- test_scripts.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/test_scripts.py b/test_scripts.py index e9174b0..fe372d6 100644 --- a/test_scripts.py +++ b/test_scripts.py @@ -21,26 +21,13 @@ def testCellranger_generate_cellranger_cmd(): if genome_dict[sample] != "Human" and genome_dict[sample] != "Mouse": genome_dict[sample] = "Mouse" cmd.append(cellranger.generate_cellranger_cmd(sample, "count", genome_dict[sample], fastq_file_list_dict[sample], "DIANA_0453_AHFKJ5DRXY")) - test_result = ["bsub -J DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger.out /igo/work/nabors/tools/cellranger-7.0.0/cellranger count --id=Sample_06265_8869_1_IGO_06265_AG_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_06265_AG/Sample_06265_8869_1_IGO_06265_AG_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", - "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger.out /igo/work/nabors/tools/cellranger-7.0.0/cellranger count --id=Sample_Third-Transcriptome_IGO_11969_E_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3,/igo/staging/FASTQ/DIANA_0454_BH555MDMXY/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", - "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger.out /igo/work/nabors/tools/cellranger-7.0.0/cellranger count --id=Sample_Second_IGO_11969_E_2__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_11969_E/Sample_Second_IGO_11969_E_2,/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Second_IGO_11969_E_2 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200"] + test_result = ["bsub -J DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_06265_8869_1_IGO_06265_AG_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_06265_AG/Sample_06265_8869_1_IGO_06265_AG_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", + "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Third-Transcriptome_IGO_11969_E_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3,/igo/staging/FASTQ/DIANA_0454_BH555MDMXY/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", + "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Second_IGO_11969_E_2__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_11969_E/Sample_Second_IGO_11969_E_2,/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Second_IGO_11969_E_2 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200"] for i in range (3): assert(cmd[i] == test_result[i]) -def testCellranger_get_SCRI_tag(): - sample1 = "SD-1680_Patient_D_nucseq_H_VDJ_IGO_12437_AN_5" - sample2 = "SDtest_IGO_12437_AN_4" - sample3 = "SDtest_GE_IGO_12437_AN_4" - - tag_genome1 = cellranger.get_SCRI_tag(sample1) - tag_genome2 = cellranger.get_SCRI_tag(sample2) - tag_genome3 = cellranger.get_SCRI_tag(sample3) - - assert(tag_genome1 == ("vdj", "Human")) - assert(tag_genome2 == ("Skip", "na")) - assert(tag_genome3 == ("Skip", "na")) - def testCellranger_get_tag(): assert(cellranger.get_tag("10X_genomic") == "Skip") assert(cellranger.get_tag("10X_Genomics_GeneExpression-3") == "count") From 85dd23199bc5e63f63a9968a0a20ee5062acdefe Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 1 Apr 2024 09:35:42 -0400 Subject: [PATCH 33/87] refactor cellranger code --- demux_run_dag.py | 2 +- scripts/cellranger.py | 351 ++++++++++------------------------------ stats_by_project_dag.py | 2 +- 3 files changed, 86 insertions(+), 269 deletions(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index 09f010a..802769d 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -190,7 +190,7 @@ def stats(ds, **kwargs): # step 2, start cell ranger based on recipe/barcode, check whether multiple fastq files existing # trim sequencer_and_run if postfix like _10X exsiting sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3]) - scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix) + scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix) # add DONE file when all the 10X pipeline finished, -K to wait until finish cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/stats/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index ca87b32..5c5149e 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -1,100 +1,19 @@ # launch cell ranger pipeline (GE, VDJ, ATAC....) for 10X samples by recipe -# put result in /igo/stats/CELLRANGER/ - import pandas as pd import re import sys import os import json import subprocess -from os.path import join -from os.path import basename -from os.path import abspath -from os.path import isdir -from subprocess import call +import os.path import scripts.get_sequencing_read_data import scripts.cellranger_spatial +import scripts.cellranger_config as CONFIG """ input: sample_sheet object(for sample list and essential info), sequencer_and_run(for stats folder and fastq file location) output: running cmd for cellranger by sample -""" - -# work folder -STATS_AREA = "/igo/stats/CELLRANGER/" - -# config info -ACCESS = 0o775 -config_dict = { - "count": { - "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger count ", - "genome": { - "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", - "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A " - } - }, - "vdj": { - "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger vdj ", - "genome": { - "Human": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCh38-alts-ensembl-7.0.0 ", - "Mouse": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0 " - } - }, - "atac_count": { - "tool": " /igo/work/nabors/tools/cellranger-atac-2.1.0/cellranger-atac count ", - "genome": { - "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-GRCh38-1.0.1 ", - "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-mm10-1.1.0 " - } - }, - "cnv": { - "tool": " /igo/work/nabors/tools/cellranger-dna-1.1.0/cellranger-dna cnv ", - "genome": { - "Human": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCh38-1.0.0 ", - "Mouse": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCm38-1.0.0 " - } - }, - "multi": { - "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger multi " - }, - "arc": { - "tool": " /igo/work/bin/cellranger-arc-2.0.2/cellranger-arc count ", - "genome": { - "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 ", - "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-mm10-2020-A-2.0.0 " - } - }, - "spaceranger": { - "tool": " /igo/work/nabors/tools/spaceranger-3.0.0/spaceranger count ", - "genome": { - "Human": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A ", - "Mouse": " --transcriptome=/igo/work/nabors/genomes/10X_Genomics/spatial_gex/refdata-gex-mm10-2020-A " - }, - "probe": { - "Human": "/igo/work/nabors/genomes/10X_Genomics/spatial_gex/Visium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv", - "Human_CytAssist": "/igo/work/genomes/10X_Genomics/spaceranger/Visium_Human_Transcriptome_Probe_Set_v2.0_GRCh38-2020-A.csv", - "Mouse": "/igo/work/nabors/tools/spaceranger-3.0.0/external/tenx_feature_references/targeted_panels/Visium_Mouse_Transcriptome_Probe_Set_v1.0_mm10-2020-A.csv" - } - } -} - -# cellranger command line options -OPTIONS = " --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" - -# 10X recipe list for different pipelines -COUNT_FLAVORS = ["10X_Genomics_GeneExpression-3", "10X_Genomics_GeneExpression-5"] -VDJ_FLAVORS = ["10X_Genomics_VDJ"] -ATAC_FLAVORS = ["10X_Genomics_ATAC"] -CNV_FLAVORS = ["10X_Genomics_CNV"] -ARC_FLAVORS = ["10X_Genomics_Multiome", "10X_Genomics_Multiome_ATAC", "10X_Genomics_Multiome_GeneExpression"] -SPATIAL_FLAVORS = ["10X_Genomics_Visium"] -# we do not want to PROCESS SAIL (15500) or SCRI (12437) projects -SCRI = "12437" -SAIL = "15500" -DO_NOT_PROCESS = [SCRI, SAIL] - -""" steps: 1. check whether there is previous fastq existing under /igo/staging/FASTQ (find_fastq_file) 2. get tag by recipe, if recipe not in the list above, skip for now (get_tag) @@ -134,58 +53,30 @@ def find_fastq_file(sample_ID_list): def get_tag(recipe): tag = "Skip" - if recipe in COUNT_FLAVORS: + if recipe in CONFIG.COUNT_FLAVORS: tag = "count" - if recipe in CNV_FLAVORS: + if recipe in CONFIG.CNV_FLAVORS: tag = "cnv" - if recipe in VDJ_FLAVORS: + if recipe in CONFIG.VDJ_FLAVORS: tag = "vdj" - if recipe in ATAC_FLAVORS: + if recipe in CONFIG.ATAC_FLAVORS: tag = "atac_count" - if recipe in ARC_FLAVORS: + if recipe in CONFIG.ARC_FLAVORS: tag = "arc" - if recipe in SPATIAL_FLAVORS: + if recipe in CONFIG.SPATIAL_FLAVORS: tag = "spaceranger" return tag -# return tag and genome according to sample_ID for SCRI samples, all SCRI samples are starting with Project_12437 -# eg: SD-1680_Patient_D_nucseq_H_VDJ_IGO_12437_AN_5 will given tag as vdj, genome as Human -# eg: SDtest_IGO_12437_AN_4 will given tag as Skip, genome as na -# _H: Human, _M: Mouse -# _VDJ: vdj, _GE: count, _ATAC: "atac_count" -def get_SCRI_tag(sample_ID): - tag_orig = sample_ID.split("_")[sample_ID.split("_").index("IGO") - 1] - tag = "Skip" - if tag_orig == "VDJ": - tag = "vdj" - if tag_orig == "GE": - tag = "count" - if tag_orig == "ATAC": - tag = "atac_count" - - genome = "na" - if tag != "Skip": - genome_orig = sample_ID.split("_")[sample_ID.split("_").index("IGO") - 2] - if genome_orig == "H": - genome = "Human" - if genome_orig == "M": - genome = "Mouse" - # if genome parameter couldn't detected, set tag back to skip - if genome == "na": - tag = "Skip" - - return tag, genome - def generate_cellranger_cmd(sample_ID, tag, genome, fastq_file_path, sequencer_and_run): - tool = config_dict[tag]["tool"] - transcriptome = config_dict[tag]["genome"][genome] + tool = CONFIG.config_dict[tag]["tool"] + transcriptome = CONFIG.config_dict[tag]["genome"][genome] project_ID = "Project_" + "_".join(sample_ID.split("_")[sample_ID.split("_").index("IGO") + 1:-1]) - cellranger_cmd = "{}--id=Sample_{}__{}".format(tool, sample_ID, tag) + transcriptome + "--fastqs=" + ",".join(fastq_file_path) + OPTIONS + cellranger_cmd = "{}--id=Sample_{}__{}".format(tool, sample_ID, tag) + transcriptome + "--fastqs=" + ",".join(fastq_file_path) + CONFIG.OPTIONS job_name = "{}_{}_{}_{}_cellranger".format(sequencer_and_run, project_ID, sample_ID, tag) bsub_cmd = "bsub -J {} -o {}.out{}".format(job_name, job_name, cellranger_cmd) return bsub_cmd -def create_json(send_json, sequencer_and_run, project, tag, work_area): +def create_json(send_json, sequencer_and_run, project, work_area): job_id = sequencer_and_run + "_" + project json_data_file = "cellranger_json___" + sequencer_and_run + "__" + project + ".json" with open(json_data_file, "w") as jfile: @@ -250,197 +141,123 @@ def multiome_valid(fastq_list): return [is_valid, ge_list, atac_list] -# Main function: launch cellranger cmd by given samplesheet object and sequencer_and_run -def launch_cellranger(sample_sheet, sequencer_and_run): - # get parameters from sample_sheet - # dictionary of Sample_ID->Project - sample_project_dict = pd.Series(sample_sheet.df_ss_data["Sample_Project"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() - # dictionary of project->sample_ID - project_sample_dict = {} - for sample_ID, project_ID in sample_project_dict.items(): - if project_ID in project_sample_dict.keys(): - project_sample_dict[project_ID].append(sample_ID) - else: - project_sample_dict[project_ID] = [sample_ID] - # dictionary of sample_ID->recipe - sample_recipe_dict = pd.Series(sample_sheet.df_ss_data["Sample_Well"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() - # dictionary of sample_ID->genome - sample_genome_dict = pd.Series(sample_sheet.df_ss_data["Sample_Plate"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() - # dictionary of sample_ID->fastq_list - sample_ID_list = list(sample_project_dict.keys()) - sample_fastqfile_dict = find_fastq_file(sample_ID_list) - - for project in project_sample_dict.keys(): - send_json = {} - send_json["samples"] = [] - # CREATE RUN FOLDER AND PROJECT FOLDER IF NOT ALREADY THERE - os.chdir(STATS_AREA) - runs = next(os.walk("."))[1] - if sequencer_and_run not in runs: - os.mkdir(sequencer_and_run, ACCESS) - - stats_and_run = STATS_AREA + sequencer_and_run - os.chdir(stats_and_run) - projects = next(os.walk("."))[1] - if project not in projects: - os.mkdir(project, ACCESS) - work_area = stats_and_run + "/" + project + "/" - # GO TO project ID LOCATION to start cellranger command - os.chdir(work_area) - - - # SCRI or SAIL samples don't need to be pushed onto qc website - if (not any(prj in project for prj in DO_NOT_PROCESS)): - sample_list = project_sample_dict[project] - # call cellranger for each sample and append info to json dict - for sample in sample_list: - if sample_genome_dict[sample] != "Human" and sample_genome_dict[sample] != "Mouse": - sample_genome_dict[sample] = "Mouse" - tag = get_tag(sample_recipe_dict[sample]) - # if recipe within the tool being set up, lanuch cellranger - if tag == "arc": - validation = multiome_valid(sample_fastqfile_dict[sample]) - if validation[0] == "YES": - create_library_csv_file(validation[1], validation[2], sample) - tool = config_dict[tag]["tool"] - transcriptome = config_dict[tag]["genome"][sample_genome_dict[sample]] - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + OPTIONS - bsub_cmd = "bsub -J {}_{}_{}_ARC -o {}_ARC.out{}".format(sequencer_and_run, project, sample, sample, cmd) - print(bsub_cmd) - subprocess.run(bsub_cmd, shell=True) - else: - print("Multiome sample set not complete yet") - elif tag == "spaceranger": - sample_info = scripts.cellranger_spatial.Spatial_sample(sample, project) - if sample_info.tiff_image == "EMPTY": - print("check tif image") - else: - tool = config_dict[tag]["tool"] - transcriptome = config_dict[tag]["genome"][sample_genome_dict[sample]] - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--fastqs=" + ",".join(sample_fastqfile_dict[sample]) + " --image={} --slide={} --area={}".format(sample_info.tiff_image, sample_info.chip_id, sample_info.chip_position) - - if sample_info.cytAssist: - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--fastqs=" + ",".join(sample_fastqfile_dict[sample]) + " --cytaimage={} --slide={} --area={}".format(sample_info.tiff_image, sample_info.chip_id, sample_info.chip_position) - if sample_genome_dict[sample] == "Human": - probe = config_dict[tag]["probe"]["Human_CytAssist"] - cmd = cmd + " --probe-set={}".format(probe) - elif sample_genome_dict[sample] == "Mouse": - probe = config_dict[tag]["probe"][sample_genome_dict[sample]] - cmd = cmd + " --probe-set={}".format(probe) - - elif sample_info.preservation == "FFPE": - probe = config_dict[tag]["probe"][sample_genome_dict[sample]] - cmd = cmd + " --probe-set={}".format(probe) - - # if there is manual alignment json file availabe, add that to the cmd - if sample_info.json != "EMPTY": - cmd = cmd + " --loupe-alignment={}".format(sample_info.json) - - bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, OPTIONS) - print(bsub_cmd) - subprocess.run(bsub_cmd, shell=True) - - elif tag != "Skip": - cmd = generate_cellranger_cmd(sample, tag, sample_genome_dict[sample], sample_fastqfile_dict[sample], sequencer_and_run) - print(cmd) - subprocess.run(cmd, shell=True) - send_json["samples"].append({"sample":"Sample_" + sample, "type":tag, "project":project, "run":sequencer_and_run}) - if send_json["samples"]: - create_json(send_json, sequencer_and_run, project, tag, work_area) - else: - sample_list = project_sample_dict[project] - # call cellranger for each sample - for sample in sample_list: - tag, genome = get_SCRI_tag(sample) - # if recipe within the tool being set up, lanuch cellranger - if tag != "Skip" and genome != "na": - cmd = generate_cellranger_cmd(sample, tag, genome, sample_fastqfile_dict[sample], sequencer_and_run) - print(cmd) - subprocess.run(cmd, shell=True) - -# lanuch cellranger by given project_directory eg: /igo/staging/FASTQ/RUTH_0141_AH27NGDSX5/Project_13586_B -def lanuch_by_project(project_directory, recipe, species): - # get sample_ID list - sample_list_ori = os.listdir(project_directory) - sample_list = [] - for sample in sample_list_ori: - # remove Sample_ prefix - sample_list.append(sample[7:]) - # get project and run info from project_directory - project = project_directory.split("/")[5] - sequencer_and_run = project_directory.split("/")[4] - sample_fastqfile_dict = find_fastq_file(sample_list) - tag = get_tag(recipe) +# lanuch cellranger per project +def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_dict, sample_recipe_dict): + sample_fastqfile_dict = find_fastq_file(sample_id_list) send_json = {} send_json["samples"] = [] # CREATE RUN FOLDER AND PROJECT FOLDER IF NOT ALREADY THERE - os.chdir(STATS_AREA) + os.chdir(CONFIG.STATS_AREA) runs = next(os.walk("."))[1] if sequencer_and_run not in runs: - os.mkdir(sequencer_and_run, ACCESS) - - stats_and_run = STATS_AREA + sequencer_and_run + os.mkdir(sequencer_and_run, CONFIG.ACCESS) + + stats_and_run = CONFIG.STATS_AREA + sequencer_and_run os.chdir(stats_and_run) projects = next(os.walk("."))[1] if project not in projects: - os.mkdir(project, ACCESS) + os.mkdir(project, CONFIG.ACCESS) work_area = stats_and_run + "/" + project + "/" # GO TO project ID LOCATION to start cellranger command os.chdir(work_area) # call cellranger for each sample and append info to json dict - for sample in sample_list: + for sample in sample_id_list: + if sample_genome_dict[sample] != "Human" and sample_genome_dict[sample] != "Mouse": + sample_genome_dict[sample] = "Mouse" + tag = get_tag(sample_recipe_dict[sample]) # if recipe within the tool being set up, lanuch cellranger if tag == "arc": validation = multiome_valid(sample_fastqfile_dict[sample]) if validation[0] == "YES": create_library_csv_file(validation[1], validation[2], sample) - tool = config_dict[tag]["tool"] - transcriptome = config_dict[tag]["genome"][species] - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}/Sample_{}.csv".format(work_area, sample) + OPTIONS + tool = CONFIG.config_dict[tag]["tool"] + transcriptome = CONFIG.config_dict[tag]["genome"][sample_genome_dict[sample]] + cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + OPTIONS bsub_cmd = "bsub -J {}_{}_{}_ARC -o {}_ARC.out{}".format(sequencer_and_run, project, sample, sample, cmd) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) else: - print("Multiome sample not finished yet") - print(validation) + print("Multiome sample set not complete yet") elif tag == "spaceranger": sample_info = scripts.cellranger_spatial.Spatial_sample(sample, project) if sample_info.tiff_image == "EMPTY": - print("check tif image") + print("check tif image for sample {}".format(sample)) else: - tool = config_dict[tag]["tool"] - transcriptome = config_dict[tag]["genome"][species] + tool = CONFIG.config_dict[tag]["tool"] + transcriptome = CONFIG.config_dict[tag]["genome"][sample_genome_dict[sample]] cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--fastqs=" + ",".join(sample_fastqfile_dict[sample]) + " --image={} --slide={} --area={}".format(sample_info.tiff_image, sample_info.chip_id, sample_info.chip_position) if sample_info.cytAssist: cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--fastqs=" + ",".join(sample_fastqfile_dict[sample]) + " --cytaimage={} --slide={} --area={}".format(sample_info.tiff_image, sample_info.chip_id, sample_info.chip_position) - if species == "Human": - probe = config_dict[tag]["probe"]["Human_CytAssist"] + if sample_genome_dict[sample] == "Human": + probe = CONFIG.config_dict[tag]["probe"]["Human_CytAssist"] cmd = cmd + " --probe-set={}".format(probe) - elif species == "Mouse": - probe = config_dict[tag]["probe"][species] + elif sample_genome_dict[sample] == "Mouse": + probe = CONFIG.config_dict[tag]["probe"][sample_genome_dict[sample]] cmd = cmd + " --probe-set={}".format(probe) elif sample_info.preservation == "FFPE": - probe = config_dict[tag]["probe"][species] + probe = CONFIG.config_dict[tag]["probe"][sample_genome_dict[sample]] cmd = cmd + " --probe-set={}".format(probe) # if there is manual alignment json file availabe, add that to the cmd if sample_info.json != "EMPTY": cmd = cmd + " --loupe-alignment={}".format(sample_info.json) - - bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, OPTIONS) + + bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, CONFIG.OPTIONS) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) - + elif tag != "Skip": - cmd = generate_cellranger_cmd(sample, tag, species, sample_fastqfile_dict[sample], sequencer_and_run) + cmd = generate_cellranger_cmd(sample, tag, sample_genome_dict[sample], sample_fastqfile_dict[sample], sequencer_and_run) print(cmd) subprocess.run(cmd, shell=True) send_json["samples"].append({"sample":"Sample_" + sample, "type":tag, "project":project, "run":sequencer_and_run}) + if send_json["samples"]: - create_json(send_json, sequencer_and_run, project, tag, work_area) + create_json(send_json, sequencer_and_run, project, work_area) + +# Main function: launch cellranger cmd by given samplesheet object and sequencer_and_run +def launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run): + # get parameters from sample_sheet + # dictionary of Sample_ID->Project + sample_project_dict = pd.Series(sample_sheet.df_ss_data["Sample_Project"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() + # dictionary of project->sample_ID + project_sample_dict = {} + for sample_ID, project_ID in sample_project_dict.items(): + if project_ID in project_sample_dict.keys(): + project_sample_dict[project_ID].append(sample_ID) + else: + project_sample_dict[project_ID] = [sample_ID] + # dictionary of sample_ID->recipe + sample_recipe_dict = pd.Series(sample_sheet.df_ss_data["Sample_Well"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() + # dictionary of sample_ID->genome + sample_genome_dict = pd.Series(sample_sheet.df_ss_data["Sample_Plate"].values,index=sample_sheet.df_ss_data["Sample_ID"]).to_dict() + # launch cellranger cmd for each project + for project in project_sample_dict.keys(): + # SCRI or SAIL samples don't need to run cellranger + if (not any(prj in project for prj in CONFIG.DO_NOT_PROCESS)): + sample_list = project_sample_dict[project] + lanuch_by_project(sequencer_and_run, project, sample_list, sample_genome_dict, sample_recipe_dict) + +def launch_cellranger_by_project_location(project_directory, recipe, species): + # get sample_ID list + sample_list_ori = os.listdir(project_directory) + sample_list = [] + for sample in sample_list_ori: + # remove Sample_ prefix + sample_list.append(sample[7:]) + # get project and run info from project_directory + project = project_directory.split("/")[5] + sequencer_and_run = project_directory.split("/")[4] + sample_genome_dict = {} + sample_recipe_dict = {} + for sample in sample_list: + sample_genome_dict[sample] = species + sample_recipe_dict[sample] = recipe + + lanuch_by_project(sequencer_and_run, project, sample_list, sample_genome_dict, sample_recipe_dict) if __name__ == '__main__': @@ -450,4 +267,4 @@ def lanuch_by_project(project_directory, recipe, species): project_directory = sys.argv[1] recipe = sys.argv[2] species = sys.argv[3] - lanuch_by_project(project_directory, recipe, species) + launch_cellranger_by_project_location(project_directory, recipe, species) diff --git a/stats_by_project_dag.py b/stats_by_project_dag.py index b56366d..2f21d2a 100644 --- a/stats_by_project_dag.py +++ b/stats_by_project_dag.py @@ -57,7 +57,7 @@ def run_stats(ds, **kwargs): subprocess.run(cmd, shell=True) elif "10X_" in recipe: - scripts.cellranger.lanuch_by_project(project_directory, recipe, species) + scripts.cellranger.launch_cellranger_by_project_location(project_directory, recipe, species) elif "ONT" in recipe: cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory) print(cmd) From a1ab25e88065a39aea4926f66ed81f6f1fb42ffa Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 1 Apr 2024 09:39:55 -0400 Subject: [PATCH 34/87] Update cellranger.py --- scripts/cellranger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index 5c5149e..d5ed8fa 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -173,7 +173,7 @@ def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_ create_library_csv_file(validation[1], validation[2], sample) tool = CONFIG.config_dict[tag]["tool"] transcriptome = CONFIG.config_dict[tag]["genome"][sample_genome_dict[sample]] - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + OPTIONS + cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + CONFIG.OPTIONS bsub_cmd = "bsub -J {}_{}_{}_ARC -o {}_ARC.out{}".format(sequencer_and_run, project, sample, sample, cmd) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) From 18325f9dda1a42d464f81b0f0e92754658be08f6 Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 2 Apr 2024 11:09:20 -0400 Subject: [PATCH 35/87] add create bam option for new version --- scripts/cellranger_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index 801ea70..73e20ac 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -58,7 +58,7 @@ } # cellranger command line options -OPTIONS = " --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" +OPTIONS = " --create-bam=true --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" # 10X recipe list for different pipelines COUNT_FLAVORS = ["10X_Genomics_GeneExpression-3", "10X_Genomics_GeneExpression-5"] From 43aab4862dfed7c6339ecfbd28cf8c6bdddfd4af Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 2 Apr 2024 11:12:53 -0400 Subject: [PATCH 36/87] Update test_scripts.py --- test_scripts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_scripts.py b/test_scripts.py index fe372d6..8feba1f 100644 --- a/test_scripts.py +++ b/test_scripts.py @@ -21,9 +21,9 @@ def testCellranger_generate_cellranger_cmd(): if genome_dict[sample] != "Human" and genome_dict[sample] != "Mouse": genome_dict[sample] = "Mouse" cmd.append(cellranger.generate_cellranger_cmd(sample, "count", genome_dict[sample], fastq_file_list_dict[sample], "DIANA_0453_AHFKJ5DRXY")) - test_result = ["bsub -J DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_06265_8869_1_IGO_06265_AG_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_06265_AG/Sample_06265_8869_1_IGO_06265_AG_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", - "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Third-Transcriptome_IGO_11969_E_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3,/igo/staging/FASTQ/DIANA_0454_BH555MDMXY/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", - "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Second_IGO_11969_E_2__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_11969_E/Sample_Second_IGO_11969_E_2,/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Second_IGO_11969_E_2 --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200"] + test_result = ["bsub -J DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_06265_AG_06265_8869_1_IGO_06265_AG_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_06265_8869_1_IGO_06265_AG_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-GRCh38-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_06265_AG/Sample_06265_8869_1_IGO_06265_AG_3 --create-bam=true --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", + "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Third-Transcriptome_IGO_11969_E_3_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Third-Transcriptome_IGO_11969_E_3__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3,/igo/staging/FASTQ/DIANA_0454_BH555MDMXY/Project_11969_E/Sample_Third-Transcriptome_IGO_11969_E_3 --create-bam=true --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200", + "bsub -J DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger -o DIANA_0453_AHFKJ5DRXY_Project_11969_E_Second_IGO_11969_E_2_count_cellranger.out /igo/work/nabors/tools/cellranger-8.0.0/cellranger count --id=Sample_Second_IGO_11969_E_2__count --transcriptome=/igo/work/nabors/genomes/10X_Genomics/GEX/refdata-gex-mm10-2020-A --fastqs=/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_11969_E/Sample_Second_IGO_11969_E_2,/igo/staging/FASTQ/DIANA_0450_AH3JL3DSX3/Project_11969_E/Sample_Second_IGO_11969_E_2 --create-bam=true --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200"] for i in range (3): assert(cmd[i] == test_result[i]) From 54128a616ea3b92a886ecbff5ce9f2774fa0fd0d Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 4 Apr 2024 07:34:49 -0400 Subject: [PATCH 37/87] Update demux_run_dag.py new script could not find launch_cellranger. new script is launch_cellranger_by_sample_sheet --- demux_run_dag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index 802769d..7a743f4 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -180,7 +180,7 @@ def stats(ds, **kwargs): # launch cell ranger based on recipe sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3]) - scripts.cellranger.launch_cellranger(sample_sheet, sequencer_and_run_prefix) + scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix) else: # step 1, generate txt files containing total reads and upload to qc website From e22ea68b06d6438b3c6afd6b34ab7b58fd237229 Mon Sep 17 00:00:00 2001 From: luc Date: Thu, 4 Apr 2024 09:07:43 -0400 Subject: [PATCH 38/87] update cellranger arc path --- scripts/cellranger_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index 73e20ac..a45d3c1 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -36,7 +36,7 @@ "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger multi " }, "arc": { - "tool": " /igo/work/bin/cellranger-arc-2.0.2/cellranger-arc count ", + "tool": " /igo/work/nabors/tools/cellranger-arc-2.0.2/cellranger-arc count ", "genome": { "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 ", "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ARC/refdata-cellranger-arc-mm10-2020-A-2.0.0 " From 189c214e2d55942ca5d907b5f0d62df37e79abc1 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Tue, 9 Apr 2024 14:17:54 -0400 Subject: [PATCH 39/87] CELLRANGER AND PIPELINE directories moving CELLRANGER and PIPELINE directories from STATS to STAGING directory --- demux_run_dag.py | 2 +- scripts/cellranger_config.py | 2 +- scripts/cellranger_multi.py | 8 ++++---- scripts/deliver_cellranger.py | 2 +- scripts/deliver_pipeline.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index 7a743f4..12e3733 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -193,7 +193,7 @@ def stats(ds, **kwargs): scripts.cellranger.launch_cellranger_by_sample_sheet(sample_sheet, sequencer_and_run_prefix) # add DONE file when all the 10X pipeline finished, -K to wait until finish - cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/stats/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix) + cmd = 'bsub -K -J wait_stats_done_for_{} -w \"ended(create_json___{}*)\" touch /igo/staging/CELLRANGER/{}/DONE'.format(sequencer_and_run_prefix, sequencer_and_run_prefix, sequencer_and_run_prefix) print(cmd) subprocess.run(cmd, shell=True) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index a45d3c1..0d4a590 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -1,5 +1,5 @@ # work folder -STATS_AREA = "/igo/stats/CELLRANGER/" +STATS_AREA = "/igo/staging/CELLRANGER/" # config info ACCESS = 0o775 diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index 583f6a3..3bead5c 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -66,7 +66,7 @@ def find_fastq_file(sample_ID_list): DRIVE_LOCATION = "/igo/work/igo/Cellranger_Multi_Config/" ORIGIN_DRIVE_LOCATION = "/rtssdc/mohibullahlab/LIMS/LIMS_cellranger_multi/" BAMTOFASTQ = "/igo/work/nabors/tools/cellranger-7.0.0/lib/bin/bamtofastq" -STATS_AREA = "/igo/stats/PIPELINE/" +STATS_AREA = "/igo/staging/PIPELINE/" # endpoint for cellranger multi ENDPOINT= "https://igolims.mskcc.org:8443/LimsRest/getTenxSampleInfo?requestId=" @@ -157,7 +157,7 @@ def new_config_and_generate_cmd(self): # get reads number and sub sample cell number def update_info_from_step1(self, fb_project_id): # get total reads number for gene expression library - reads_file = "/igo/stats/PIPELINE/Project_{}_step1/{}/outs/per_sample_outs/{}/metrics_summary.csv".format(fb_project_id, self.name, list(self.samples.keys())[0]) + reads_file = "/igo/staging/PIPELINE/Project_{}_step1/{}/outs/per_sample_outs/{}/metrics_summary.csv".format(fb_project_id, self.name, list(self.samples.keys())[0]) summary_metrix = pd.read_csv(reads_file) ind = summary_metrix.index[(summary_metrix["Category"] == "Library") & (summary_metrix["Metric Name"] == "Number of reads") & (summary_metrix["Library Type"] == "Gene Expression") & (summary_metrix["Grouped By"] == "Physical library ID")].tolist() reads_number = summary_metrix.iloc[ind[0]]["Metric Value"] @@ -165,7 +165,7 @@ def update_info_from_step1(self, fb_project_id): self.ge_reads_number = reads_number # update sub sample cell number - cell_file = "/igo/stats/PIPELINE/Project_{}_step1/{}/outs/multi/multiplexing_analysis/tag_calls_summary.csv".format(fb_project_id, self.name) + cell_file = "/igo/staging/PIPELINE/Project_{}_step1/{}/outs/multi/multiplexing_analysis/tag_calls_summary.csv".format(fb_project_id, self.name) cell_matrix = pd.read_csv(cell_file) for key, value in self.samples.items(): if value in cell_matrix["Category"].values: @@ -286,7 +286,7 @@ def cellragner_ch_vdj(config, file_name, ch_project_ID, project_ID, ge): # create bam2fastq cmd per sub sample for key in config.sub_sample_info.keys(): name2 = ge + "_" + key - source_bam = "/igo/stats/PIPELINE/Project_{}_step1/{}/outs/per_sample_outs/{}/count/sample_alignments.bam".format(ch_project_ID, ge, key) + source_bam = "/igo/staging/PIPELINE/Project_{}_step1/{}/outs/per_sample_outs/{}/count/sample_alignments.bam".format(ch_project_ID, ge, key) destination_bam = "{}Project_{}/bamtofastq/{}".format(CONFIG_AREA, project_ID, name2) cmd = "bsub -K -J {}_bamtofastq -o {}_bamtofastq.out -n 8 -M 8 {} --reads-per-fastq={} {} {}".format(name2, name2, BAMTOFASTQ, config.ge_reads_number, source_bam, destination_bam) print(cmd) diff --git a/scripts/deliver_cellranger.py b/scripts/deliver_cellranger.py index 13d80cd..f1c946a 100644 --- a/scripts/deliver_cellranger.py +++ b/scripts/deliver_cellranger.py @@ -4,7 +4,7 @@ # given project ID, look through cellranger folder and return a list of path of folders need to copy -CELLRANGER_DIR = '/igo/stats/CELLRANGER/' +CELLRANGER_DIR = '/igo/staging/CELLRANGER/' # structure '/igo/stats/CELLRANGER/RUNNAME/PROJECTID/SAMPLEFOLDER # find all the cellranger result given project ID, return a list of address diff --git a/scripts/deliver_pipeline.py b/scripts/deliver_pipeline.py index d72b158..26e8af8 100644 --- a/scripts/deliver_pipeline.py +++ b/scripts/deliver_pipeline.py @@ -7,7 +7,7 @@ - Re-run setaccess.py (on a separate server) At time of delivery for all 10X projects: -- Search under folder /igo/stats/CELLRANGER/ for any possible cell ranger output +- Search under folder /igo/staging/CELLRANGER/ for any possible cell ranger output - If existing, then copy to delivery/pipeline/cellranger directory """ @@ -64,7 +64,7 @@ def deliver_pipeline_output(project, pi, recipe): # if recipe is CRISPRSeq or GeoMx, go to pipeline folder and find output, if exists the copy # add cellranger multi output for featurebarcoding project here for now elif recipe == "CRISPRSeq" or recipe == "GeoMx" or recipe == "GeoMX" or recipe == "10XGenomics_FeatureBarcoding": - pipeline_path = "/igo/stats/PIPELINE/Project_" + project + pipeline_path = "/igo/staging/PIPELINE/Project_" + project if not os.path.exists(pipeline_path): print("No pipeline result available") else: From 9cd961c8b17be65f0360259a06989171cae115ae Mon Sep 17 00:00:00 2001 From: luc Date: Fri, 12 Apr 2024 09:09:32 -0400 Subject: [PATCH 40/87] fixed arc cmd option issue --- scripts/cellranger.py | 2 +- scripts/cellranger_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index d5ed8fa..d238edd 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -173,7 +173,7 @@ def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_ create_library_csv_file(validation[1], validation[2], sample) tool = CONFIG.config_dict[tag]["tool"] transcriptome = CONFIG.config_dict[tag]["genome"][sample_genome_dict[sample]] - cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + CONFIG.OPTIONS + cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--libraries={}Sample_{}.csv".format(work_area, sample) + CONFIG.ARC_OPTIONS bsub_cmd = "bsub -J {}_{}_{}_ARC -o {}_ARC.out{}".format(sequencer_and_run, project, sample, sample, cmd) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index 0d4a590..235488f 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -59,6 +59,7 @@ # cellranger command line options OPTIONS = " --create-bam=true --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" +ARC_OPTIONS = " --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" # 10X recipe list for different pipelines COUNT_FLAVORS = ["10X_Genomics_GeneExpression-3", "10X_Genomics_GeneExpression-5"] From 7633e51991e40b384776f9e4f2ff0f9b21c636ec Mon Sep 17 00:00:00 2001 From: luc Date: Fri, 12 Apr 2024 14:47:47 -0400 Subject: [PATCH 41/87] fix vdj cmd --- scripts/cellranger.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index d238edd..73b7737 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -72,6 +72,8 @@ def generate_cellranger_cmd(sample_ID, tag, genome, fastq_file_path, sequencer_a transcriptome = CONFIG.config_dict[tag]["genome"][genome] project_ID = "Project_" + "_".join(sample_ID.split("_")[sample_ID.split("_").index("IGO") + 1:-1]) cellranger_cmd = "{}--id=Sample_{}__{}".format(tool, sample_ID, tag) + transcriptome + "--fastqs=" + ",".join(fastq_file_path) + CONFIG.OPTIONS + if tag == "vdj": + cellranger_cmd = cellranger_cmd.replace(" --create-bam=true", "") job_name = "{}_{}_{}_{}_cellranger".format(sequencer_and_run, project_ID, sample_ID, tag) bsub_cmd = "bsub -J {} -o {}.out{}".format(job_name, job_name, cellranger_cmd) return bsub_cmd From 9c72cf417f85551ff4cfa4f9d6d28e5f46a25404 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sat, 13 Apr 2024 11:24:10 -0700 Subject: [PATCH 42/87] Update LaunchMetrics.py bringing dragen servers ID02 and ID03 back online --- scripts/LaunchMetrics.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index db0fd1c..0573165 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -138,7 +138,6 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # rna_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" rna_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: rna_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) @@ -147,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -176,14 +175,13 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - # dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38-alt_masked.cnv.graph.hla.rna-9-r3.0-1" else: dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/{}".format(sample_parameters["GTAG"]) metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 47dff759e73e50df69c471a465af333d0f3ef132 Mon Sep 17 00:00:00 2001 From: luc Date: Sun, 14 Apr 2024 21:52:09 -0400 Subject: [PATCH 43/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index 3bead5c..c2def93 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -2,7 +2,6 @@ import os import subprocess import glob -from subprocess import call import argparse from collections import OrderedDict import requests @@ -191,7 +190,7 @@ def ch_file_generation(project_id, sample_name): tag_seq_dict = pd.Series(df['Hashtag sequence'].values,index=df['Hashtag Name']).to_dict() sub_sample_dict = {} - sub_sample_lst = df[df["Sample Name in IGO"] == sample_name]["Sample Name"].tolist() + sub_sample_lst = df[str(df["Sample Name in IGO"]) == sample_name]["Sample Name"].tolist() for item in sub_sample_lst: sub_sample_dict[item] = sample_tag_dict[item] @@ -401,7 +400,10 @@ def gather_sample_set_info(sample_name): fb_type.append("Cell Hashing") if "Feature Barcoding" in tag_lst: fb_type.append("Feature Barcoding") - # TODO add vdj type + if "T Cells" in tag_lst: + vdj_type.append("VDJ-T") + if "B Cells" in tag_lst: + vdj_type.append("VDJ-B") print(fb_type, vdj_type) break @@ -417,7 +419,7 @@ def gather_sample_set_info(sample_name): sample_set["ch"] = "_IGO_".join([value[1], key]) if "10X_Genomics_VDJ" in value[2][0]: sample_set["vdj"] = "_IGO_".join([value[1], key]) - + # TODO add vdj type to the whole pipeline return sample_set # TODO check whether a project set is complete to launch pipeline From b809d8127a68e848a2bf888722abae8bf6b21368 Mon Sep 17 00:00:00 2001 From: luc Date: Sun, 14 Apr 2024 21:57:43 -0400 Subject: [PATCH 44/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index c2def93..ebe9ed6 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -190,7 +190,7 @@ def ch_file_generation(project_id, sample_name): tag_seq_dict = pd.Series(df['Hashtag sequence'].values,index=df['Hashtag Name']).to_dict() sub_sample_dict = {} - sub_sample_lst = df[str(df["Sample Name in IGO"]) == sample_name]["Sample Name"].tolist() + sub_sample_lst = df[df["Sample Name in IGO"].astype(str) == str(sample_name)]["Sample Name"].tolist() for item in sub_sample_lst: sub_sample_dict[item] = sample_tag_dict[item] From 90da28e3883b8c6cd5939c5fb56af27c137b6dc8 Mon Sep 17 00:00:00 2001 From: David McManamon Date: Mon, 15 Apr 2024 10:57:18 -0400 Subject: [PATCH 45/87] Update LaunchMetrics.py only host id01 is working currently --- scripts/LaunchMetrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 0573165..16bd85c 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) @@ -300,4 +300,4 @@ def launch_picard(bams_by_lane, run, sample, sample_parameters, work_directory): - \ No newline at end of file + From 9341c6c63a3a4ba5bb4476f4678143b84b56e4be Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 15 Apr 2024 15:38:13 -0400 Subject: [PATCH 46/87] add correct fastq list step after create folder --- scripts/organise_fastq_split_by_lane.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/organise_fastq_split_by_lane.py b/scripts/organise_fastq_split_by_lane.py index f0d81a7..bb7c108 100644 --- a/scripts/organise_fastq_split_by_lane.py +++ b/scripts/organise_fastq_split_by_lane.py @@ -92,7 +92,7 @@ def correct_fastq_list_csv(demux_reports_dir): demux_dir = sys.argv[2] if demux_type == "create": create_fastq_folders(demux_dir) - # add correct fastq list step? + correct_fastq_list_csv(demux_dir+"/Reports") elif demux_type == "correct": correct_sample_folder_name(demux_dir) else: From cc2e1a43c63a37bf8ce6527c560998cba113ed54 Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 16 Apr 2024 15:05:47 -0400 Subject: [PATCH 47/87] add demux stats option for stats dag --- scripts/get_total_reads_from_demux.py | 28 ++++++++++++++------------- stats_by_project_dag.py | 3 +++ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/scripts/get_total_reads_from_demux.py b/scripts/get_total_reads_from_demux.py index 612ef5e..87c0419 100644 --- a/scripts/get_total_reads_from_demux.py +++ b/scripts/get_total_reads_from_demux.py @@ -3,6 +3,7 @@ import numpy import json import re +import os # get total reads number from Demultiplex_Stats.csv file or json file and generate txt files for each sample # add DLP type function. For DLP, only total reads for each project is needed @@ -98,22 +99,23 @@ def run(sample_sheet, sequencer_and_run): print("generate AM txt files to folder: {}".format(stats_done_dir)) # generate AM txt files containing total reads by project ID such as "Project_12754_E" -def by_project(sample_sheet, project_id, sequencer_and_run): +def by_project_location(project_directory): + # get sample_ID list + sample_list_ori = os.listdir(project_directory) + sample_list = [] + for sample in sample_list_ori: + # remove Sample_ prefix + sample_list.append(sample[7:]) + # get run info from project_directory + sequencer_and_run = project_directory.split("/")[4] + sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3]) sequencer = sequencer_and_run.split("_")[0] stats_done_dir = STATS_DONE_DIR_PREFIX + sequencer + "/" - demux_report_file = "/igo/staging/FASTQ/" + sequencer_and_run + "/Reports/Demultiplex_Stats.csv" - # dictionary of Sample_ID->Project - sample_project_dict = pd.Series(sample_sheet.df_ss_data['Sample_Project'].values,index=sample_sheet.df_ss_data['Sample_ID']).to_dict() - - sample_ID_list = [] - # filter sample_ID by projectID and append to sample_ID_list - for sample, project in sample_project_dict.items(): - if project == project_id: - sample_ID_list.append(sample) - - total_reads_dict = get_total_reads(sample_ID_list, demux_report_file) - for sample in sample_ID_list: + demux_report_file = project_directory + "/Reports/Demultiplex_Stats.csv" + + total_reads_dict = get_total_reads(sample_list, demux_report_file) + for sample in sample_list: write_to_am_txt(sequencer_and_run_prefix, sample, total_reads_dict[sample], stats_done_dir) print("generate AM txt files to folder: {}".format(stats_done_dir)) diff --git a/stats_by_project_dag.py b/stats_by_project_dag.py index 2f21d2a..b99dc9e 100644 --- a/stats_by_project_dag.py +++ b/stats_by_project_dag.py @@ -23,6 +23,7 @@ def run_stats(ds, **kwargs): import subprocess import scripts.cellranger_multi import os + import scripts.get_total_reads_from_demux project_directory = kwargs["params"]["project_directory"] recipe = kwargs["params"]["recipe"] @@ -62,6 +63,8 @@ def run_stats(ds, **kwargs): cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory) print(cmd) subprocess.run(cmd, shell=True) + elif recipe == "demux_stats": + scripts.get_total_reads_from_demux.by_project_location(project_directory) else: scripts.calculate_stats.main([project_directory, recipe, species]) From 1ec9c3bb77d20c9978b92218ebb2b3008e79687b Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 16 Apr 2024 15:13:29 -0400 Subject: [PATCH 48/87] Update get_total_reads_from_demux.py --- scripts/get_total_reads_from_demux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_total_reads_from_demux.py b/scripts/get_total_reads_from_demux.py index 87c0419..9cf02b8 100644 --- a/scripts/get_total_reads_from_demux.py +++ b/scripts/get_total_reads_from_demux.py @@ -112,7 +112,7 @@ def by_project_location(project_directory): sequencer_and_run_prefix = "_".join(sequencer_and_run.split("_")[0:3]) sequencer = sequencer_and_run.split("_")[0] stats_done_dir = STATS_DONE_DIR_PREFIX + sequencer + "/" - demux_report_file = project_directory + "/Reports/Demultiplex_Stats.csv" + demux_report_file = "/igo/staging/FASTQ/" + sequencer_and_run + "/Reports/Demultiplex_Stats.csv" total_reads_dict = get_total_reads(sample_list, demux_report_file) for sample in sample_list: From 326971a511ad0d7324c4cade4686273ef1629c03 Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 17 Apr 2024 14:33:33 -0400 Subject: [PATCH 49/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index ebe9ed6..b04a987 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -234,8 +234,8 @@ def gather_config_info(sample_dict, genome, IGO_ID): config.gene_expression["cmo-set"] = CONFIG_AREA + "Project_{}/Project_{}_ch_{}.csv".format(project_ID, project_ID, sample_name) config.samples = ch_file_generation(project_ID, sample_name) - # if both ch and fb are there, change the ch name - if "ch" in sample_dict.keys() and "fb" in sample_dict.keys(): + # if both ch and fb are there and vdj not there, change the ch name + if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" not in sample_dict.keys(): sample_dict["ch"] = sample_dict["ch"].replace("FB_IGO", "CH_IGO") # find fastq files for each sample and append information into config["libraries"] From 177a0ee1377217516e443d8c2ab1e9e88102428a Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 17 Apr 2024 14:46:38 -0400 Subject: [PATCH 50/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index b04a987..63562fe 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -449,6 +449,7 @@ def gather_sample_set_info(sample_name): genome = args.genome config = gather_config_info(sample_dict, genome, args.ge) + print(config.lirbaries) project_ID = "_".join(args.ge.split("IGO_")[1].split("_")[:-1]) file_name = "{}Project_{}/{}.csv".format(CONFIG_AREA, project_ID, args.ge) From ba8b9ed1d7f23bb48fa6ed517674499a685dbfb6 Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 17 Apr 2024 14:56:21 -0400 Subject: [PATCH 51/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index 63562fe..47f9b1d 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -235,7 +235,7 @@ def gather_config_info(sample_dict, genome, IGO_ID): config.samples = ch_file_generation(project_ID, sample_name) # if both ch and fb are there and vdj not there, change the ch name - if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" not in sample_dict.keys(): + if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and ("vdj" not in sample_dict.keys()): sample_dict["ch"] = sample_dict["ch"].replace("FB_IGO", "CH_IGO") # find fastq files for each sample and append information into config["libraries"] @@ -244,6 +244,7 @@ def gather_config_info(sample_dict, genome, IGO_ID): sample_list.append(i) fastq_list = find_fastq_file(sample_list) for key, value in sample_dict.items(): + print("key: {}, value: {}".format(key, value)) if key == "ge": config.lirbaries[value] = [fastq_list[value], "Gene Expression"] elif key == "vdj": From 051e30b133b443e1b5f690bc8c7aa4240ddc9c3a Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 17 Apr 2024 15:17:38 -0400 Subject: [PATCH 52/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index 47f9b1d..47838b4 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -114,6 +114,7 @@ def write_ch_ge_only_to_csv(self, name_of_file): file.write("\n[libraries]\nfastq_id,fastqs,feature_types\n") for key, value in self.lirbaries.items(): + key.replace("_CHMARKER_", "") if value[1] == "Gene Expression" or value[1] == "Multiplexing Capture": for i in value[0]: file.write("{},{},{}\n".format(key, i, value[1])) @@ -252,7 +253,11 @@ def gather_config_info(sample_dict, genome, IGO_ID): elif key == "fb": config.lirbaries[value] = [fastq_list[value], "Antibody Capture"] elif key == "ch": - config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"] + # for case of all ch, fb and vdj exits and doesn't need to make two copies of fb fastq file + if "ch" in sample_dict.keys() and "fb" in sample_dict.keys() and "vdj" in sample_dict.keys(): + config.lirbaries[value + "_CHMARKER_"] = [fastq_list[value], "Multiplexing Capture"] + else: + config.lirbaries[value] = [fastq_list[value], "Multiplexing Capture"] return config From 7f974d891760cc8ae63b36eede8909fb7fa4510d Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 17 Apr 2024 15:21:09 -0400 Subject: [PATCH 53/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index 47838b4..dae2b40 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -114,7 +114,7 @@ def write_ch_ge_only_to_csv(self, name_of_file): file.write("\n[libraries]\nfastq_id,fastqs,feature_types\n") for key, value in self.lirbaries.items(): - key.replace("_CHMARKER_", "") + key = key.replace("_CHMARKER_", "") if value[1] == "Gene Expression" or value[1] == "Multiplexing Capture": for i in value[0]: file.write("{},{},{}\n".format(key, i, value[1])) From 766ee25ce427a0ab25a180fae592f5e3188e5ab3 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 19 Apr 2024 10:25:28 -0400 Subject: [PATCH 54/87] updating LaunchMetrics.py pointing script to new methylated tables for hg38 and grcm39 for dragen 4.2 added id02 and id03 dragen servers back to production. --- scripts/run_param_config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index f74814a..ef2e78a 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -18,11 +18,12 @@ HAPLOTYPE_MAP = "HAPLOTYPE_MAP" # 3) Determined by recipe (see: recipe_options_mapping) -BAITS="BAITS" -TARGETS="TARGETS" -MSKQ="MSKQ" -MD="MD" -DGN_REFERENCE="DGN_REFERENCE" +BAITS = "BAITS" +TARGETS = "TARGETS" +MSKQ = "MSKQ" +MD = "MD" +DGN_REFERENCE = "DGN_REFERENCE" +DGN_REFERENCE = "DGN_REFERENCE" """ D E P E N D E N C Y G R A P H +-----------+ From 13a544a9e6cfae0b25c2634ff0ac453792763515 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 19 Apr 2024 10:29:59 -0400 Subject: [PATCH 55/87] Update LaunchMetrics.py pointing script to new methylated tables for hg38 and grcm39 for dragen 4.2 added id02 and id03 dragen servers back to production. --- scripts/LaunchMetrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 0573165..1fcb676 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -16,7 +16,7 @@ # Global Variable : we do not want to process these experiments in this script DO_NOT_PROCESS = ["DLP"] # These recipes will be evaluated using DRAGEN because of their larger size of fastqs -RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"] +RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq", "MethylCaptureSeq"] # these projects willl only need demux stats DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] @@ -218,13 +218,13 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di # get the correct path for the reference if (sample_parameters["GTAG"] == "GRCh38"): - dragen_path = "/igo/work/igo/dragen_hash_tables/hg38_methylated" + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/hg38_methylated" else: - dragen_path = "/igo/work/igo/dragen_hash_tables/grcm39_methylated" + dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/grcm39_methylated" metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From a08df4add833c69f2d26cd745456e9876ea826ae Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Fri, 19 Apr 2024 10:33:52 -0400 Subject: [PATCH 56/87] Update LaunchMetrics.py adding bin memory option to dragen rna and dragen methylation options --- scripts/LaunchMetrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 8f993dc..86907f2 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -145,7 +145,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo rna_dragen_job_name_header = "{}___RNA_DRAGEN___".format(run) - launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} ".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) + launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -223,7 +223,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di dragen_path = "/igo/work/igo/dragen_hash_tables/4.2/grcm39_methylated" metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) - launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) + launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From e9299aa04d5b5868d6075fd7fba884e7273d63f4 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 10:01:34 -0400 Subject: [PATCH 57/87] Update LaunchMetrics.py bringing ID02 and ID03 back online --- scripts/LaunchMetrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 86907f2..7789d20 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -21,7 +21,7 @@ DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] # Organisms to have DRAGEN BAMS -DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"] +DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39", "dm6"] # this list contains the headers of the columns. we will access the data using these listings PICARD_VERSION = "2_23_2" PICARD_JAR = "/igo/home/igo/resources/picard2.23.2/picard.jar " @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From c7817240ebe84087daeb07f33c4d1671a9cb8cfb Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 11:06:12 -0400 Subject: [PATCH 58/87] updates for drosophila changing entries in run param config script to point to correct location of drosophila genome. taking out dm6 (drosophila) from the DRAGEN_RNA_GENOMES list --- scripts/LaunchMetrics.py | 2 +- scripts/run_param_config.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 7789d20..014919a 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -21,7 +21,7 @@ DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] # Organisms to have DRAGEN BAMS -DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39", "dm6"] +DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"] # this list contains the headers of the columns. we will access the data using these listings PICARD_VERSION = "2_23_2" PICARD_JAR = "/igo/home/igo/resources/picard2.23.2/picard.jar " diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index ef2e78a..a8f4654 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -281,15 +281,15 @@ def get_ordered_dic(unordered_dic): }, "dm6": { DEFAULT: { - GENOME: "/igo/work/nabors/genomes/Drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", - REFERENCE: "/igo/work/nabors/genomes/Drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa" + GENOME: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", + REFERENCE: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa" }, "RNA": { - GENOME: "/igo/work/nabors/genomes/Drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", - REFERENCE: "/igo/work/nabors/genomes/Drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", - REF_FLAT: "/igo/work/nabors/genomes/Drosophila_melanogaster/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf.ref.flat", - RIBOSOMAL_INTERVALS: "/igo/work/nabors/genomes/Drosophila_melanogaster/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf.bed.rRNA.intervals", - GTF: "/igo/work/nabors/genomes/Drosophila_melanogaster/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf", + GENOME: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", + REFERENCE: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa", + REF_FLAT: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf.refFlat", + RIBOSOMAL_INTERVALS: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf.bed.rRNA.intervals", + GTF: "/igo/work/nabors/genomes/Drosophila_melanogaster/ENSEMBL/GTF/Drosophila_melanogaster.BDGP6.46.110.gtf", GTAG: "dm6" } }, From 4af41c4d277927524247a75ef2b9429bffd3d991 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 11:31:39 -0400 Subject: [PATCH 59/87] Update LaunchMetrics.py took out MethylCaptureSeq from RUN_ON_DRAGEN table --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 014919a..c84d0b2 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -16,7 +16,7 @@ # Global Variable : we do not want to process these experiments in this script DO_NOT_PROCESS = ["DLP"] # These recipes will be evaluated using DRAGEN because of their larger size of fastqs -RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq", "MethylCaptureSeq"] +RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"] # these projects willl only need demux stats DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] From ee933011d536ebecc5c5bbce454594c8dccfcb12 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 12:34:09 -0400 Subject: [PATCH 60/87] Update LaunchMetrics.py taking ID01 out of production. right now it is at 98.5%. will let the PE150 jobs launch on ID02 and ID03 --- scripts/LaunchMetrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index c84d0b2..9fd76f2 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) @@ -224,7 +224,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 38530976a6419f8fa93776bed5b9aa255ae766f3 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 15:33:53 -0400 Subject: [PATCH 61/87] Update LaunchMetrics.py switch RNA to ID01 to use up some of the license of ID01 --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 9fd76f2..4007eda 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) From a7d2eb49a2fa9e8cfc06d99585d65adeb7461708 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 21 Apr 2024 15:43:38 -0400 Subject: [PATCH 62/87] Update LaunchMetrics.py switch to dragen servers ID02 and ID03 so stats won't fail because of license expired on ID01 --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 4007eda..9fd76f2 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) From 2c0ff8109832d4045f800c15ac47bd1b8735eed6 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 29 Apr 2024 17:31:30 -0400 Subject: [PATCH 63/87] Update run_param_config.py update for genome for MIssionBio-Heme to run large samples on DRAGEN --- scripts/run_param_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index a8f4654..7fa0307 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -97,7 +97,7 @@ def get_ordered_dic(unordered_dic): "RDM": "hg19", "myTYPE_V1": "hg19", "PanCancerV2": "hg19", - "MissionBio-Heme": "hg19", + "MissionBio-Heme": "GRCh38", "WholeExome_v4": "hg19", "AmpliSeq": "hg19", "HemeBrainPACT_v1": "hg19" @@ -575,8 +575,8 @@ def get_ordered_dic(unordered_dic): MD: "yes" }, "MissionBio-Heme": { - BAITS: "/igo/home/igo/resources/ilist/MissionBio-Heme/AML_BAITS.iList", - TARGETS: "/igo/home/igo/resources/ilist/MissionBio-Heme/AML_BAITS.iList", + BAITS: "/igo/work/nabors/bed_files/Mission_Bio/hg38/MissionBio-Heme_BAITS.iList", + TARGETS: "/igo/work/nabors/bed_files/Mission_Bio/hg38/MissionBio-Heme_TARGETS.iList", MSKQ: "no", MD: "yes" }, From e1b3fbd336f6f945be7de609e6461cde26b500e7 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 9 May 2024 13:44:56 -0400 Subject: [PATCH 64/87] Update LaunchMetrics.py putting id01 back into production --- scripts/LaunchMetrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 9fd76f2..c84d0b2 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -146,7 +146,7 @@ def dragen_rna_alignment_and_metrics(sample, run, sample_parameters, rna_directo launch_dragen_rna = "/opt/edico/bin/dragen -f -r {} --fastq-list {} --fastq-list-sample-id {} -a {} --intermediate-results-dir /staging/temp --enable-map-align true --enable-sort true --enable-bam-indexing true --enable-map-align-output true --output-format BAM --enable-rna true --enable-duplicate-marking true --enable-rna-quantification true --output-file-prefix {} --output-directory {} --bin_memory 50000000000".format(rna_path, fastq_list, sample.sample_id, sample_parameters["GTF"], sample.sample_id, rna_directory) - bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) + bsub_launch_dragen_rna = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(rna_dragen_job_name_header, sample.sample_id, rna_directory, launch_dragen_rna) print(bsub_launch_dragen_rna) call(bsub_launch_dragen_rna, shell = True) @@ -181,7 +181,7 @@ def dragen(sample, run, sample_parameters, work_directory, dragen_directory, fas metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen = "/opt/edico/bin/dragen --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_job_name_header, sample.sample_id, dragen_directory, launch_dragen) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) @@ -224,7 +224,7 @@ def dragen_methylation(sample, run, sample_parameters, work_directory, dragen_di metric_file_prefix = "{}___P{}___{}___{}".format(run, sample.project[8:], sample.sample_id, sample_parameters["GTAG"]) launch_dragen_methylation = "/opt/edico/bin/dragen --enable-methylation-calling true --methylation-protocol directional --ref-dir {} --fastq-list {} --fastq-list-sample-id {} --intermediate-results-dir /staging/temp --output-directory {} --output-file-prefix {} --enable-sort true --enable-duplicate-marking true --bin_memory 50000000000".format(dragen_path, fastq_list, sample.sample_id, dragen_directory, sample.sample_id) - bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) + bsub_launch_dragen = "bsub -J {0}{1} -o {0}{1}.out -cwd \"{2}\" -m \"id01 id02 id03\" -q dragen -n 48 -M 4 {3}".format(dragen_methylation_job_name_header, sample.sample_id, dragen_directory, launch_dragen_methylation) print(bsub_launch_dragen) call(bsub_launch_dragen, shell = True) From 37e925793148a10523500b3bf490cd5dd91c800c Mon Sep 17 00:00:00 2001 From: luc Date: Fri, 10 May 2024 16:28:33 -0400 Subject: [PATCH 65/87] update recipe change --- deliver_pipeline_dag.py | 1 + scripts/cellranger_config.py | 24 ++------ scripts/deliver_pipeline.py | 106 +++++++++++++---------------------- stats_by_project_dag.py | 4 +- 4 files changed, 47 insertions(+), 88 deletions(-) diff --git a/deliver_pipeline_dag.py b/deliver_pipeline_dag.py index e3fe37b..dd1fde7 100644 --- a/deliver_pipeline_dag.py +++ b/deliver_pipeline_dag.py @@ -25,6 +25,7 @@ def deliver(ds, **kwargs): project = kwargs["params"]["project"] pi = kwargs["params"]["pi"] + # recipe here is actually request name recipe = kwargs["params"]["recipe"] print("Delivering the pipeline output and/or .bams for {} {} {}".format(project, pi, recipe)) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index 235488f..e4e6105 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -18,20 +18,6 @@ "Mouse": " --reference=/igo/work/genomes/10X_Genomics/VDJ/refdata-cellranger-vdj-GRCm38-alts-ensembl-7.0.0 " } }, - "atac_count": { - "tool": " /igo/work/nabors/tools/cellranger-atac-2.1.0/cellranger-atac count ", - "genome": { - "Human": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-GRCh38-1.0.1 ", - "Mouse": " --reference=/igo/work/nabors/genomes/10X_Genomics/ATAC/refdata-cellranger-atac-mm10-1.1.0 " - } - }, - "cnv": { - "tool": " /igo/work/nabors/tools/cellranger-dna-1.1.0/cellranger-dna cnv ", - "genome": { - "Human": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCh38-1.0.0 ", - "Mouse": " --reference=/igo/work/nabors/10X_Genomics/CNV/refdata-GRCm38-1.0.0 " - } - }, "multi": { "tool": " /igo/work/nabors/tools/cellranger-8.0.0/cellranger multi " }, @@ -62,12 +48,10 @@ ARC_OPTIONS = " --nopreflight --jobmode=lsf --mempercore=64 --disable-ui --maxjobs=200" # 10X recipe list for different pipelines -COUNT_FLAVORS = ["10X_Genomics_GeneExpression-3", "10X_Genomics_GeneExpression-5"] -VDJ_FLAVORS = ["10X_Genomics_VDJ"] -ATAC_FLAVORS = ["10X_Genomics_ATAC"] -CNV_FLAVORS = ["10X_Genomics_CNV"] -ARC_FLAVORS = ["10X_Genomics_Multiome", "10X_Genomics_Multiome_ATAC", "10X_Genomics_Multiome_GeneExpression"] -SPATIAL_FLAVORS = ["10X_Genomics_Visium"] +COUNT_FLAVORS = ["SC_Chromium-GEX-3", "SC_Chromium-GEX-5"] +VDJ_FLAVORS = ["SC_Chromium-TCR", "SC_Chromium-BCR"] +ARC_FLAVORS = ["SC_Chromium-Multiome", "SC_Chromium-Multiome_ATAC", "SC_Chromium-Multiome_GEX"] +SPATIAL_FLAVORS = ["ST_Visium"] # we do not want to PROCESS SAIL (15500) or SCRI (12437) projects SCRI = "12437" diff --git a/scripts/deliver_pipeline.py b/scripts/deliver_pipeline.py index 26e8af8..895dc63 100644 --- a/scripts/deliver_pipeline.py +++ b/scripts/deliver_pipeline.py @@ -27,70 +27,59 @@ PICARD = "java -jar /igo/home/igo/resources/picard2.23.2/picard.jar " NGS_STATS_FASTQ_ENDPOINT = "http://igodb.mskcc.org:8080/ngs-stats/permissions/getRequestPermissions/" -def deliver_pipeline_output(project, pi, recipe): - if not project or not pi or not recipe: +def deliver_pipeline_output(project, pi, requestName): + if not project or not pi or not requestName: return "Project, pi and recipe are all required arguments." # change pi to all lowercase pi = pi.lower() delivery_folder = LAB_SHARE_DIR + "/" + pi + "/Project_" + project + "/pipeline" - if recipe.startswith("RNASeq"): + if requestName == "RNALibraryPrep": print("Delivering all RNASeq .bams for {} {} {}".format(project, pi, recipe)) bamdict = find_bams(project, STATS_DIR) bsub_commands = write_bams_to_share(bamdict, delivery_folder) reconcile_bam_fastq_list(project, bamdict) return "Completed RNA bams delivery" - # if is missionbio recipe, find tapestri pipelie output and copy all sample folders - elif recipe == "MissionBio": - tapestri_path = "/igo/staging/stats/MissionBio/Project_" + project - if not os.path.exists(tapestri_path): - print("No tapestri result available") - else: - tapestri_delivery_folder = delivery_folder + "/Tapestri" - if not os.path.exists(tapestri_delivery_folder): - print("Creating pipeline delivery folder {}".format(tapestri_delivery_folder)) - os.makedirs(tapestri_delivery_folder) - - # copy each sample folder to the delivery folder - tapestri_path = tapestri_path + "/" - sample_list = os.listdir(tapestri_path) - for sample in sample_list: - sample_folder = tapestri_path + sample - destination = tapestri_delivery_folder + "/" + sample - print("copy {}".format(sample_folder)) - shutil.copytree(sample_folder, destination, symlinks=True) - - # if recipe is CRISPRSeq or GeoMx, go to pipeline folder and find output, if exists the copy - # add cellranger multi output for featurebarcoding project here for now - elif recipe == "CRISPRSeq" or recipe == "GeoMx" or recipe == "GeoMX" or recipe == "10XGenomics_FeatureBarcoding": - pipeline_path = "/igo/staging/PIPELINE/Project_" + project - if not os.path.exists(pipeline_path): - print("No pipeline result available") - else: - if not os.path.exists(delivery_folder): - print("Creating pipeline delivery folder {}".format(delivery_folder)) - os.makedirs(delivery_folder) - - # copy each sample folder to the delivery folder - pipeline_path = pipeline_path + "/" - sample_list = os.listdir(pipeline_path) - for sample in sample_list: - sample_path = pipeline_path + sample - destination = delivery_folder + "/" + sample - print("copy {}".format(sample_path)) - if os.path.isdir(sample_path): - shutil.copytree(sample_path, destination, symlinks=True) - else: - cmd = "cp {} {}".format(sample_path, destination) - print(cmd) - call(cmd, shell=True) - - # if 10X recipe or SCRI project starting with 12437, copy cell ranger result to project folder - elif recipe.startswith("10XGenomics") or project.startswith("12437_"): + # TCR seq only need deliver manifest, those files located under viale lab drive + # example file: /pskis34/LIMS/TCRseqManifest/Project_13545_TCRseq_Manifest_Beta.csv + elif requestName == "TCRSeq": + pipeline_path_prefix = "/rtssdc/mohibullahlab/LIMS/TCRseqManifest/Project_" + project + "_TCRseq" + TCR_delivery_folder = delivery_folder + "/Manifest" + if not os.path.exists(TCR_delivery_folder): + print("Creating pipeline delivery folder {}".format(TCR_delivery_folder)) + os.makedirs(TCR_delivery_folder) + + cmd = "cp {}* {}/".format(pipeline_path_prefix, TCR_delivery_folder) + print(cmd) + call(cmd, shell=True) + + # For all other projects, check CELLRANGER folder first then PIPELINE folder + else: folder_list = scripts.deliver_cellranger.find_cellranger(project) if len(folder_list) == 0: - print("No cellranger result available") + # check PIPELINE folder + pipeline_path = "/igo/staging/PIPELINE/Project_" + project + if not os.path.exists(pipeline_path): + print("No cellranger/pipeline result available") + else: + if not os.path.exists(delivery_folder): + print("Creating pipeline delivery folder {}".format(delivery_folder)) + os.makedirs(delivery_folder) + + # copy each sample folder to the delivery folder + pipeline_path = pipeline_path + "/" + sample_list = os.listdir(pipeline_path) + for sample in sample_list: + sample_path = pipeline_path + sample + destination = delivery_folder + "/" + sample + print("copy {}".format(sample_path)) + if os.path.isdir(sample_path): + shutil.copytree(sample_path, destination, symlinks=True) + else: + cmd = "cp {} {}".format(sample_path, destination) + print(cmd) + call(cmd, shell=True) else: # create pipeline folder if not exists cellranger_delivery_folder = delivery_folder + "/cellranger" @@ -105,21 +94,6 @@ def deliver_pipeline_output(project, pi, recipe): print("copy {}".format(folder)) shutil.copytree(folder, sample_delivery_name, symlinks=True) - # TCR seq only need deliver manifest, those files located under viale lab drive - # example file: /pskis34/LIMS/TCRseqManifest/Project_13545_TCRseq_Manifest_Beta.csv - elif recipe == "TCRSeq-IGO": - pipeline_path_prefix = "/rtssdc/mohibullahlab/LIMS/TCRseqManifest/Project_" + project + "_TCRseq" - TCR_delivery_folder = delivery_folder + "/Manifest" - if not os.path.exists(TCR_delivery_folder): - print("Creating pipeline delivery folder {}".format(TCR_delivery_folder)) - os.makedirs(TCR_delivery_folder) - - cmd = "cp {}* {}/".format(pipeline_path_prefix, TCR_delivery_folder) - print(cmd) - call(cmd, shell=True) - - else: - print("Pipeline delivery is not needed for recipe {} and project {}".format(recipe, project)) return "Completed pipeline delivery" def find_bams(project, stats_base_dir): diff --git a/stats_by_project_dag.py b/stats_by_project_dag.py index b99dc9e..ddb104d 100644 --- a/stats_by_project_dag.py +++ b/stats_by_project_dag.py @@ -57,9 +57,9 @@ def run_stats(ds, **kwargs): print(cmd) subprocess.run(cmd, shell=True) - elif "10X_" in recipe: + elif "SC_Chromium" in recipe: scripts.cellranger.launch_cellranger_by_project_location(project_directory, recipe, species) - elif "ONT" in recipe: + elif "Nanopore" in recipe: cmd = "bsub -J ont_stats_{} -n 16 -M 16 /igo/work/nabors/tools/venvpy3/bin/python /igo/work/igo/igo-demux/scripts/ont_stats.py {}".format(project_id, project_directory) print(cmd) subprocess.run(cmd, shell=True) From dc946b440818b2b07f2d944e6bbb487ff7b3ca02 Mon Sep 17 00:00:00 2001 From: luc Date: Fri, 10 May 2024 16:38:19 -0400 Subject: [PATCH 66/87] update recipe --- SampleSheet.py | 4 ++-- demux_run_dag.py | 14 +++++--------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/SampleSheet.py b/SampleSheet.py index b41f43d..7907ef8 100644 --- a/SampleSheet.py +++ b/SampleSheet.py @@ -99,7 +99,7 @@ def split_sample_sheet(self): if sample sheet recipes have mixed DLP and other all DLP need to go on a separate sample sheet named "_DLP" """ # if 10x DRAGEN demux add to header CreateFastqForIndexReads,1,,,,,,, - if any("10X_" in s for s in self.recipe_set): + if any("SC_Chromium" in s for s in self.recipe_set): print("Adding CreateFastqForIndexReads,1 to sample sheet header since 10X samples are present") self.df_ss_header.loc[len(self.df_ss_header.index)-1] = ["CreateFastqForIndexReads",1,"","","","","","",""] self.df_ss_header.loc[len(self.df_ss_header.index)] = ["[Data]","","","","","","","",""] @@ -111,7 +111,7 @@ def split_sample_sheet(self): split_ss_list = [ss_copy, self] was_split = False - if "DLP" in self.recipe_set and len(self.recipe_set) > 1: + if "SC_DLP" in self.recipe_set and len(self.recipe_set) > 1: print("Copying all DLP samples to a new sample sheet") # copy all DLP rows to a new sample sheet dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == True].copy() diff --git a/demux_run_dag.py b/demux_run_dag.py index 12e3733..aa026bb 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -66,7 +66,7 @@ def demux(ds, **kwargs): # check if the sample sheet contains DLP project is_DLP = False - if "DLP" in sample_sheet.recipe_set: + if "SC_DLP" in sample_sheet.recipe_set: is_DLP = True dragen_demux = True @@ -214,7 +214,7 @@ def stats(ds, **kwargs): def fingerprinting(ds, **kwargs): # read in sample sheet as arguments, filter out projects that need to run fingerprinting - recipe_list_for_fp = [".*IMPACT*", ".*Heme*", "IDT_Exome*", "WholeExomeSequencing", "Twist_Exome", "MSK-ACCESS*", "CMO-CH", "HumanWholeGenome"] + recipe_list_for_fp = ["PED-PEG", "WGS_Deep", "HC_IMPACT", "HC_IMPACT-Heme", "HC_ACCESS", "WES_Human", "HC_CMOCH"] # call fingerprinting_dag.py for each project samplesheet_path = kwargs["params"]["samplesheet"] @@ -228,13 +228,9 @@ def fingerprinting(ds, **kwargs): project_list_to_run = [] for project, recipe in sample_sheet.project_dict.items(): # fingerprinting only support human - if project_genome_dict[project] == "Human": - for recipe_list_item in recipe_list_for_fp: - print(project, recipe) - expr = re.compile(recipe_list_item) - if expr.match(recipe): - project_list_to_run.append(project) - break + if project_genome_dict[project] == "Human" and recipe in recipe_list_for_fp: + project_list_to_run.append(project) + print("Projects need to run fp: {}".format(project_list_to_run)) if len(project_list_to_run) == 0: return "No project need to run fingerprinting" From c51746524be1f3c280b2eacb6755a29e6b2f5392 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 16 May 2024 10:10:28 -0400 Subject: [PATCH 67/87] Update dragen_csv_to_html.py changing index and index2 to create a demux html file for a run that was demuxed using index2 only --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index 9c871b4..ee3c0d1 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index2"].isnull().values.any(): - df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) + if not df_by_lanes[df_name]["index"].isnull().values.any(): + df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "\n" ) From 3c7a34b647a2a4257a93c445e7dd86065e9187b0 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 16 May 2024 10:15:38 -0400 Subject: [PATCH 68/87] Update dragen_csv_to_html.py changing code back to original for the index,index2 for creating the lane_barcode.html file --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index ee3c0d1..9c871b4 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index"].isnull().values.any(): - df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) + if not df_by_lanes[df_name]["index2"].isnull().values.any(): + df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "

\n" ) From 3daa6d043dcbd3498f7b0376c087bf402e50ce23 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 20 May 2024 08:12:42 -0400 Subject: [PATCH 69/87] Update cellranger.py --- scripts/cellranger.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index 73b7737..0b7e266 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -55,12 +55,8 @@ def get_tag(recipe): tag = "Skip" if recipe in CONFIG.COUNT_FLAVORS: tag = "count" - if recipe in CONFIG.CNV_FLAVORS: - tag = "cnv" if recipe in CONFIG.VDJ_FLAVORS: tag = "vdj" - if recipe in CONFIG.ATAC_FLAVORS: - tag = "atac_count" if recipe in CONFIG.ARC_FLAVORS: tag = "arc" if recipe in CONFIG.SPATIAL_FLAVORS: From e9bb520b9ece7bbfe91e308c5a54ea5ca7af51d1 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 20 May 2024 08:19:07 -0400 Subject: [PATCH 70/87] update test code according to new recipe --- test/SampleSheet_DLP.csv | 38 +++++++++++++++++++------------------- test_scripts.py | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/test/SampleSheet_DLP.csv b/test/SampleSheet_DLP.csv index 987b100..a811a3f 100644 --- a/test/SampleSheet_DLP.csv +++ b/test/SampleSheet_DLP.csv @@ -1,20 +1,20 @@ -[Header],,,,,,,, -IEMFileVersion,4,,,,,,, -Date,11/30/2021,,,,,,, -Workflow,GenerateFASTQ,,,,,,, -Application,MICHELLE,,,,,,, -Assay,,,,,,,, -Description,,,,,,,, -Chemistry,Default,,,,,,, -,,,,,,,, -[Reads],,,,,,,, -151,,,,,,,, -151,,,,,,,, -,,,,,,,, -[Settings],,,,,,,, -BarcodeMismatchesIndex1,0,,,,,,, -BarcodeMismatchesIndex2,0,,,,,,, -[Data],,,,,,,, -Lane,Sample_ID,Sample_Plate,Sample_Well,I7_Index_ID,index,index2,Sample_Project,Description -1,DLPNegativeCONTROL_12345A_3_3_IGO_DLPNegativeCONTROL-2710,Mouse,DLP,DLPi7_03-i5_03,AAGGACAT,AACCCCGT,Project_12345,someone@mskcc.org +[Header],,,,,,,, +IEMFileVersion,4,,,,,,, +Date,11/30/21,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,MICHELLE,,,,,,, +Assay,,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +BarcodeMismatchesIndex1,0,,,,,,, +BarcodeMismatchesIndex2,0,,,,,,, +[Data],,,,,,,, +Lane,Sample_ID,Sample_Plate,Sample_Well,I7_Index_ID,index,index2,Sample_Project,Description +1,DLPNegativeCONTROL_12345A_3_3_IGO_DLPNegativeCONTROL-2710,Mouse,SC_DLP,DLPi7_03-i5_03,AAGGACAT,AACCCCGT,Project_12345,someone@mskcc.org 1,IM-1613_RU1697a_IGO_12437_AD_11,Mouse,10X_Genomics,SI-GA-G9,SI-GA-G9,SI-GA-G9,Project_12437_AD,peerd@mskcc.org \ No newline at end of file diff --git a/test_scripts.py b/test_scripts.py index 8feba1f..1152ff1 100644 --- a/test_scripts.py +++ b/test_scripts.py @@ -30,7 +30,7 @@ def testCellranger_generate_cellranger_cmd(): def testCellranger_get_tag(): assert(cellranger.get_tag("10X_genomic") == "Skip") - assert(cellranger.get_tag("10X_Genomics_GeneExpression-3") == "count") + assert(cellranger.get_tag("SC_Chromium-GEX-3") == "count") def testCellranger_get_sequencer_runID(): fastq_path = "/igo/staging/FASTQ/DIANA_0453_AHFKJ5DRXY/Project_06265_AG/Sample_06265_8869_1_IGO_06265_AG_3" From 2a3904407adf39e3ee458f350d016879d0906619 Mon Sep 17 00:00:00 2001 From: luc Date: Mon, 20 May 2024 10:28:25 -0400 Subject: [PATCH 71/87] Update cellranger_multi.py --- scripts/cellranger_multi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cellranger_multi.py b/scripts/cellranger_multi.py index dae2b40..dd79db2 100644 --- a/scripts/cellranger_multi.py +++ b/scripts/cellranger_multi.py @@ -418,12 +418,12 @@ def gather_sample_set_info(sample_name): for key, value in sample.items(): if value[0].startswith(ilab_request) and key.endswith(sample_number): value[2] = value[2].split(",") - if "10X_Genomics_FeatureBarcoding" in value[2][0]: + if "SC_Chromium-FB-5" in value[2][0]: if "Feature Barcoding" in fb_type: sample_set["fb"] = "_IGO_".join([value[1], key]) if "Cell Hashing" in fb_type: sample_set["ch"] = "_IGO_".join([value[1], key]) - if "10X_Genomics_VDJ" in value[2][0]: + if "SC_Chromium-BCR" in value[2][0] or "SC_Chromium-TCR" in value[2][0]: sample_set["vdj"] = "_IGO_".join([value[1], key]) # TODO add vdj type to the whole pipeline return sample_set From d15734f48ecdc43c9f63b582e7b035066877acf5 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 20 May 2024 13:12:42 -0400 Subject: [PATCH 72/87] Update dragen_csv_to_html.py temporary change to create html file from run demexed with index2 only --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index 9c871b4..ee3c0d1 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index2"].isnull().values.any(): - df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) + if not df_by_lanes[df_name]["index"].isnull().values.any(): + df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "

\n" ) From ac2024c4bb71928b55fb22afa12e3c25c9c69937 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 20 May 2024 13:17:01 -0400 Subject: [PATCH 73/87] Update dragen_csv_to_html.py change code back --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index ee3c0d1..9c871b4 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index"].isnull().values.any(): - df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) + if not df_by_lanes[df_name]["index2"].isnull().values.any(): + df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "

\n" ) From 0114d986ac2afda070a81b906c31437f582a5ee1 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 20 May 2024 16:03:09 -0400 Subject: [PATCH 74/87] Update dragen_csv_to_html.py change again for latest demux --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index 9c871b4..ee3c0d1 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index2"].isnull().values.any(): - df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) + if not df_by_lanes[df_name]["index"].isnull().values.any(): + df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "

\n" ) From d4c5e5b6e7beca53b53c708d809ca1376cb7a3d0 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 20 May 2024 16:10:41 -0400 Subject: [PATCH 75/87] Update dragen_csv_to_html.py change code back to original --- scripts/dragen_csv_to_html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/dragen_csv_to_html.py b/scripts/dragen_csv_to_html.py index ee3c0d1..9c871b4 100644 --- a/scripts/dragen_csv_to_html.py +++ b/scripts/dragen_csv_to_html.py @@ -25,9 +25,9 @@ def build_lane_summary_html(demux_reports_dir, write_to_file): for i in range(1, lane_number + 1): df_name = "top_unknown_lane" + str(i) df_by_lanes[df_name] = top_unknown_barcodes_csv_covert.loc[top_unknown_barcodes_csv_covert["Lane"] == i] - if not df_by_lanes[df_name]["index"].isnull().values.any(): - df_by_lanes[df_name]["index2"] = df_by_lanes[df_name]["index2"].str.cat(df_by_lanes[df_name]["index"], sep="-") - df_by_lanes[df_name] = df_by_lanes[df_name].drop("index", axis=1) + if not df_by_lanes[df_name]["index2"].isnull().values.any(): + df_by_lanes[df_name]["index"] = df_by_lanes[df_name]["index"].str.cat(df_by_lanes[df_name]["index2"], sep="-") + df_by_lanes[df_name] = df_by_lanes[df_name].drop("index2", axis=1) # format two tables in the html with different column headers with open(write_to_file, 'w') as _file: _file.write("

Lane Summary

" + demux_stats_csv_convert.to_html(index = False, float_format = '{:,.0f}'.format) + "\n

Top Unknown Barcodes

\n" + "

\n" ) From 49420eb70865a2d96049f4a0df82afa8465d77af Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Tue, 21 May 2024 16:23:50 -0400 Subject: [PATCH 76/87] new recipe names Updating LaunchMetrics and run_param_config to be able to handle new recipe names from sample sheet --- scripts/LaunchMetrics.py | 6 +-- scripts/run_param_config.py | 83 ++++++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index c84d0b2..94ca747 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -16,9 +16,9 @@ # Global Variable : we do not want to process these experiments in this script DO_NOT_PROCESS = ["DLP"] # These recipes will be evaluated using DRAGEN because of their larger size of fastqs -RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "MouseWholeGenome", "HumanWholeGenome", "PombeWholeGenome", "ChIPSeq", "AmpliconSeq"] +RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "WGS_Deep", "ChIP", "CUT&RUN","Amplicon"] # these projects willl only need demux stats -DEMUX_ONLY = ["SMARTSeq", "10X_Genomics"] +DEMUX_ONLY = ["SMARTSeq", "Chromium", "10X_Genomics"] # Organisms to have DRAGEN BAMS DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"] @@ -85,7 +85,7 @@ def launch_metrics(self, all_samples, run, project_directory): self.dragen(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list) continue # check for methylated samples - if ((sample.recipe == "MethylCaptureSeq") or (sample.recipe == "WholeGenomeBisulfiteSequencing")): + if ("Methyl" in sample.recipe): pathlib.Path(dragen_directory).mkdir(parents = True, exist_ok = True) self.dragen_methylation(sample, run, sample_parameters, work_directory, dragen_directory, fastq_list) continue diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 7fa0307..a35ab98 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -59,25 +59,25 @@ def get_ordered_dic(unordered_dic): Returns: type, OrderedDict: Ordered dictionary by key-length """ - return OrderedDict(sorted(unordered_dic.items(), key=lambda t: -len(t[0]))) + return OrderedDict(sorted(unordered_dic.items(), key = lambda t: -len(t[0]))) """ Mapping of recipes to their type, default should be DNA """ recipe_type_mapping_UNORDERED = { - "MouseWholeGenome": { TYPE: "WGS" }, - "PigWholeGenome": { TYPE: "WGS" }, - "PombeWholeGenome": { TYPE: "WGS" }, - "ShallowWGS": { TYPE: "WGS" }, - "10X_Genomics_WGS": { TYPE: "WGS" }, - "WholeGenomeSequencing": { TYPE: "WGS" }, - "HumanWholeGenome": { TYPE: "WGS" }, + # "MouseWholeGenome": { TYPE: "WGS" }, + # "PigWholeGenome": { TYPE: "WGS" }, + # "PombeWholeGenome": { TYPE: "WGS" }, + "WGS_Shallow": { TYPE: "WGS" }, + # "10X_Genomics_WGS": { TYPE: "WGS" }, + "WGS_Metagenomic": { TYPE: "WGS" }, + "WGS_Deep": { TYPE: "WGS" }, ".*RNA.*": { TYPE: "RNA" }, - ".*96Well_SmartSeq2": { TYPE: "RNA" }, + # ".*96Well_SmartSeq2": { TYPE: "RNA" }, ".*SMARTer.*": { TYPE: "RNA" }, - "FusionDiscoverySeq": { TYPE: "RNA" }, + # "FusionDiscoverySeq": { TYPE: "RNA" }, ".*Ribo.*": { TYPE: "RNA" }, - "SMART-Seq": { TYPE: "RNA" }, + # "SMART-Seq": { TYPE: "RNA" }, "SMARTSeq": { TYPE: "RNA" }, - ".*CDH1_RNA.*": { TYPE: "CAPTURE" }, + # ".*CDH1_RNA.*": { TYPE: "CAPTURE" }, # FOR NEW ENTRIES # "{regex}": { TYPE: type } ".*": { TYPE: "DNA" } # DEFAULT @@ -97,16 +97,16 @@ def get_ordered_dic(unordered_dic): "RDM": "hg19", "myTYPE_V1": "hg19", "PanCancerV2": "hg19", - "MissionBio-Heme": "GRCh38", + "User_MissionBio": "GRCh38", "WholeExome_v4": "hg19", "AmpliSeq": "hg19", "HemeBrainPACT_v1": "hg19" }, "Mouse": { "M-IMPACT_v1": "mm10", - "M-IMPACT_v2": "mm10", + "HC_IMPACT-Mouse": "mm10", "Twist_mWES": "mm10", - "10X_Genomics_Multiome": "mm10" + "SC_Chromium-Multiome": "mm10" } } """ Mapping of species to their genome-type """ @@ -432,7 +432,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "yes", MD: "yes" }, - "IMPACT505": { + "HC_IMPACT": { + # IMPACT505 # NOTE: interval list file name "IMPACT468_BAITS" is stored in LIMS and passed to pipelines, change file name with caution BAITS: "/igo/home/igo/resources/ilist/hg38/IMPACT505/IMPACT505_BAITS.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/IMPACT505/IMPACT505_TARGETS.targets", @@ -445,13 +446,15 @@ def get_ordered_dic(unordered_dic): MSKQ: "yes", MD: "yes" }, - "IMPACT-Heme": { + "HC_IMPACT-Heme": { + # IMPACT-Heme BAITS: "/igo/home/igo/resources/ilist/hg38/IMPACT-Heme_v2/IMPACT-Heme_v2_BAITS.iList", TARGETS: "/igo/home/igo/resources/ilist/hg38/IMPACT-Heme_v2/IMPACT-Heme_v2_TARGETS.iList", MSKQ: "yes", MD: "yes" }, - "IMPACT_Heme_v2": { + "HC_IMPACT-Heme": { + # IMPACT_Heme_v2 BAITS: "/igo/home/igo/resources/ilist/hg38/IMPACT-Heme_v2/IMPACT-Heme_v2_BAITS.iList", TARGETS: "/igo/home/igo/resources/ilist/hg38/IMPACT-Heme_v2/IMPACT-Heme_v2_TARGETS.iList", MSKQ: "yes", @@ -463,7 +466,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "yes", MD: "yes" }, - "M-IMPACT_v2": { + "HC_IMPACT-Mouse": { + # M-IMPACT_v2 BAITS: "/home/igo/resources/BED-Targets/IMPACT/MM_IMPACT/M-IMPACT_v2.baits", TARGETS: "/home/igo/resources/BED-Targets/IMPACT/MM_IMPACT/M-IMPACT_v2.targets", MSKQ: "yes", @@ -501,10 +505,11 @@ def get_ordered_dic(unordered_dic): MD: "yes" }, "IDT_Exome_v2_FP_Viral_Probes": { - BAITS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_BAITS.baits", - TARGETS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_TARGETS.targets", - MSKQ: "no", - MD: "yes" + # IDT_Exome_v2_FP_Viral_Probes or WES_Human + BAITS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_BAITS.baits", + TARGETS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_TARGETS.targets", + MSKQ: "no", + MD: "yes" }, "IDT_Exome_v1": { BAITS: "/home/igo/resources/BED-Targets/xgen-exome-research-panel-BAITS.iList", @@ -548,14 +553,16 @@ def get_ordered_dic(unordered_dic): MSKQ: "yes", MD: "yes" }, - "MSK-ACCESS_v1": { + "HC_ACCESS": { + # MSK-ACCESS_v1 BAITS: "/igo/home/igo/resources/ilist/hg38/MSK-ACCESS-v1/MSK-ACCESS-v1_0-probesAllwFP.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/MSK-ACCESS-v1/MSK-ACCESS-v1_0-probesAllwFP.targets", MSKQ: "no", MD: "yes", HAPLOTYPE_MAP: "/home/igo/fingerprint_maps/map_files/hg38_no_chr_ACCESS_unordered.map" }, - "MSK-ACCESS_v2": { + "HC_ACCESS": { + # MSK-ACCESS_v2 BAITS: "/igo/home/igo/resources/ilist/hg38/MSK-ACCESS-v2/MSK-ACCESS-v2_0-probesAllwFP.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/MSK-ACCESS-v2/MSK-ACCESS-v2_0-probesAllwFP.targets", MSKQ: "no", @@ -574,7 +581,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "MissionBio-Heme": { + "User_MissionBio": { + # MissionBio-Heme BAITS: "/igo/work/nabors/bed_files/Mission_Bio/hg38/MissionBio-Heme_BAITS.iList", TARGETS: "/igo/work/nabors/bed_files/Mission_Bio/hg38/MissionBio-Heme_TARGETS.iList", MSKQ: "no", @@ -648,25 +656,29 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "MethylCaptureSeq": { + "Methyl_Capture": { + # MethylCaptureSeq BAITS: "/igo/home/igo/resources/ilist/hg38/MethylCaptureSeq/truseq-methyl-capture-epic-manifest-file-hg38.baits.ilist", TARGETS: "/igo/home/igo/resources/ilist/hg38/MethylCaptureSeq/truseq-methyl-capture-epic-manifest-file-hg38.targets.ilist", MSKQ: "no", MD: "yes" }, - "MSK-CH": { + "HC_CMOCH": { + # MSK-CH BAITS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.targets", MSKQ: "no", MD: "yes" }, - "CMO-CH": { + "HC_CMOCH": { + # CMO-CH BAITS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/CMO-CH/CMO-CH.hg38.targets", MSKQ: "no", MD: "yes" }, - "HumanWholeGenome": { + "WGS_Deep": { + # HumanWholeGenome MSKQ: "no", MD: "yes", HAPLOTYPE_MAP: "", # TODO - Add this @@ -674,7 +686,8 @@ def get_ordered_dic(unordered_dic): REFERENCE: "/igo/work/genomes/H.sapiens/GRCh38.p13/ncbi-genomes-2021-09-23/GCF_000001405.39_GRCh38.p13_genomic.fna", DGN_REFERENCE: "/staging/ref/hg38_alt_masked_graph_v2+cnv+graph+rna-8-1644018559" }, - "MouseWholeGenome": { + "WGS_Deep": { + # MouseWholeGenome MSKQ: "no", MD: "yes" # TODO @@ -692,7 +705,7 @@ def get_ordered_dic(unordered_dic): # TODO # sh $DIR/../PicardScripts/LaunchPipelines.sh $RUNTYPE --input /igo/work/FASTQ/$RUNNAME/$PROJECT/ --genome $GENOME --type WGS --md $MARKDUPLICATES --mskq $MSKQ }, - "ShallowWGS": { + "WGS_Shallow": { MSKQ: "no", MD: "yes" # TODO @@ -710,13 +723,15 @@ def get_ordered_dic(unordered_dic): # TODO # sh $DIR/../PicardScripts/LaunchPipelines.sh $RUNTYPE --input /igo/work/FASTQ/$RUNNAME/$PROJECT/ --genome $GENOME --md $MARKDUPLICATES --mskq $MSKQ }, - "AmpliconSeq": { + "DNA_Amplicon": { + # AmpliconSeq MSKQ: "no", MD: "yes" # TODO # sh $DIR/../PicardScripts/LaunchPipelines.sh $RUNTYPE --input /igo/work/FASTQ/$RUNNAME/$PROJECT/ --genome $GENOME --md $MARKDUPLICATES --mskq $MSKQ }, - "CRISPRSeq": { + "DNA_CRISPR": { + # CRISPRSeq MSKQ: "no", MD: "yes" # TODO From fcfe5d11fa60a488dfcb6b410b49199e8141826c Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Wed, 22 May 2024 15:01:23 -0400 Subject: [PATCH 77/87] Update LaunchMetrics.py Added Visium to the list that just processes demuxed reads only --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index 94ca747..a200b4b 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -18,7 +18,7 @@ # These recipes will be evaluated using DRAGEN because of their larger size of fastqs RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "WGS_Deep", "ChIP", "CUT&RUN","Amplicon"] # these projects willl only need demux stats -DEMUX_ONLY = ["SMARTSeq", "Chromium", "10X_Genomics"] +DEMUX_ONLY = ["SMARTSeq", "Chromium", "10X_Genomics", "Visium"] # Organisms to have DRAGEN BAMS DRAGEN_RNA_GENOMES = ["GRCh38", "grcm39"] From 37a96244d102f69d8e4cca3ab44ecd6357115bae Mon Sep 17 00:00:00 2001 From: luc Date: Wed, 29 May 2024 08:43:34 -0400 Subject: [PATCH 78/87] Update deliver_pipeline.py --- scripts/deliver_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/deliver_pipeline.py b/scripts/deliver_pipeline.py index 895dc63..4e0213f 100644 --- a/scripts/deliver_pipeline.py +++ b/scripts/deliver_pipeline.py @@ -35,7 +35,7 @@ def deliver_pipeline_output(project, pi, requestName): delivery_folder = LAB_SHARE_DIR + "/" + pi + "/Project_" + project + "/pipeline" if requestName == "RNALibraryPrep": - print("Delivering all RNASeq .bams for {} {} {}".format(project, pi, recipe)) + print("Delivering all RNASeq .bams for {} {} {}".format(project, pi, requestName)) bamdict = find_bams(project, STATS_DIR) bsub_commands = write_bams_to_share(bamdict, delivery_folder) reconcile_bam_fastq_list(project, bamdict) From a964db0c5bd39ce881fc75393b5b9942a439535c Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sat, 1 Jun 2024 10:36:53 -0400 Subject: [PATCH 79/87] Update run_param_config.py needed to update the recipe for Whole Exome Sequencing for Human and Mouse --- scripts/run_param_config.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index a35ab98..5bf9564 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -473,7 +473,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "yes", MD: "yes" }, - "WholeExomeSequencing": { + "WES_Human": { + # WholeExomeSequencing BAITS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_BAITS.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_TARGETS.targets", MSKQ: "no", @@ -504,7 +505,7 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "IDT_Exome_v2_FP_Viral_Probes": { + "WES_Human": { # IDT_Exome_v2_FP_Viral_Probes or WES_Human BAITS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_BAITS.baits", TARGETS: "/igo/home/igo/resources/ilist/hg38/IDT_Exome_v2_FP/IDT_Exome_v2_FP_TARGETS.targets", @@ -612,7 +613,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "Twist_mWES": { + "WES_Mouse": { + # Twist_mWES BAITS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_BAITS.IntervalList", TARGETS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_TARGETS.IntervalList" }, From ef568930f5ce256ce45b36dc4de4ae1f5453aebf Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Sun, 2 Jun 2024 12:49:30 -0400 Subject: [PATCH 80/87] Update run_param_config.py changing WES_Mouse to point to the Agilent mouse bait set --- scripts/run_param_config.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 5bf9564..5cdc458 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -105,6 +105,7 @@ def get_ordered_dic(unordered_dic): "Mouse": { "M-IMPACT_v1": "mm10", "HC_IMPACT-Mouse": "mm10", + "WES_Mouse": "mm10", "Twist_mWES": "mm10", "SC_Chromium-Multiome": "mm10" } @@ -493,7 +494,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "Agilent_MouseAllExonV1": { + "WES_Mouse": { + # Agilent_MouseAllExonV1 BAITS: "/home/igo/resources/BED-Targets/Agilent_MouseAllExonV1_mm10_v1_baits.ilist", TARGETS: "/home/igo/resources/BED-Targets/Agilent_MouseAllExonV1_mm10_v1_targets.ilist", MSKQ: "no", @@ -613,10 +615,12 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "WES_Mouse": { - # Twist_mWES + "Twist_mWES": { + # WES_Mouse ?? BAITS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_BAITS.IntervalList", - TARGETS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_TARGETS.IntervalList" + TARGETS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_TARGETS.IntervalList", + MSKQ: "no", + MD: "yes" }, "Twist_Kentsis_spikeinWES_RK_V3": { BAITS: "/home/igo/resources/ilist/Twist_Kentsis_spikeinWES_RK_V3/Twist_Kentsis_spikeinWES_RK_V3_BAITS.intervalList", From 6a5be030baefa83964c1f1cb8a12af579c1164e8 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 3 Jun 2024 13:19:30 -0400 Subject: [PATCH 81/87] Update LaunchMetrics.py changing DLP recipe name to new name to skip and DLP samples --- scripts/LaunchMetrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/LaunchMetrics.py b/scripts/LaunchMetrics.py index a200b4b..6f3c9f7 100644 --- a/scripts/LaunchMetrics.py +++ b/scripts/LaunchMetrics.py @@ -14,7 +14,7 @@ # Global Variable : we do not want to process these experiments in this script -DO_NOT_PROCESS = ["DLP"] +DO_NOT_PROCESS = ["SC_DLP"] # These recipes will be evaluated using DRAGEN because of their larger size of fastqs RUN_ON_DRAGEN = ["MissionBio", "SingleCellCNV", "WGS_Deep", "ChIP", "CUT&RUN","Amplicon"] # these projects willl only need demux stats From 8407306612f2473bb12758e3034298391ded11d9 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 3 Jun 2024 13:52:23 -0400 Subject: [PATCH 82/87] Update demux_run_dag.py updated recipes from DLP to SC_DLP --- demux_run_dag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demux_run_dag.py b/demux_run_dag.py index aa026bb..c69ccd4 100644 --- a/demux_run_dag.py +++ b/demux_run_dag.py @@ -107,7 +107,7 @@ def demux(ds, **kwargs): def get_dlp_chip(samplesheet, project): samplesheet.df_ss_data.reset_index() for index, row in samplesheet.df_ss_data.iterrows(): - if row['Sample_Well'] == 'DLP' and project == row['Sample_Project']: + if row['Sample_Well'] == 'SC_DLP' and project == row['Sample_Project']: # return chip from 071PP_DLP_UNSORTED_128624A_13_12_IGO_09443_CU_1_1_121 sample = row['Sample_ID'] return get_dlp_chip_from_sample_name(sample) @@ -135,7 +135,7 @@ def stats(ds, **kwargs): if "REFERENCE" in samplesheet_path: return "No stats for reference " + samplesheet_path - if "DLP" in sample_sheet.recipe_set: + if "SC_DLP" in sample_sheet.recipe_set: scripts.get_total_reads_from_demux.run_DLP(sample_sheet, sequencer_and_run) scripts.upload_stats.upload_stats(sequencer_and_run) From 8dcb8c2b84d58369b7236175e420b2bd51a53de1 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 3 Jun 2024 14:14:40 -0400 Subject: [PATCH 83/87] Update run_param_config.py let recipes with WES_Mouse use the Twist_mWES bait set --- scripts/run_param_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 5cdc458..61f5609 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -494,7 +494,7 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "WES_Mouse": { + "Agilent_MouseAllExonV1": { # Agilent_MouseAllExonV1 BAITS: "/home/igo/resources/BED-Targets/Agilent_MouseAllExonV1_mm10_v1_baits.ilist", TARGETS: "/home/igo/resources/BED-Targets/Agilent_MouseAllExonV1_mm10_v1_targets.ilist", @@ -615,8 +615,8 @@ def get_ordered_dic(unordered_dic): MSKQ: "no", MD: "yes" }, - "Twist_mWES": { - # WES_Mouse ?? + "WES_Mouse": { + # Twist_mWES ?? BAITS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_BAITS.IntervalList", TARGETS: "/home/igo/resources/ilist/Twist_mWES/Twist_mWES_TARGETS.IntervalList", MSKQ: "no", From 6b08ed7c4ee28af3dd1de5b3c37f83578a3c7b65 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 10 Jun 2024 09:44:44 -0400 Subject: [PATCH 84/87] Update SampleSheet.py updating this script to recognize new SC_DLP name so the sample sheet can split correctly --- SampleSheet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SampleSheet.py b/SampleSheet.py index 7907ef8..266d9f3 100644 --- a/SampleSheet.py +++ b/SampleSheet.py @@ -114,9 +114,9 @@ def split_sample_sheet(self): if "SC_DLP" in self.recipe_set and len(self.recipe_set) > 1: print("Copying all DLP samples to a new sample sheet") # copy all DLP rows to a new sample sheet - dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == True].copy() + dlp_data = self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == True].copy() # and remove DLP samples from the main sample sheet - self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("DLP") == False].copy() + self.df_ss_data= self.df_ss_data[self.df_ss_data["Sample_Well"].str.match("SC_DLP") == False].copy() # rename DLP sample sheet w/"_DLP.csv" dlp_path = os.path.splitext(self.path)[0]+'_DLP.csv' header_copy = self.df_ss_header.copy(deep=True) From 05c46ae5a9a02e827003c23ad62d2b1f7364bdc5 Mon Sep 17 00:00:00 2001 From: luc Date: Tue, 11 Jun 2024 11:56:24 -0400 Subject: [PATCH 85/87] add HD image info to visium pipeline --- scripts/cellranger.py | 37 ++++++++++++++++++++--------------- scripts/cellranger_spatial.py | 18 +++++++++++++++-- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/scripts/cellranger.py b/scripts/cellranger.py index 0b7e266..18816c2 100644 --- a/scripts/cellranger.py +++ b/scripts/cellranger.py @@ -6,6 +6,7 @@ import json import subprocess import os.path +import shutil import scripts.get_sequencing_read_data import scripts.cellranger_spatial import scripts.cellranger_config as CONFIG @@ -144,18 +145,11 @@ def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_ sample_fastqfile_dict = find_fastq_file(sample_id_list) send_json = {} send_json["samples"] = [] - # CREATE RUN FOLDER AND PROJECT FOLDER IF NOT ALREADY THERE - os.chdir(CONFIG.STATS_AREA) - runs = next(os.walk("."))[1] - if sequencer_and_run not in runs: - os.mkdir(sequencer_and_run, CONFIG.ACCESS) - - stats_and_run = CONFIG.STATS_AREA + sequencer_and_run - os.chdir(stats_and_run) - projects = next(os.walk("."))[1] - if project not in projects: - os.mkdir(project, CONFIG.ACCESS) - work_area = stats_and_run + "/" + project + "/" + # CREATE RUN FOLDER AND PROJECT FOLDER IF NOT ALREADY THERE + work_area = CONFIG.STATS_AREA + sequencer_and_run + "/" + project + "/" + if not os.path.exists(work_area): + os.makedirs(work_area, CONFIG.ACCESS) + # GO TO project ID LOCATION to start cellranger command os.chdir(work_area) @@ -190,15 +184,26 @@ def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_ cmd = "{}--id=Sample_{}{}".format(tool, sample, transcriptome) + "--fastqs=" + ",".join(sample_fastqfile_dict[sample]) + " --cytaimage={} --slide={} --area={}".format(sample_info.tiff_image, sample_info.chip_id, sample_info.chip_position) if sample_genome_dict[sample] == "Human": probe = CONFIG.config_dict[tag]["probe"]["Human_CytAssist"] - cmd = cmd + " --probe-set={}".format(probe) elif sample_genome_dict[sample] == "Mouse": - probe = CONFIG.config_dict[tag]["probe"][sample_genome_dict[sample]] - cmd = cmd + " --probe-set={}".format(probe) + if sample_info.slide.startswith("H1"): + probe = CONFIG.config_dict[tag]["probe"]["Mouse_HD"] + else: + probe = CONFIG.config_dict[tag]["probe"]["Mouse"] + cmd = cmd + " --probe-set={}".format(probe) elif sample_info.preservation == "FFPE": probe = CONFIG.config_dict[tag]["probe"][sample_genome_dict[sample]] cmd = cmd + " --probe-set={}".format(probe) + # Eventhough HE image is required internal, the pipeline doesn't need it. Add it if exists + if sample_info.HE_tiff_image != "EMPTY": + cmd = cmd + " --image={}".format(sample_info.HE_tiff_image) + # copy microsope image here in sub folder for delivery + HE_folder_loc = work_area + "Microscope/" + if not os.path.exists(HE_folder_loc): + os.makedirs(HE_folder_loc) + shutil.copy(sample_info.HE_tiff_image , HE_folder_loc) + # if there is manual alignment json file availabe, add that to the cmd if sample_info.json != "EMPTY": cmd = cmd + " --loupe-alignment={}".format(sample_info.json) @@ -206,7 +211,7 @@ def lanuch_by_project(sequencer_and_run, project, sample_id_list, sample_genome_ bsub_cmd = "bsub -J {}_{}_{}_SPATIAL -o {}_SPATIAL.out{}{}".format(sequencer_and_run, project, sample, sample, cmd, CONFIG.OPTIONS) print(bsub_cmd) subprocess.run(bsub_cmd, shell=True) - + elif tag != "Skip": cmd = generate_cellranger_cmd(sample, tag, sample_genome_dict[sample], sample_fastqfile_dict[sample], sequencer_and_run) print(cmd) diff --git a/scripts/cellranger_spatial.py b/scripts/cellranger_spatial.py index f28e9de..01d905e 100644 --- a/scripts/cellranger_spatial.py +++ b/scripts/cellranger_spatial.py @@ -1,4 +1,3 @@ -import pandas as pd import os import json import os.path @@ -16,6 +15,7 @@ def __init__(self, sample, project_id): self.preservation = "EMPTY" self.tiff_image = "EMPTY" self.json = "EMPTY" + self.HE_tiff_image = "EMPTY" self.get_info_from_LIMS() self.copy_tiff(project_id) self.copy_json(project_id) @@ -33,9 +33,14 @@ def copy_tiff(self, project_id): source_loc_dir = CONFIG.original_tiff_images_directory + project_id destination_loc = CONFIG.tiff_images_directory + project_id destination_file = destination_loc + "/" + self.sample_name + ".tif" + destination_HE_loc = destination_loc + "/Microscope" + destination_HE_file = destination_HE_loc + "/HE_" + self.sample_name + ".tif" # create TIFF_images director if not exists if not os.path.exists(destination_loc): os.makedirs(destination_loc) + # create microscope image director if not exists + if not os.path.exists(destination_HE_loc): + os.makedirs(destination_HE_loc) # copy image file per sample original_tiff_image = source_loc_dir + "/" + self.sample_name + ".tif" @@ -45,7 +50,16 @@ def copy_tiff(self, project_id): print("copy {} to {}".format(original_tiff_image, destination_file)) else: print("tif file is not in proper format for sample {}, please check".format(self.IGO_ID)) - + + # copy HE file per sample if exists + original_HE_tiff_image = source_loc_dir + "/Microscope/HE_" + self.sample_name + ".tif" + if os.path.isfile(original_HE_tiff_image): + shutil.copy(original_HE_tiff_image, destination_HE_file) + self.HE_tiff_image = destination_HE_file + print("copy {} to {}".format(original_HE_tiff_image, destination_HE_file)) + else: + print("HE tif file does not exist for sample {}, please check".format(self.IGO_ID)) + # copy json file if exists def copy_json(self, project_id): # project_id format as Project_12345 From b4519accc8986e4d8d174535c6596c0fb6365d6f Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Mon, 17 Jun 2024 09:30:48 -0400 Subject: [PATCH 86/87] Update run_param_config.py to run WGS metrics on User_WGS recipe --- scripts/run_param_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/run_param_config.py b/scripts/run_param_config.py index 61f5609..5085ff6 100644 --- a/scripts/run_param_config.py +++ b/scripts/run_param_config.py @@ -70,6 +70,7 @@ def get_ordered_dic(unordered_dic): # "10X_Genomics_WGS": { TYPE: "WGS" }, "WGS_Metagenomic": { TYPE: "WGS" }, "WGS_Deep": { TYPE: "WGS" }, + "User_WGS": { TYPE: "WGS" }, ".*RNA.*": { TYPE: "RNA" }, # ".*96Well_SmartSeq2": { TYPE: "RNA" }, ".*SMARTer.*": { TYPE: "RNA" }, From e512fe7c76f7c7bbe824bb9731b354ad978f87e2 Mon Sep 17 00:00:00 2001 From: darrelln32 Date: Thu, 20 Jun 2024 10:27:37 -0400 Subject: [PATCH 87/87] Update cellranger_config.py changing names for ARC/Multiome to match recipe names on sample sheet --- scripts/cellranger_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cellranger_config.py b/scripts/cellranger_config.py index e4e6105..c41c53e 100644 --- a/scripts/cellranger_config.py +++ b/scripts/cellranger_config.py @@ -50,7 +50,7 @@ # 10X recipe list for different pipelines COUNT_FLAVORS = ["SC_Chromium-GEX-3", "SC_Chromium-GEX-5"] VDJ_FLAVORS = ["SC_Chromium-TCR", "SC_Chromium-BCR"] -ARC_FLAVORS = ["SC_Chromium-Multiome", "SC_Chromium-Multiome_ATAC", "SC_Chromium-Multiome_GEX"] +ARC_FLAVORS = ["SC_Chromium-Multiome", "SC_Chromium-Multiome-ATAC", "SC_Chromium-Multiome-GEX"] SPATIAL_FLAVORS = ["ST_Visium"] # we do not want to PROCESS SAIL (15500) or SCRI (12437) projects