Skip to content

Commit

Permalink
Revert "Fix compute mean coverage by running stat over single intervals"
Browse files Browse the repository at this point in the history
This reverts commit 901c7ea.
  • Loading branch information
Chiara Rasi committed May 24, 2024
1 parent 901c7ea commit fedae81
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 43 deletions.
1 change: 0 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
### Fixed
- Position of `Show genes` checkbox on report page
- Updating gene panel name using the web form on report page
- Workaround for https://github.com/38/d4-format/issues/78 -> run d4tool stat for each single region

## [1.6]
### Added
Expand Down
87 changes: 45 additions & 42 deletions src/chanjo2/meta/handle_d4.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def get_d4tools_chromosome_mean_coverage(
return chromosomes_coverage


def get_d4tools_intervals_mean_coverage(d4_file_path: str, intervals: List[str]) -> List[float]:
def get_d4tools_intervals_mean_coverage(
d4_file_path: str, intervals: List[str]
) -> List[float]:
"""Return the mean value over a list of intervals of a d4 file."""

tmp_bed_file = tempfile.NamedTemporaryFile()
Expand All @@ -55,35 +57,19 @@ def get_d4tools_intervals_mean_coverage(d4_file_path: str, intervals: List[str])
)


def get_d4tools_intervals_coverage(d4_file_path: str, bed_file_path: str) -> List[float]:
def get_d4tools_intervals_coverage(
d4_file_path: str, bed_file_path: str
) -> List[float]:
"""Return the coverage for intervals of a d4 file that are found in a bed file."""

d4tools_stats_mean_cmd: str = subprocess.check_output(
["d4tools", "stat", "--region", bed_file_path, d4_file_path, "--stat", "mean"],
text=True,
)
return [float(line.rstrip().split("\t")[3]) for line in d4tools_stats_mean_cmd.splitlines()]


def get_d4tools_bed_lines_coverage(d4_file_path: str, intervals: List[str]) -> List[float]:
"""Return the coverage for one line of a bed file.
This is a workaround to fix the following issue -> https://github.com/38/d4-format/issues/78
"""
intervals_coverage: List[float] = []
for interval in intervals:
cmd = f'd4tools stat -s mean {d4_file_path} -r <(echo -e "{interval}")'
p_out, p_err = subprocess.Popen(
cmd,
shell=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
executable="/bin/bash",
).communicate()
if p_err:
continue
intervals_coverage.append(float(p_out.split("\t")[3]))
return intervals_coverage
return [
float(line.rstrip().split("\t")[3])
for line in d4tools_stats_mean_cmd.splitlines()
]


def get_report_sample_interval_coverage(
Expand All @@ -102,8 +88,7 @@ def get_report_sample_interval_coverage(
return

# Compute intervals coverage
# This is a workaround to fix the following issue -> https: // github.com / 38 / d4 - format / issues / 78
intervals_coverage: List[float] = get_d4tools_bed_lines_coverage(
intervals_coverage: List[float] = get_d4tools_intervals_mean_coverage(
d4_file_path=d4_file_path, intervals=intervals_coords
)
completeness_row_dict: dict = {"mean_coverage": mean(intervals_coverage)}
Expand All @@ -113,10 +98,12 @@ def get_report_sample_interval_coverage(
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
for interval in sql_intervals
]
intervals_coverage_completeness: Dict[str, dict] = coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
intervals_coverage_completeness: Dict[str, dict] = (
coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)
)

interval_ids = set()
Expand All @@ -142,8 +129,12 @@ def get_report_sample_interval_coverage(
if interval.ensembl_id.startswith("ENSG")
else interval.ensembl_gene_id
)
interval_hgnc_id: int = gene_ids_mapping[interval_ensembl_gene]["hgnc_id"]
interval_hgnc_symbol: str = gene_ids_mapping[interval_ensembl_gene]["hgnc_symbol"]
interval_hgnc_id: int = gene_ids_mapping[interval_ensembl_gene][
"hgnc_id"
]
interval_hgnc_symbol: str = gene_ids_mapping[interval_ensembl_gene][
"hgnc_symbol"
]
genes_covered_under_custom_threshold.add(interval_hgnc_symbol)
incomplete_coverages_rows.append(
(
Expand All @@ -165,7 +156,9 @@ def get_report_sample_interval_coverage(
report_data["completeness_rows"].append((sample_name, completeness_row_dict))
report_data["incomplete_coverage_rows"] += incomplete_coverages_rows
fully_covered_intervals_percent = round(
100 * (len(interval_ids) - nr_intervals_covered_under_custom_threshold) / len(interval_ids),
100
* (len(interval_ids) - nr_intervals_covered_under_custom_threshold)
/ len(interval_ids),
2,
)
report_data["default_level_completeness_rows"].append(
Expand Down Expand Up @@ -199,10 +192,12 @@ def get_sample_interval_coverage(
(interval.ensembl_id, (interval.chromosome, interval.start, interval.stop))
for interval in sql_intervals
]
intervals_coverage_completeness: Dict[str, dict] = coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
intervals_coverage_completeness: Dict[str, dict] = (
coverage_completeness_multitasker(
d4_file_path=d4_file_path,
thresholds=completeness_thresholds,
interval_ids_coords=interval_ids_coords,
)
)

# Create GeneCoverage objects
Expand All @@ -227,7 +222,9 @@ def get_sample_interval_coverage(
intervals=[f"{gene.chromosome}\t{gene.start}\t{gene.stop}"],
)
)
gene_coverage.completeness = intervals_coverage_completeness.get(gene.ensembl_id, {})
gene_coverage.completeness = intervals_coverage_completeness.get(
gene.ensembl_id, {}
)

else: # Retrieve transcripts or exons for this gene

Expand Down Expand Up @@ -274,18 +271,24 @@ def get_sample_interval_coverage(
],
)
),
"completeness": intervals_coverage_completeness[interval.ensembl_id],
"completeness": intervals_coverage_completeness[
interval.ensembl_id
],
}
)

gene_coverage.inner_intervals.append(interval_coverage)
inner_intervals_ensembl_ids.add(interval.ensembl_id)

gene_intervals_mean_coverage: List[float] = get_d4tools_intervals_mean_coverage(
d4_file_path=d4_file_path, intervals=intervals_bed_coords
gene_intervals_mean_coverage: List[float] = (
get_d4tools_intervals_mean_coverage(
d4_file_path=d4_file_path, intervals=intervals_bed_coords
)
)
gene_coverage.mean_coverage = (
mean(gene_intervals_mean_coverage) if gene_intervals_mean_coverage else 0
mean(gene_intervals_mean_coverage)
if gene_intervals_mean_coverage
else 0
)

for threshold in completeness_thresholds:
Expand Down

0 comments on commit fedae81

Please sign in to comment.