Skip to content

Commit

Permalink
HVG genes setting through files
Browse files Browse the repository at this point in the history
  • Loading branch information
pcm32 committed Feb 16, 2024
1 parent 52003ca commit 4edae2a
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 2 deletions.
28 changes: 28 additions & 0 deletions scanpy-scripts-tests.bats
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ setup() {
norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout"
norm_obj="${output_dir}/norm.h5ad"
hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj stdout"
always_hvg="${data_dir}/always_hvg.txt"
never_hvg="${data_dir}/never_hvg.txt"
hvg_opt_always_never="--always-hv-genes-file ${always_hvg} --never-hv-genes-file ${never_hvg}"
hvg_obj="${output_dir}/hvg.h5ad"
hvg_obj_on_off="${output_dir}/hvg_on_off.h5ad"
regress_opt="-k n_counts --show-obj stdout"
regress_obj="${output_dir}/regress.h5ad"
scale_opt="--save-layer normalised -m 10 --show-obj stdout"
Expand Down Expand Up @@ -131,6 +135,22 @@ setup() {
[ -f "$raw_matrix_from_raw" ]
}

@test "Add genes to be considered HVGs" {
if [ "$resume" = 'true' ] && [ -f "$always_hvg" ]; then
skip "$always_hvg exists"
fi

run eval "echo -e 'MIR1302-10\nFAM138A' > $always_hvg"
}

@test "Add genes not to be considered HVGs" {
if [ "$resume" = 'true' ] && [ -f "$never_hvg" ]; then
skip "$never_hvg exists"
fi

run eval "echo -e 'ISG15\nTNFRSF4' > $never_hvg"
}

@test "Test MTX write from layers" {
if [ "$resume" = 'true' ] && [ -f "$raw_matrix_from_layer" ]; then
skip "$raw_matrix exists"
Expand Down Expand Up @@ -219,6 +239,14 @@ setup() {
[ -f "$hvg_obj" ]
}

@test "Find variable genes with optional turn on/off lists" {
if [ "$resume" = 'true' ] && [ -f "$hvg_obj_on_off" ]; then
skip "$hvg_obj_on_off exists and resume is set to 'true'"
fi

run rm -f $hvg_obj_on_off && eval "$scanpy hvg $hvg_opt_always_never $norm_obj $hvg_obj_on_off"
}

# Do separate doublet simulation step (normally we'd just let the main scrublet
# process do this).

Expand Down
19 changes: 17 additions & 2 deletions scanpy_scripts/cmd_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
"""

import click

from .click_utils import (
CommaSeparatedText,
Dictionary,
valid_limit,
valid_parameter_limits,
mutually_exclusive_with,
required_by,
valid_limit,
valid_parameter_limits,
)

COMMON_OPTIONS = {
Expand Down Expand Up @@ -856,6 +857,20 @@
"'seurat_v3', ties are broken by the median (across batches) rank based on "
"within-batch normalized variance.",
),
click.option(
"--always-hv-genes-file",
"always_hv_genes_file",
type=click.Path(exists=True),
default=None,
help="If specified, the gene identifers in this file will be set as highly variable in the var dataframe after HVGs are computed.",
),
click.option(
"--never-hv-genes-file",
"never_hv_genes_file",
type=click.Path(exists=True),
default=None,
help="If specified, the gene identifers in this file will be removed from highly variable in the var dataframe (set to false) after HVGs are computed.",
),
],
"scale": [
*COMMON_OPTIONS["input"],
Expand Down
45 changes: 45 additions & 0 deletions scanpy_scripts/lib/_hvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@ def hvg(
if "n_top_genes" in kwargs and kwargs["n_top_genes"] is not None:
kwargs["n_top_genes"] = min(adata.n_vars, kwargs["n_top_genes"])

always_hv_genes = None
if "always_hv_genes_file" in kwargs and kwargs["always_hv_genes_file"] is not None:
with open(kwargs["always_hv_genes_file"], "r") as f:
always_hv_genes = f.read().splitlines()
# to avoid upsetting the scanpy function with unexpected keyword arguments
del kwargs["always_hv_genes_file"]

never_hv_genes = None
if "never_hv_genes_file" in kwargs and kwargs["never_hv_genes_file"] is not None:
with open(kwargs["never_hv_genes_file"], "r") as f:
never_hv_genes = f.read().splitlines()
# to avoid upsetting the scanpy function with unexpected keyword arguments
del kwargs["never_hv_genes_file"]

sc.pp.highly_variable_genes(
adata,
min_mean=mean_limits[0],
Expand All @@ -30,4 +44,35 @@ def hvg(
**kwargs,
)

return switch_hvgs(adata, always_hv_genes, never_hv_genes)


def switch_hvgs(adata, always_hv_genes=None, never_hv_genes=None):
"""
Function to switch on/off highly variable genes based on a list of genes.
>>> adata = sc.datasets.pbmc3k()
>>> sc.pp.normalize_total(adata)
>>> sc.pp.log1p(adata)
>>> sc.pp.highly_variable_genes(adata)
>>> adata = switch_hvgs(adata, always_hv_genes=['MIR1302-10', 'FAM138A'], never_hv_genes=['ISG15', 'TNFRSF4'])
>>> adata.var.loc['ISG15'].highly_variable
False
>>> adata.var.loc['TNFRSF4'].highly_variable
False
>>> adata.var.loc['MIR1302-10'].highly_variable
True
>>> adata.var.loc['CPSF3L'].highly_variable
True
"""
if always_hv_genes is not None:
adata.var.highly_variable = np.logical_or(
adata.var.highly_variable, adata.var_names.isin(always_hv_genes)
)

if never_hv_genes is not None:
adata.var.highly_variable = np.logical_and(
adata.var.highly_variable, ~adata.var_names.isin(never_hv_genes)
)

return adata

0 comments on commit 4edae2a

Please sign in to comment.