Commit

Merge branch 'release/0.1.0'

pcm32 committed Jun 27, 2019
2 parents d41acc7 + 7396ecf commit 9bf4ae4
Showing 32 changed files with 2,847 additions and 1 deletion.
15 changes: 15 additions & 0 deletions .gitmodules
@@ -0,0 +1,15 @@
[submodule "util/galaxy-workflow-executor"]
path = util/galaxy-workflow-executor
url = https://github.com/ebi-gene-expression-group/galaxy-workflow-executor
[submodule "util/scxa-smartseq-quantification-workflow"]
path = w_smart-seq_quantification
url = https://github.com/ebi-gene-expression-group/scxa-smartseq-quantification-workflow
[submodule "util/scxa-droplet-quantification-workflow"]
path = w_droplet_quantification
url = https://github.com/ebi-gene-expression-group/scxa-droplet-quantification-workflow
[submodule "util/scxa-aggregation-workflow"]
path = w_aggregation
url = https://github.com/ebi-gene-expression-group/scxa-aggregation-workflow
[submodule "util/scxa-bundle-workflow"]
path = w_bundle
url = https://github.com/ebi-gene-expression-group/scxa-bundle-workflow
79 changes: 78 additions & 1 deletion README.md
@@ -1,2 +1,79 @@
# scxa-workflows v0.1.0

Higher-level repo aggregating all of the Single Cell Expression Atlas workflow
logic for execution purposes. Version 0.1.0 was used to run all data analysis for
[Release 6](https://www.ebi.ac.uk/gxa/sc/release-notes.html) of the
[Single Cell Expression Atlas](https://www.ebi.ac.uk/gxa/sc/home).

Alignment and quantification workflows are developed in Nextflow and can be
found in the `w_*_quantification` directories, whereas the clustering and
downstream analyses were built on Galaxy and can be found in the `w_*_clustering`
directories. All of the Galaxy tools used here are available from the
[Galaxy Toolshed](https://toolshed.g2.bx.psu.edu/view/ebi-gxa) to be installed
on any instance. The tools are also available for direct use on the
[Human Cell Atlas European Galaxy](https://humancellatlas.usegalaxy.eu/) instance.

## Organization

Each workflow type lives in a `w_`-prefixed directory, where the executor should find everything needed to run it, with execution configuration being injectable. The contents of the directory should include:

- Parameters
- Workflow definition
- Any software requisites (e.g. do you need to install a conda package for the executor of the workflow?)

The definition for fulfilling this is rather flexible: each item can be either the file itself or something that links to that information, provided the automatic executor can follow it.
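
As a hypothetical illustration, a flavour directory might look like the sketch below. `setup.sh` and `run_flavour.sh` are the entry points invoked by `bin/run_flavour_workflows.sh`; the other file names are examples, not mandated:

```
w_smart-seq_clustering/
├── setup.sh          # prepares the inputs YAML and writes extra env vars for the run
├── run_flavour.sh    # launches the workflow via galaxy-workflow-executor
├── inputs.yaml       # workflow parameters (example name)
└── workflow.ga       # Galaxy workflow definition (example name)
```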

### Technology

The `w_` prefix is followed by a technology name. The name should use dashes instead of underscores where a separator is needed. The technologies currently supported are:

- smart-seq
- smart-seq2
- smarter
- smart-like
- 10xv1*
- 10xv1a*
- 10xv1i*
- 10xv2
- drop-seq

\* Not supported in the first Alevin-based droplet pipeline.

### Phase of analysis

Given that quantification and clustering are currently separate phases, the `w_technology_` prefix is followed by a phase-of-analysis section. Currently:

- quantification
- clustering
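
Putting the two sections together: the Smart-seq quantification workflow lives in `w_smart-seq_quantification`, and its clustering counterpart in `w_smart-seq_clustering`.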

### Running the clustering part manually

To run the clustering part manually, set the following environment variables
(example below for E-ENAD-15):

```
export species=mus_musculus
export expName=E-ENAD-15
export matrix_file=$BUNDLE/raw/matrix.mtx.gz
export genes_file=$BUNDLE/raw/genes.tsv.gz
export barcodes_file=$BUNDLE/raw/barcodes.tsv.gz
export gtf_file=$REF/$species/Mus_musculus.GRCm38.95.gtf.gz
export tpm_matrix_file=$BUNDLE/tpm/matrix.mtx.gz
export tpm_genes_file=$BUNDLE/tpm/genes.tsv.gz
export tpm_barcodes_file=$BUNDLE/tpm/barcodes.tsv.gz
export create_conda_env=yes
export GALAXY_CRED_FILE=../galaxy_credentials.yaml
export GALAXY_INSTANCE=ebi_cluster
export FLAVOUR=w_smart-seq_clustering
```
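
The credentials file pointed to by `GALAXY_CRED_FILE` is consumed by the galaxy-workflow-executor. A minimal sketch follows, assuming the file maps instance names (such as the `ebi_cluster` key used above) to a Galaxy URL and API key; the exact field names and schema are defined by galaxy-workflow-executor, so check its documentation:

```
ebi_cluster:
    url: "https://galaxy.example.org/"  # assumed field name
    key: "<your Galaxy API key>"        # assumed field name
```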

If `create_conda_env` is set to `no`, the following script must be executed
in an environment that already has bioblend and Python 3 available. Then execute:

```
run_flavour_workflows.sh
```
40 changes: 40 additions & 0 deletions bin/run_flavour_workflows.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash
set -e

# This file runs a Galaxy clustering workflow for the given flavour
# (e.g. the smart-seq Scanpy clustering workflow)

# Experiment related, needs to be injected
export EXP_ID=${1:-$expName}
export EXP_SPECIE=${2:-$species}
export EXP_BUNDLE=${BUNDLE_PATH}

# Galaxy related, needs to be injected
export GALAXY_INSTANCE=${GALAXY_INSTANCE}
export GALAXY_CRED_FILE=${GALAXY_CRED_FILE}

export WORKDIR=${WORKDIR:-$(pwd)}

[ ! -z ${FLAVOUR+x} ] || ( echo "Env var FLAVOUR, matching one of the w_* directories for the type of workflow to be run, must be set." && exit 1 )
[ ! -z ${GALAXY_INSTANCE+x} ] || ( echo "Env var GALAXY_INSTANCE must be set." && exit 1 )
[ ! -z ${GALAXY_CRED_FILE+x} ] || ( echo "Env var GALAXY_CRED_FILE pointing to the credentials file must be set." && exit 1 )
[ ! -z ${EXP_SPECIE+x} ] || ( echo "Env var EXP_SPECIE for the species of the experiment must be set." && exit 1 )
[ ! -z ${EXP_ID+x} ] || ( echo "Env var EXP_ID for the id/accession of the experiment must be set." && exit 1 )
if [ ! -z "${EXP_BUNDLE}" ]; then
    [ ! -z ${REF_DIR+x} ] || ( echo "Env var REF_DIR for the reference genome annotation must be set." && exit 1 )
fi

scriptDir=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
export baseDir=$scriptDir/..

for mod in util util/galaxy-workflow-executor; do
PATH=$baseDir/$mod:$PATH
done
export PATH

# This is where additional variables defined during the inputs_yaml setup will be left
export envs_for_workflow_run=$WORKDIR/envs_for_workflow_run.sh

bash $baseDir/$FLAVOUR/setup.sh
source $envs_for_workflow_run

bash $baseDir/$FLAVOUR/run_flavour.sh
140 changes: 140 additions & 0 deletions exec/submitControlWorkflow.sh
@@ -0,0 +1,140 @@
#!/usr/bin/env bash

usage() { echo "Usage: $0 [-e <experiment ID>] [-q <re-use existing quantifications where present, yes or no>] [-a <re-use existing aggregations where present, yes or no>] [-t <tertiary workflow>] [-w <overwrite existing results, yes or no>]" 1>&2; }

e=
q=no
a=no
t=none
w=no

while getopts ":e:q:a:t:w:" o; do
case "${o}" in
e)
e=${OPTARG}
;;
q)
q=${OPTARG}
;;
a)
a=${OPTARG}
;;
t)
t=${OPTARG}
;;
w)
w=${OPTARG}
;;
        *)
            usage
            exit 1
            ;;
esac
done
shift $((OPTIND-1))

# Simple script to trigger SCXA workflow

expName=$e
skipQuantification=$q
skipAggregation=$a
tertiaryWorkflow=$t
overwrite=$w

workflow=scxa-control-workflow

# Change working dir for experiment-specific runs

workingDir="$SCXA_WORKFLOW_ROOT/work/${workflow}"
if [ -n "$expName" ]; then
workingDir="${workingDir}_$expName"
fi

cd $SCXA_WORKFLOW_ROOT

# Are we prod or test?

scxaBranch='master'
if [ "$SCXA_ENV" == 'test' ]; then
scxaBranch='develop'
fi

export SCXA_BRANCH=$scxaBranch

# Build the Nextflow command

expNamePart=
if [ -n "$expName" ]; then
expNamePart="--expName $expName"
fi

skipQuantificationPart=
if [ -n "$skipQuantification" ]; then
if [ "$skipQuantification" != 'yes' ] && [ "$skipQuantification" != 'no' ]; then
echo "Skip quantification (-q) must be 'yes' or 'no', $skipQuantification provided." 1>&2
exit 1
fi

skipQuantificationPart="--skipQuantification $skipQuantification"
fi

skipAggregationPart=
if [ -n "$skipAggregation" ]; then
if [ "$skipAggregation" != 'yes' ] && [ "$skipAggregation" != 'no' ]; then
echo "Skip aggregation (-a) must be 'yes' or 'no', $skipAggregation provided." 1>&2
exit 1
fi

skipAggregationPart="--skipAggregation $skipAggregation"
fi

overwritePart=
if [ -n "$overwrite" ]; then
if [ "$overwrite" != 'yes' ] && [ "$overwrite" != 'no' ]; then
echo "Overwrite (-w) must be 'yes' or 'no', $overwrite provided" 1>&2
exit 1
fi

overwritePart="--overwrite $overwrite"
fi

tertiaryWorkflowPart=
if [ -n "$tertiaryWorkflow" ]; then
tertiaryWorkflowPart="--tertiaryWorkflow $tertiaryWorkflow"
fi

nextflowCommand="nextflow run -N $SCXA_REPORT_EMAIL -r $scxaBranch -resume ${workflow} $expNamePart $skipQuantificationPart $skipAggregationPart $tertiaryWorkflowPart $overwritePart --enaSshUser fg_atlas_sc --sdrfDir $SCXA_SDRF_DIR -work-dir $workingDir"

# Run the LSF submission if it's not already running

bjobs -w | grep "${SCXA_ENV}_$workflow" > /dev/null 2>&1

if [ $? -ne 0 ]; then

    # If the workflow completed successfully we can clean up the work dir. If not,
    # the cache in the work dir will be useful for resuming

successMarker="$SCXA_WORKFLOW_ROOT/work/.success"

if [ -e "$successMarker" ]; then
echo "Previous run succeeded, cleaning up $workingDir"
mv $workingDir ${workingDir}_removing_$$
nohup rm -rf ${workingDir}_removing_$$ &
rm -rf $successMarker
fi

for subworkflow in scxa-droplet-quantification-workflow scxa-control-workflow scxa-smartseq-quantification-workflow scxa-aggregation-workflow scanpy-workflow scxa-bundle-workflow; do
nextflow pull $subworkflow -r $scxaBranch
done

echo "Submitting job"
rm -rf run.out run.err .nextflow.log*
bsub \
-J ${SCXA_ENV}_$workflow \
-M 4096 -R "rusage[mem=4096]" \
-u $SCXA_REPORT_EMAIL \
-o run.out \
-e run.err \
"$nextflowCommand"
else
echo "Workflow process already running"
fi
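
As a usage sketch, a submission for the E-ENAD-15 experiment used as an example earlier, re-using existing quantifications and aggregations, might look like the following (this assumes `SCXA_WORKFLOW_ROOT`, `SCXA_ENV`, `SCXA_REPORT_EMAIL` and `SCXA_SDRF_DIR` are set in the environment, and the `-t` value is a hypothetical example):

```
./exec/submitControlWorkflow.sh -e E-ENAD-15 -q yes -a yes -t scanpy-workflow -w no
```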
87 changes: 87 additions & 0 deletions util/choose_resolution_per_clustering.py
@@ -0,0 +1,87 @@
#!/usr/bin/env python3

"""
Given a set of clustering files, choose resolutions to use for colliding number of clusters, merge cluster files
into a single file and move markers from resolutions to applicable clusters number.
"""

import argparse
from os import listdir, path
import re
import shutil


def num_of_clusters(file_path):
    clusters = set()
    with open(file_path) as f:
        f.readline()  # discard header
        for line in f:
            cluster = line.split(sep="\t")[1]
            clusters.add(cluster)
    return len(clusters)


def main():
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-c', '--clusters-path',
required=True,
help='Path to where cluster files are found')
arg_parser.add_argument('-o', '--output-dir',
required=True,
help='Path for results')

args = arg_parser.parse_args()

    # Pattern to extract the resolution value from cluster files in clusters-path; supports integer or float resolutions.
    clusters_file_pat = re.compile(r'clusters_resolution_(?P<resolution>\d+\.?\d*)\.tsv')

clusters_to_res = {}
# Choose resolutions and map to cluster numbers
for cluster_file in listdir(args.clusters_path):
match = re.search(clusters_file_pat, cluster_file)
if match is not None:
resolution = float(match.group('resolution'))
clusters = num_of_clusters(args.clusters_path+"/"+cluster_file)
            if clusters in clusters_to_res:
                # among resolutions yielding the same number of clusters, keep
                # the one closest to 1 (approaching from either side)
                if clusters_to_res[clusters] > resolution >= 1 or 1 >= resolution > clusters_to_res[clusters]:
                    clusters_to_res[clusters] = resolution
            else:
                clusters_to_res[clusters] = resolution

# Merge all clusters for the selected resolutions into memory and
# create individual marker files for each clustering value
cells_set = set()
cells_clusters = {}
    for clusters, resolution in sorted(clusters_to_res.items()):
        print("Res: {} --> Cluster: {}".format(resolution, clusters))
        with open(args.clusters_path + "/clusters_resolution_" + str(resolution) + ".tsv", mode="r") as clusters_source:
            clusters_source.readline()  # discard header
            cluster_assignment = {}
            for line in clusters_source:
                cell, cluster_num = line.rstrip().split(sep="\t")
                cells_set.add(cell)
                cluster_assignment[cell] = cluster_num
        cells_clusters[clusters] = cluster_assignment

        source = args.clusters_path + "/markers_clusters_resolution_" + str(resolution) + ".csv"
        dest = args.output_dir + "/markers_" + str(clusters) + ".csv"
        if path.isfile(source):
            shutil.copy(source, dest)

print("Cell set has {} entries".format(len(cells_set)))
print("Cell clusters has {} entries".format(len(cells_clusters)))
    # Write the complete clusters file in order
    with open(args.output_dir + "/clusters_for_bundle.txt", mode="w") as clusters_output:
        clusters_output.write("sel.K\tK\t" + "\t".join(sorted(cells_set)) + "\n")
        for k, cluster_assignment in sorted(cells_clusters.items()):
            selected = 'TRUE' if float(clusters_to_res[k]) == 1.0 else 'FALSE'
            clusters_output.write("\t".join([selected, str(k)]))
            print("Cluster assignments for k {} has {} entries".format(k, len(cluster_assignment)))
            for cell in sorted(cells_set):
                clusters_output.write("\t" + str(cluster_assignment[cell]))
            clusters_output.write("\n")


if __name__ == '__main__':
main()
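
A usage sketch (the directory names are hypothetical; the script looks for `clusters_resolution_<res>.tsv` and, where present, `markers_clusters_resolution_<res>.csv` files in the clusters path):

```
python3 util/choose_resolution_per_clustering.py \
    -c clustering_results/ \
    -o bundle_input/
```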
16 changes: 16 additions & 0 deletions util/create_conda_env.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

conda_env_name=${1:-$CONDA_ENV}
conda_packages=${2:-$CONDA_PACKAGES}
conda_channels=${3:-$CONDA_CHANNELS}

conda env list | grep "$conda_env_name" > /dev/null

if [ $? -eq 1 ]; then
# conda environment doesn't exist, create it
if [ ! -z ${conda_channels+x} ]; then
channels="-c $conda_channels"
fi
conda create -y $channels -n $conda_env_name $conda_packages
fi
# we currently trust that if the environment with that name is created, then it contains what we need.
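
A usage sketch (the environment name and package list are hypothetical examples; values can also be supplied via the `CONDA_ENV`, `CONDA_PACKAGES` and `CONDA_CHANNELS` environment variables instead of positional arguments):

```
bash util/create_conda_env.sh bioblend-env "python=3 bioblend" bioconda
```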
1 change: 1 addition & 0 deletions util/galaxy-workflow-executor
6 changes: 6 additions & 0 deletions util/manifest_handling.sh
@@ -0,0 +1,6 @@
# Given a tab-separated manifest, print the file path (column 2) of the row
# whose description (column 1) and parameter name (column 3) match the arguments.
function file_for_desc_param() {
    manifest_path=$1
    description=$2
    parameter=$3
    awk -F'\t' -v desc="$description" -v param="$parameter" '$1 == desc && $3 == param { print $2 }' "$manifest_path"
}
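
A usage sketch with a hypothetical three-column manifest (description, file path, parameter; tab-separated):

```
# manifest.tsv (example content):
#   raw-matrix	/data/bundle/raw/matrix.mtx.gz	matrix_file
source util/manifest_handling.sh
matrix_file=$(file_for_desc_param manifest.tsv raw-matrix matrix_file)
```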
1 change: 1 addition & 0 deletions w_aggregation
Submodule w_aggregation added at d7c54a
1 change: 1 addition & 0 deletions w_bundle
Submodule w_bundle added at b10b16