From 14de666b8364b0e84b578c723e9647b2c08beba1 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 14:13:45 -0500 Subject: [PATCH 01/10] Separate package installation from R/binary dependency management --- README.md | 2 +- pisces/__init__.py | 37 +++++++++++++++++++++++++++++++++++- pisces/cli.py | 17 ----------------- setup.py | 47 +--------------------------------------------- 4 files changed, 38 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index 5e2b8f9..6f64e15 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Matthew D Shirley, Viveksagar K Radhakrishna, Javad Golji, Joshua M Korn. (Prepr Installation: ``` $ pip install --user novartis-pisces -$ pisces dependencies +$ pisces_dependencies ``` Submitting jobs to an HPC cluster: diff --git a/pisces/__init__.py b/pisces/__init__.py index 6d7c300..8d62550 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -22,11 +22,46 @@ unique_id = ''.join(random.choice(string.digits) for _ in range(10)) - def find_data_directory(): """ Returns the path to the module directory """ return os.path.dirname(os.path.abspath(__file__)) +def install_salmon(): + """Install a version of salmon from bioconda""" + import glob + import requests + from io import BytesIO + from urllib.request import urlopen + from shutil import rmtree + from subprocess import call + + redist = os.path.join(find_data_directory(), 'redist') + rmtree(os.path.join(redist, "salmon"), ignore_errors=True) + + if platform.system() == "Linux": + salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2" + elif platform.system() == "Darwin": + salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2" + + print("Installing salmon") + with requests.get(salmon_url, allow_redirects=True) as response: + tar_file = BytesIO(response.content) + tar_file.seek(0) + with tarfile.open(fileobj=tar_file) as tar: + tar.extractall(path=os.path.join(redist, "salmon")) + +def install_r_dependencies(): + """Install R dependencies""" + cmd = [ + 'Rscript', + os.path.join(find_data_directory(), 'R/set_up_dependencies.R') + ] + call(cmd) + +def install_dependencies(): + install_salmon() + install_r_dependencies() + def sra_valid_accession(accession): """ Test whether a string is an SRA accession """ if accession.startswith('SRR') and len(accession) == 10: diff --git a/pisces/cli.py b/pisces/cli.py index 3dabd81..518bdb0 100755 --- a/pisces/cli.py +++ b/pisces/cli.py @@ -59,16 +59,6 @@ def call_Rscript(args, unknown_args): ] cmd.extend(unknown_args) call(cmd) - -def call_Rscript_dependencies(args, unknown_args): - """ Call an R script, passing through arguments from argparse. """ - cmd = [ - 'Rscript', - os.path.join(data_dir, 'R/set_up_dependencies.R') - ] - cmd.extend(unknown_args) - call(cmd) - def default_species_index(conf): """ Return a list of the default index builds for each species defined in args.conf """ @@ -249,13 +239,6 @@ def create_parser(args=None): "Compile an expression matrix from multiple individual PISCES runs", add_help=False) matrix.set_defaults(func=call_Rscript) - - dependencies = subparsers.add_parser( - 'dependencies', - help= - "Install R dependencies for PISCES", - add_help=False) - dependencies.set_defaults(func=call_Rscript_dependencies) qc = subparsers.add_parser( 'summarize-qc', diff --git a/setup.py b/setup.py index 79a07dc..b5d3bdd 100644 --- a/setup.py +++ b/setup.py @@ -6,58 +6,13 @@ import logging import platform from setuptools import setup -from setuptools.command.develop import develop -from setuptools.command.install import install install_requires = [ 'six', 'setuptools >= 0.7', 'simplesam >= 0.0.3', 'tqdm', 'fastqp >= 0.2', 'pandas', 'strandex', 'gffutils', 'pyfaidx', 'drmaa', 'twobitreader', 'requests' ] - -class PostDevelopCommand(develop): - def run(self): - develop.run(self) - install_binary_dependencies() - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_binary_dependencies() - - -def install_binary_dependencies(): - sys.path.pop(0) # ignore local search path - import pisces - import glob - import requests - from io import BytesIO - from urllib.request import urlopen - from shutil import rmtree - from subprocess import call - - redist = os.path.join(os.path.dirname(pisces.__file__), 'redist') - rmtree(os.path.join(redist, "salmon"), ignore_errors=True) - local_redist = os.path.join(os.path.dirname(__file__), 'pisces/redist') - - if platform.system() == "Linux": - salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/linux-64/salmon-1.3.0-hf69c8f4_0.tar.bz2" - elif platform.system() == "Darwin": - salmon_url = "https://anaconda.org/bioconda/salmon/1.3.0/download/osx-64/salmon-1.3.0-hb70dc8d_0.tar.bz2" - - print("Installing salmon") - with requests.get(salmon_url, allow_redirects=True) as response: - tar_file = BytesIO(response.content) - tar_file.seek(0) - with tarfile.open(fileobj=tar_file) as tar: - tar.extractall(path=os.path.join(redist, "salmon")) - setup( - cmdclass={ - 'develop': PostDevelopCommand, - 'install': PostInstallCommand, - }, name='novartis-pisces', author='Matthew Shirley', author_email='matt_d.shirley@novartis.com', @@ -70,7 +25,7 @@ def install_binary_dependencies(): install_requires=install_requires, use_scm_version={"local_scheme": "no-local-version"}, setup_requires=['setuptools_scm'], - entry_points={'console_scripts': ['pisces = pisces.cli:main']}, + entry_points={'console_scripts': ['pisces = pisces.cli:main', 'pisces_dependencies = pisces:install_dependencies']}, classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", From e0177a7571ad99e84470d807d4dc6e8b02e102d3 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 14:35:14 -0500 Subject: [PATCH 02/10] missing import --- pisces/__init__.py | 1 + setup.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 8d62550..f0b38e3 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -11,6 +11,7 @@ import string import random import shutil +import platform from pprint import pformat from subprocess import Popen, PIPE from multiprocessing import Process diff --git a/setup.py b/setup.py index b5d3bdd..53238d5 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,6 @@ import zipfile import subprocess import logging -import platform from setuptools import setup install_requires = [ From dea7a0b9419e14a09efe3634bb2650146ef6a6ba Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 19:35:56 -0500 Subject: [PATCH 03/10] Move imports --- pisces/__init__.py | 1 + setup.py | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index f0b38e3..6ce91e4 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -12,6 +12,7 @@ import random import shutil import platform +import tarfile from pprint import pformat from subprocess import Popen, PIPE from multiprocessing import Process diff --git a/setup.py b/setup.py index 53238d5..32e0329 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,3 @@ -import os -import sys -import tarfile -import zipfile -import subprocess -import logging from setuptools import setup install_requires = [ From 47335dd8a43d876c287a0a07780386f0c68cf95d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 4 Mar 2021 19:51:37 -0500 Subject: [PATCH 04/10] import call --- pisces/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 6ce91e4..2858ec3 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -14,7 +14,7 @@ import platform import tarfile from pprint import pformat -from subprocess import Popen, PIPE +from subprocess import Popen, PIPE, call from multiprocessing import Process from tempfile import NamedTemporaryFile, mkdtemp from functools import partial From 0e44850d0931a66c9f9f3cc0191723b43d80bd8d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 10:49:22 -0400 Subject: [PATCH 05/10] Negate filtering --- pisces/R/make_expression_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index 369b302..c071679 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -389,7 +389,7 @@ deseq_analysis <- function(contrasts, txi.gene, contrast.metadata, formula, fail } else { deseq.dataset <- DESeqDataSetFromTximport(txi.gene, contrast.metadata, as.formula(paste0("~", contrasts$Factor[1])))} message(paste("Filtering", length(failing.quartile.filter), "genes failing --quartile-expression cutoff from DESeq2 dataset.")) - deseq.dataset <- deseq.dataset[failing.quartile.filter,] + deseq.dataset <- deseq.dataset[-failing.quartile.filter,] message("Running DESeq2...") deseq.dataset <- DESeq(deseq.dataset) From f321fc1456cca931ff1f01c9bd1a8a92d16b9e14 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 10:54:48 -0400 Subject: [PATCH 06/10] Better document first_quartile --- pisces/R/make_expression_matrix.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index c071679..e4155fb 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -269,7 +269,14 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { "genes.")) ribo.genes <- which(grepl("^RP[SL]", scaled_df[, "symbol"], ignore.case = T)) - first_quartile <- function(x) { y <- quantile(x, c(0.25, 0.5, 0.75), type=1) + + + first_quartile <- function(x) { + # > seq(10) + # [1] 1 2 3 4 5 6 7 8 9 10 + # > first_quartile(seq(10)) + #[1] 3 + y <- quantile(x, c(0.25, 0.5, 0.75), type=1) return(y[[1]]) } From bfc6805d9e66a03d9d0ea82ab15695b43756c342 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 11:01:40 -0400 Subject: [PATCH 07/10] Take the top quantile instead of bottom --- pisces/R/make_expression_matrix.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index e4155fb..b7dda00 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -275,9 +275,9 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { # > seq(10) # [1] 1 2 3 4 5 6 7 8 9 10 # > first_quartile(seq(10)) - #[1] 3 + #[1] 8 y <- quantile(x, c(0.25, 0.5, 0.75), type=1) - return(y[[1]]) + return(y[[3]]) } message(paste("Excluding", length(ribo.genes), "ribosomal genes from TMM scaling factor calulation.")) From ab6760a1e72604a1dca873375bac81646c772a08 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Wed, 24 Mar 2021 11:09:29 -0400 Subject: [PATCH 08/10] function doc --- pisces/R/make_expression_matrix.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pisces/R/make_expression_matrix.R b/pisces/R/make_expression_matrix.R index b7dda00..4f74ca4 100755 --- a/pisces/R/make_expression_matrix.R +++ b/pisces/R/make_expression_matrix.R @@ -272,10 +272,8 @@ Summarize <- function(txi, tx2gene, annotation, args, metadata, species) { first_quartile <- function(x) { - # > seq(10) - # [1] 1 2 3 4 5 6 7 8 9 10 - # > first_quartile(seq(10)) - #[1] 8 + # > first_quartile(c(0,0,0,0,0,0,0,1,10,10)) + # [1] 1 y <- quantile(x, c(0.25, 0.5, 0.75), type=1) return(y[[3]]) } From 89aadee0b196c569729358fc7b8300efbab6d4c5 Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Thu, 15 Apr 2021 10:50:55 -0400 Subject: [PATCH 09/10] Fix exon ordering for minus strand transcripts in #15 --- pisces/index.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pisces/index.py b/pisces/index.py index 62184b6..5d2dc0f 100755 --- a/pisces/index.py +++ b/pisces/index.py @@ -266,10 +266,16 @@ def features_to_string(features, fasta_in, masked=True, strand=True): """ """ sequences = [] + feature_strand = "." for feature in features: + feature_strand = feature.strand sequences.append( feature.sequence( fasta_in, use_strand=strand)) + # if the transcript is on the reverse strand, reverse order of exons + # before concatenating + if feature_strand == "-": + sequences = sequences[::-1] seq = ''.join(sequences) mask_count = sum(seq.count(a) for a in soft_chars) if masked: From f0a0f0f3a7c07e767cd3b1006ae2fa5c7116c17d Mon Sep 17 00:00:00 2001 From: Matt Shirley Date: Fri, 9 Jul 2021 14:44:43 -0400 Subject: [PATCH 10/10] Enable selective alignment and gcBias flags --- pisces/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pisces/__init__.py b/pisces/__init__.py index 2858ec3..b41e8d2 100755 --- a/pisces/__init__.py +++ b/pisces/__init__.py @@ -271,7 +271,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2, 'bin', 'salmon'), '--no-version-check', 'quant', '-q', '--index', index, '--libType', libtype, '--mates1', ' '.join(read1), '--mates2', ' '.join(read2), '--output', output_dir, '--threads', - str(threads), '--seqBias', '--useVBOpt' + str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq' ] elif not read2: cmd = [ @@ -279,7 +279,7 @@ def format_salmon_command(libtype, threads, index, output_dir, read1, read2, 'bin', 'salmon'), '--no-version-check', 'quant', '-q', '--index', index, '--libType', libtype, '-r', ' '.join(read1), '--output', output_dir, '--threads', - str(threads), '--seqBias', '--useVBOpt' + str(threads), '--seqBias', '--gcBias', '--validateMappings', '--dumpEq' ] if make_bam: import shlex