Skip to content

Commit

Permalink
Merge pull request #17 from ENCODE-DCC/3486-spp1-14
Browse files Browse the repository at this point in the history
3486 spp1 14
  • Loading branch information
strattan authored Oct 13, 2016
2 parents c99af4a + bafbe83 commit 0f30f40
Show file tree
Hide file tree
Showing 21 changed files with 749 additions and 624 deletions.
42 changes: 32 additions & 10 deletions dnanexus/accession_analysis/src/accession_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,11 +774,22 @@ def get_raw_mapping_stages(mapping_analysis, keypair, server, fqcheck, repn):
'crop_length %s. Inferring mapped_read_length from fastqs'
% (crop_length))
native_lengths = set([fq.get('read_length') for fq in fastqs])
assert (len(native_lengths) == 1 and
all([isinstance(rl, int) for rl in native_lengths])), \
('fastqs with different or non-integer read_lengths: %s'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
try:
assert (len(native_lengths) == 1 and
all([isinstance(rl, int) for rl in native_lengths])), \
('fastqs with different or non-integer read_lengths: %s'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
except AssertionError:
if fqcheck:
raise
else:
logger.warning(
'fastqs with different or non-integer read_lengths: %s But fqcheck is False so ignoring'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
except:
raise
mapped_read_length = int(next(l for l in native_lengths))
else:
mapped_read_length = int(crop_length)
Expand Down Expand Up @@ -948,11 +959,22 @@ def get_mapping_stages(mapping_analysis, keypair, server, fqcheck, repn):
'crop_length %s. Inferring mapped_read_length from fastqs'
% (crop_length))
native_lengths = set([fq.get('read_length') for fq in fastqs])
assert (len(native_lengths) == 1 and
all([isinstance(rl, int) for rl in native_lengths])), \
('fastqs with different or non-integer read_lengths: %s'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
try:
assert (len(native_lengths) == 1 and
all([isinstance(rl, int) for rl in native_lengths])), \
('fastqs with different or non-integer read_lengths: %s'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
except AssertionError:
if fqcheck:
raise
else:
logger.warning(
'fastqs with different or non-integer read_lengths: %s But fqcheck is False so ignoring'
% ([(fq.get('accession'), fq.get('read_length'))
for fq in fastqs]))
except:
raise
mapped_read_length = int(next(l for l in native_lengths))
else:
mapped_read_length = int(crop_length)
Expand Down
11 changes: 11 additions & 0 deletions dnanexus/bioconductor_asset/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Makefile for building the DNAnexus "bioconductorAsset" (R/Bioconductor needed by SPP 1.14).
SHELL=/bin/bash -e

all:
# Trust the signing key for this repo. Reference: http://cran.rstudio.com/bin/linux/ubuntu/README.html
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9
# NOTE(review): presumably removes DNAnexus apt configuration so apt behaves normally during the build -- confirm
sudo rm -f /etc/apt/apt.conf.d/99dnanexus
sudo pip install biopython
# install bioconductor and its packages
#R -e "install.packages(c('ape', 'biom', 'optparse', 'RColorBrewer', 'randomForest', 'vegan'), repos='http://cran.rstudio.com/'); source('http://bioconductor.org/biocLite.R'); biocLite(); biocLite(c('DESeq2', 'metagenomeSeq'))"
# run the Bioconductor install script non-interactively under R
sudo R < installbioc.R --save

9 changes: 9 additions & 0 deletions dnanexus/bioconductor_asset/dxasset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "bioconductorAsset",
"title": "Bioconductor",
"description": "R package bioconductor, required for SPP 1.14",
"version": "0.0.1",
"distribution": "Ubuntu",
"instanceType": "mem1_ssd1_x2",
"release": "12.04"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# installbioc.R -- install the CRAN dependencies and Bioconductor packages
# required by the bioconductorAsset; invoked from the Makefile as `R < installbioc.R --save`.
## try http:// if https:// URLs are not supported
install.packages(c("caTools", "snow"),repos="http://cran.us.r-project.org")
#source("https://bioconductor.org/biocLite.R")
source("http://bioconductor.org/biocLite.R")
# install base Bioconductor, then Rsamtools, without interactive prompts
biocLite(ask=FALSE)
biocLite("Rsamtools",ask=FALSE)
4 changes: 3 additions & 1 deletion dnanexus/call_chip_from_tas.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def get_args():
# parser.add_argument('--idr', help="Run IDR. If not specified, run IDR for non-histone targets.", default=False, action='store_true')
# parser.add_argument('--idrversion', help="IDR version (relevant only if --idr is specified", default="2")
parser.add_argument('--dryrun', help="Formulate the run command, but don't actually run", default=False, action='store_true')
parser.add_argument('--spp_version', help="spp version", default="1.14")

args = parser.parse_args()

Expand Down Expand Up @@ -622,7 +623,8 @@ def main():
'--rep2 %s' % (tas['rep2_ta'].get('file_id')),
'--ctl1 %s' % (tas['rep1_ta'].get('control_id')),
'--ctl2 %s' % (tas['rep2_ta'].get('control_id')),
'--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes)]
'--genomesize %s --chrom_sizes "%s"' % (genomesize, chrom_sizes),
'--spp_version %s' % (args.spp_version)]

if blacklist:
command_strings.append('--blacklist "%s"' % (blacklist))
Expand Down
16 changes: 12 additions & 4 deletions dnanexus/chip_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,10 @@ def t_or_f(arg):
'--yes',
help='Run the workflow',
default=False, action='store_true')
parser.add_argument(
'--spp_version',
help="Version string for spp")

# parser.add_argument('--idr', help='Report peaks with and without IDR analysis', default=False, action='store_true')
# parser.add_argument('--idronly', help='Only report IDR peaks', default=None, action='store_true')
# parser.add_argument('--idrversion', help='Version of IDR to use (1 or 2)', default="2")
Expand Down Expand Up @@ -543,7 +547,8 @@ def main():
folder=xcor_output_folder,
stage_input={
'input_bam': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'filtered_bam'}),
'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'})
'paired_end': dxpy.dxlink({'stage': filter_qc_stage_id, 'outputField': 'paired_end'}),
'spp_version': args.spp_version
}
)
mapping_superstage.update({'xcor_stage_id': xcor_stage_id})
Expand Down Expand Up @@ -603,7 +608,8 @@ def main():
folder=xcor_output_folder,
stage_input={
'input_tagAlign': exp_rep1_ta,
'paired_end': rep1_paired_end
'paired_end': rep1_paired_end,
'spp_version': args.spp_version
}
)
xcor_only_stages.append({'xcor_only_rep1_id': exp_rep1_cc_stage_id})
Expand All @@ -617,7 +623,8 @@ def main():
folder=xcor_output_folder,
stage_input={
'input_tagAlign': exp_rep2_ta,
'paired_end': rep2_paired_end
'paired_end': rep2_paired_end,
'spp_version': args.spp_version
}
)
xcor_only_stages.append({'xcor_only_rep2_id': exp_rep2_cc_stage_id})
Expand Down Expand Up @@ -671,7 +678,8 @@ def main():
'rep1_paired_end': rep1_paired_end,
'rep2_paired_end': rep2_paired_end,
'as_file': dxpy.dxlink(resolve_file(args.narrowpeak_as)),
'idr_peaks': True
'idr_peaks': True,
'spp_version': args.spp_version
}
if chrom_sizes:
peaks_stage_input.update({'chrom_sizes': chrom_sizes})
Expand Down
111 changes: 67 additions & 44 deletions dnanexus/encode_idr/src/encode_idr.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,45 +13,49 @@
# DNAnexus Python Bindings (dxpy) documentation:
# http://autodoc.dnanexus.com/bindings/python/current/

import os, subprocess, logging, re, shlex, sys
import subprocess
import shlex
import dxpy
import common
import logging

logger = logging.getLogger(__name__)
logger.addHandler(dxpy.DXLogHandler())
logger.propagate = False


def blacklist_filter(input_fname, output_fname, input_blacklist_fname):

with open(input_fname, 'rb') as fh:
gzipped = fh.read(2) == b'\x1f\x8b'
if gzipped:
peaks_fname = 'peaks.bed'
out,err = common.run_pipe(['gzip -dc %s' %(input_fname)], peaks_fname)
out, err = \
common.run_pipe(['gzip -dc %s' % (input_fname)], peaks_fname)
else:
peaks_fname = input_fname

with open(input_blacklist_fname, 'rb') as fh:
gzipped = fh.read(2) == b'\x1f\x8b'
if gzipped:
blacklist_fname = 'blacklist.bed'
out, err = common.run_pipe(['gzip -dc %s' %(input_blacklist_fname)], blacklist_fname)
out, err = \
common.run_pipe(['gzip -dc %s' % (input_blacklist_fname)], blacklist_fname)
else:
blacklist_fname = input_blacklist_fname

out, err = common.run_pipe([
'subtractBed -A -a %s -b %s' %(peaks_fname, blacklist_fname)
'subtractBed -A -a %s -b %s' % (peaks_fname, blacklist_fname)
], output_fname)
#subprocess.check_call(shlex.split('cp %s %s' %(peaks_fname, output_fname)))


@dxpy.entry_point("main")
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
chrom_sizes, as_file, blacklist=None,
rep1_signal=None, rep2_signal=None, pooled_signal=None):

#TODO for now just taking the peak files. This applet should actually call IDR instead of
#putting that in the workflow populator script

# Initialize the data object inputs on the platform into
# dxpy.DXDataObject instances.
# TODO for now just taking the peak files. This applet should actually
# call IDR instead of putting that in the workflow populator script

reps_peaks_file = dxpy.DXFile(reps_peaks)
r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
Expand All @@ -61,17 +65,15 @@ def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
as_file_file = dxpy.DXFile(as_file)
if blacklist is not None:
blacklist_file = dxpy.DXFile(blacklist)
blacklist_filename = 'blacklist_%s' %(blacklist_file.name)
blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
blacklist_filename = common.uncompress(blacklist_filename)

# Download the file inputs to the local file system.

#Need to prepend something to ensure the local filenames will be unique
reps_peaks_filename = 'true_%s' %(reps_peaks_file.name)
r1pr_peaks_filename = 'r1pr_%s' %(r1pr_peaks_file.name)
r2pr_peaks_filename = 'r2pr_%s' %(r2pr_peaks_file.name)
pooledpr_peaks_filename = 'pooledpr_%s' %(pooledpr_peaks_file.name)
# Need to prepend something to ensure the local filenames will be unique
reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
chrom_sizes_filename = chrom_sizes_file.name
as_file_filename = as_file_file.name

Expand All @@ -82,47 +84,61 @@ def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

print subprocess.check_output('ls -l', shell=True)
subprocess.check_output('set -x; ls -l', shell=True)

reps_peaks_filename = common.uncompress(reps_peaks_filename)
r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

Nt = common.count_lines(reps_peaks_filename)
print "%d peaks from true replicates" %(Nt)
logger.info("%d peaks from true replicates (Nt)" % (Nt))
N1 = common.count_lines(r1pr_peaks_filename)
print "%d peaks from rep1 self-pseudoreplicates" %(N1)
logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
N2 = common.count_lines(r2pr_peaks_filename)
print "%d peaks from rep2 self-pseudoreplicates" %(N2)
logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
Np = common.count_lines(pooledpr_peaks_filename)
print "%d peaks from pooled pseudoreplicates" %(Np)
logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

conservative_set_filename = '%s_final_conservative.narrowPeak' %(experiment)
# generate the conservative set, which is always based on the IDR peaks
# from true replicates
conservative_set_filename = \
'%s_final_conservative.narrowPeak' % (experiment)
if blacklist is not None:
blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename)
blacklist_filter(reps_peaks_filename, conservative_set_filename,
blacklist_filename)
Ncb = common.count_lines(conservative_set_filename)
logger.info(
"%d peaks blacklisted from the conservative set" % (Nt-Ncb))
else:
conservative_set_filename = reps_peaks_filename
Ncb = common.count_lines(conservative_set_filename)
print "%d peaks blacklisted from the conservative set" %(Nt-Ncb)

subprocess.check_output(shlex.split(
'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
Ncb = Nt
logger.info("No blacklist filter applied to the conservative set")

# generate the optimal set, which is based on the longest of IDR peaks
# list from true reps or the IDR peaks from the pseudoreplicates of the
# pool
if Nt >= Np:
peaks_to_filter_filename = reps_peaks_filename
No = Nt
else:
peaks_to_filter_filename = pooledpr_peaks_filename
No = Np

optimal_set_filename = '%s_final_optimal.narrowPeak' %(experiment)
optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
if blacklist is not None:
blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename)
blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
blacklist_filename)
Nob = common.count_lines(optimal_set_filename)
logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
else:
optimal_set_filename = peaks_to_filter_filename
Nob = common.count_lines(optimal_set_filename)
print "%d peaks blacklisted from the optimal set" %(No-Nob)
subprocess.check_output(shlex.split(
'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
Nob = No
logger.info("No blacklist filter applied to the optimal set")

rescue_ratio = float(max(Np,Nt)) / float(min(Np,Nt))
self_consistency_ratio = float(max(N1,N2)) / float(min(N1,N2))
rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

if rescue_ratio > 2 and self_consistency_ratio > 2:
reproducibility = 'fail'
Expand All @@ -133,15 +149,22 @@ def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,

output = {}

#bedtobigbed often fails, so skip creating the bb if it does
conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename)
optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename)
# bedtobigbed often fails, so skip creating the bb if it does
conservative_set_bb_filename = \
common.bed2bb(conservative_set_filename, chrom_sizes_filename,
as_file_filename)
optimal_set_bb_filename = \
common.bed2bb(optimal_set_filename, chrom_sizes_filename,
as_file_filename)
if conservative_set_bb_filename:
conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
conservative_set_bb_output = \
dxpy.upload_local_file(conservative_set_bb_filename)
output.update(
{"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
if optimal_set_bb_filename:
optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})
output.update(
{"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

output.update({
"Nt": Nt,
Expand Down
Loading

0 comments on commit 0f30f40

Please sign in to comment.