Merge pull request #3 from malariagen/dev_prep: data prep script dev

Showing 6 changed files with 449 additions and 127 deletions.
.gitignore
@@ -1,5 +1,6 @@
 .DS_Store
 test_data/
+*.code-workspace

 # Byte-compiled / optimized / DLL files
 __pycache__/
README.md
@@ -1,36 +1,72 @@
 # anospp-analysis
 Python package for ANOSPP data analysis

-## Usage
+## Installation

 TODO
-## Development

-Clone this repository
-```
-git clone git@github.com:malariagen/anospp-analysis.git
-```
+## Documentation

-Install poetry
-```
-curl -sSL https://install.python-poetry.org | python3 -
-```
+TODO

-Create development environment
+## Development

+### Setup

+Installation is hybrid with conda + poetry:
 ```
+git clone git@github.com:malariagen/anospp-analysis.git
+cd anospp-analysis
+mamba env create -f environment.yml
+conda activate anospp_analysis
+poetry install
 ```
-Activate development environment

+Activate development environment:
 ```
 poetry shell
 ```
-Example wrapper script run

+### Usage & testing

+The code in this repository can be accessed via wrapper scripts:
 ```
-python anospp_analysis/qc.py --haplotypes test_data/haplotypes.tsv \
+python anospp_analysis/qc.py \
+--haplotypes test_data/haplotypes.tsv \
 --samples test_data/samples.csv \
 --stats test_data/stats.tsv \
 --outdir test_data/qc
 ```

-TODO checks & hooks
+Besides, individual functions are available as an API

+TODO testing & CI

+### Adding Python deps

+Introducing python dependencies should be done via poetry:
+```
+poetry add package_name
+```
+This should update both `pyproject.toml` and `poetry.lock` files

+If the package should be used in development environment only, use
+```
+poetry add package_name --dev
+```

+### Adding non-Python deps

+Introducing non-python dependencies should be done via conda: edit `environment.yml`,
+then re-create the conda environment and poetry deps:
+```
+mamba env create -f environment.yml
+conda activate anospp_analysis
+poetry install
+```

+Changes in conda environment might also introduce changes to the python installation,
+in which case one should update poetry lock file
+```
+poetry lock
+```
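
The README notes that, besides the wrapper scripts, individual functions are available as an API. A minimal sketch of that pattern, using `get_hap_df` from the prep script added below (the module path `anospp_analysis.prep` and the file paths are assumptions):

```
from anospp_analysis.prep import get_hap_df

# assumes cutadapt_deplex has already written ASV_*.fa files into work/
hap_df = get_hap_df(
    dada_table='test_data/dada_table.tsv',     # hypothetical paths
    demult_dir='work',
    samples_manifest='test_data/samples.csv',
)
print(hap_df.head())
```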
anospp_analysis/prep.py
@@ -0,0 +1,136 @@
import pandas as pd
import argparse
import subprocess
import glob
import sys
import os
import logging

# repo-local helpers: setup_logging and CUTADAPT_TARGETS are expected from here
from util import *

# optimised cutadapt args
CUTADAPT_ARGS = '-O 10 --match-read-wildcards'

def cutadapt_deplex(dada_table, adapters, cutadapt_args=CUTADAPT_ARGS, work_dir='work'):
    '''
    Write DADA2 ASV sequences to fasta and demultiplex them by target
    with cutadapt, one output fasta per adapter name
    '''
    logging.info('cutadapt deplexing started')

    os.makedirs(work_dir, exist_ok=True)

    # generate input fasta from DADA2 table
    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    fasta = f'{work_dir}/input_seqs.fasta'

    with open(fasta, 'w') as outfile:
        for seqid, seq in dada_df['sequence'].items():
            outfile.write(f'>{seqid}\n{seq}\n')

    cmd = f"cutadapt {cutadapt_args} -g file:{adapters} "
    cmd += f"-o {work_dir}/ASV_{{name}}.fa {fasta}"

    process = subprocess.run(cmd.split(), capture_output=True, text=True)
    process.check_returncode()

    logging.info('cutadapt deplexing complete')

    return 0

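# For reference, with the defaults above the assembled command has this shape
# (the adapters file name here is hypothetical):
#   cutadapt -O 10 --match-read-wildcards -g file:adapters.fa \
#       -o work/ASV_{name}.fa work/input_seqs.fasta
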
def get_deplex_df(deplex_dir):
    '''
    Read demultiplexed fasta into dataframe with
    seqid, target, trimmed sequence
    '''
    logging.info('parsing deplexed sequences')

    # collect records into a list so that duplicate seqids across targets
    # survive until the uniqueness check below
    deplex_records = []
    # iterate over deplexed fasta files
    for fa in sorted(glob.glob(f'{deplex_dir}/ASV_*.fa')):
        target = fa.split('/')[-1].split('.')[0].split('_')[1]
        if target not in CUTADAPT_TARGETS:
            logging.warning(f'Target {target} not recognised for {fa}, skipping')
            continue
        # basic parser, assumes single-line sequences as written by cutadapt_deplex
        with open(fa) as f:
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    seqid = line[1:]
                else:
                    deplex_records.append((seqid, target, line))
        # proper parser
        # for record in SeqIO.parse(fa, format='fasta'):
        #     deplex_records.append((record.name, target, str(record.seq)))
    deplex_df = pd.DataFrame(deplex_records,
        columns=['seqid', 'target', 'trimmed_sequence']).set_index('seqid')
    dup_seqid = deplex_df.index.duplicated()
    if dup_seqid.any():
        raise ValueError(f'Duplicate seqids generated by dada2: {deplex_df.index[dup_seqid]}')
    return deplex_df

def get_hap_df(dada_table, demult_dir, samples_manifest):
    '''
    Combine DADA2 output with deplexing info into a long-format
    haplotype dataframe; samples_manifest is accepted but not yet used
    '''
    deplex_df = get_deplex_df(demult_dir)

    logging.info('combining dada output with deplexing info')

    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    dada_deplex_df = pd.merge(dada_df, deplex_df, left_index=True, right_index=True)
    dada_deplex_df.index.name = 'dada2_id'
    assert dada_deplex_df.shape[0] == dada_df.shape[0] == deplex_df.shape[0], \
        'Lost some sequences in combining deplexing and original DADA2 data'

    # wide to long: one row per (sample, ASV), zero read counts dropped
    hap_df = pd.melt(dada_deplex_df.reset_index(),
        id_vars=['dada2_id','sequence','target','trimmed_sequence'],
        var_name='sample_id',
        value_name='reads')
    hap_df = hap_df[hap_df.reads > 0].copy()
    hap_df['target'] = hap_df.target.astype(str)
    hap_df.rename(columns={
        'sequence':'untrimmed_sequence',
        'trimmed_sequence':'consensus'
        }, inplace=True)

    return hap_df

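# For reference, get_hap_df yields one row per (sample, ASV) with columns:
# dada2_id, untrimmed_sequence, target, consensus, sample_id, reads
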
def prep_dada2(args):

    setup_logging(verbose=args.verbose)

    logging.info('ANOSPP data prep started')

    cutadapt_deplex(args.dada_table, args.adapters, CUTADAPT_ARGS, args.work_dir)

    hap_df = get_hap_df(args.dada_table, args.work_dir, args.manifest)

    hap_df[[
        'sample_id',
        'target',
        'consensus',
        'reads'
        ]].to_csv(args.out_haps, sep='\t', index=False)

    logging.info('ANOSPP data prep complete')

def main(cmd):

    parser = argparse.ArgumentParser(description='Prepare ANOSPP data for analysis')
    parser.add_argument('-t', '--dada_table', help='DADA2 stats tsv file', required=True)
    parser.add_argument('-a', '--adapters', help='adapters fasta file for deplexing with cutadapt', required=True)
    parser.add_argument('-m', '--manifest', help='ANOSPP samples manifest', required=True)
    parser.add_argument('-u', '--out_haps', help='output haplotypes file', default='haps.tsv')
    parser.add_argument('-w', '--work_dir', help='working directory for intermediate files',
        default='work')
    parser.add_argument('-v', '--verbose',
        help='include INFO level log messages', action='store_true')

    args = parser.parse_args(cmd)

    args.work_dir = args.work_dir.rstrip('/')
    for fn in (args.adapters, args.dada_table, args.manifest):
        assert os.path.isfile(fn), f'{fn} not found'

    prep_dada2(args)

if __name__ == '__main__':
    main(sys.argv[1:])
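
The wide-to-long reshape in `get_hap_df` is the core transformation of the prep step. A self-contained sketch with toy data (column names follow the code above; the values are invented):

```
import pandas as pd

# toy DADA2-style wide table: one row per ASV, one read-count column per
# sample, already joined with deplexing info (target, trimmed sequence)
dada_deplex_df = pd.DataFrame({
    'sequence': ['ACGT', 'GGCC'],
    'target': ['P1', 'P2'],
    'trimmed_sequence': ['CG', 'GC'],
    'sample_A': [10, 0],
    'sample_B': [3, 7],
}, index=['ASV_1', 'ASV_2'])
dada_deplex_df.index.name = 'dada2_id'

# same melt as get_hap_df: sample columns become rows, zero counts dropped
hap_df = pd.melt(dada_deplex_df.reset_index(),
                 id_vars=['dada2_id', 'sequence', 'target', 'trimmed_sequence'],
                 var_name='sample_id',
                 value_name='reads')
hap_df = hap_df[hap_df.reads > 0]
print(hap_df)
# ASV_1 keeps sample_A (10 reads) and sample_B (3 reads);
# ASV_2 keeps only sample_B (7 reads)
```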
environment.yml
@@ -0,0 +1,17 @@
name: anospp_analysis
channels:
  - conda-forge
  - bioconda
  - nodefaults
dependencies:
  # non-python requirements
  - mamba
  - blast=2.12.0
  - fasttree=2.1.10
  - mafft=7.490
  # python requirements should go to pyproject.toml
  # this section only deals with basic poetry installation
  - python=3.10.*
  - pip
  - pip:
    - "poetry>=1.2"