Merge pull request #3 from malariagen/dev_prep
data prep script dev
amakunin authored Mar 28, 2023
2 parents 274eb81 + 6355486 commit 18d546b
Showing 6 changed files with 449 additions and 127 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
.DS_Store
test_data/
*.code-workspace

# Byte-compiled / optimized / DLL files
__pycache__/
66 changes: 51 additions & 15 deletions README.md
@@ -1,36 +1,72 @@
# anospp-analysis
Python package for ANOSPP data analysis

## Installation

TODO

## Documentation

TODO

## Development

### Setup

Installation is hybrid: conda provides the non-Python dependencies and a base Python, while poetry manages the Python packages:
```
git clone git@github.com:malariagen/anospp-analysis.git
cd anospp-analysis
mamba env create -f environment.yml
conda activate anospp_analysis
poetry install
```
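
Here conda supplies Python itself plus the non-Python tools, and `poetry install` then resolves and installs the package's Python deps (and the package itself) into that same environment.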

Activate development environment:
```
poetry shell
```

### Usage & testing

The code in this repository can be accessed via wrapper scripts:
```
python anospp_analysis/qc.py \
    --haplotypes test_data/haplotypes.tsv \
    --samples test_data/samples.csv \
    --stats test_data/stats.tsv \
    --outdir test_data/qc
```

In addition, individual functions are available as an API; see the sketch below.
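
For instance, a minimal API usage sketch, assuming the package modules are importable in the active environment (the function names are taken from `anospp_analysis/prep.py` in this commit; the input paths are placeholders):
```
from anospp_analysis.prep import cutadapt_deplex, get_hap_df

# deplex ASVs by target, then build the long-format haplotypes dataframe
# (dada_table.tsv, adapters.fasta and samples.csv are placeholder paths)
cutadapt_deplex('dada_table.tsv', 'adapters.fasta', work_dir='work')
hap_df = get_hap_df('dada_table.tsv', 'work', 'samples.csv')
print(hap_df.head())
```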

### Adding Python deps

Python dependencies should be introduced via poetry:
```
poetry add package_name
```
This updates both the `pyproject.toml` and `poetry.lock` files.

If the package is needed in the development environment only, use
```
poetry add package_name --dev
```
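
With poetry >= 1.2 (as pinned in `environment.yml`), the equivalent dependency group syntax is `poetry add package_name --group dev`.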

### Adding non-Python deps

Non-Python dependencies should be introduced via conda: edit `environment.yml`,
then re-create the conda environment and re-install the poetry deps:
```
mamba env create -f environment.yml
conda activate anospp_analysis
poetry install
```

Changes in the conda environment might also alter the Python installation,
in which case the poetry lock file should be updated:
```
poetry lock
```
136 changes: 136 additions & 0 deletions anospp_analysis/prep.py
@@ -0,0 +1,136 @@
import pandas as pd
import argparse
import subprocess
import glob
import sys
import os

from util import *

# optimised cutadapt args:
# -O 10 requires at least 10 bp of adapter overlap,
# --match-read-wildcards lets IUPAC wildcards in reads match the adapters
CUTADAPT_ARGS = '-O 10 --match-read-wildcards'

def cutadapt_deplex(dada_table, adapters, cutadapt_args=CUTADAPT_ARGS, work_dir='work'):

    logging.info('cutadapt deplexing started')

    os.makedirs(work_dir, exist_ok=True)

    # generate input fasta from DADA2 table
    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    fasta = f'{work_dir}/input_seqs.fasta'

    with open(fasta, 'w') as outfile:
        for seqid, seq in dada_df['sequence'].items():
            outfile.write(f'>{seqid}\n{seq}\n')

    # demultiplex ASVs by target: cutadapt writes one fasta per adapter name
    cmd = f"cutadapt {cutadapt_args} -g file:{adapters} "
    cmd += f"-o {work_dir}/ASV_{{name}}.fa {fasta}"

    process = subprocess.run(cmd.split(), capture_output=True, text=True)
    process.check_returncode()

    logging.info('cutadapt deplexing complete')

    return 0

def get_deplex_df(deplex_dir):
    '''
    Read demultiplexed fasta into dataframe with
    seqid, target, trimmed sequence
    '''
    logging.info('parsing deplexed sequences')

    deplex_dict = dict()
    # iterate over deplexed fasta files
    for fa in sorted(glob.glob(f'{deplex_dir}/ASV_*.fa')):
        target = fa.split('/')[-1].split('.')[0].split('_')[1]
        if target not in CUTADAPT_TARGETS:
            logging.warning(f'Target {target} not recognised for {fa}, skipping')
            continue
        # basic parser, assumes sequences are single-line as written by cutadapt
        with open(fa) as f:
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    seqid = line[1:]
                else:
                    deplex_dict[seqid] = {'target':target,'trimmed_sequence':line}
        # proper parser
        # for record in SeqIO.parse(fa, format='fasta'):
        #     deplex_dict[record.name] = {'target':target,'trimmed_sequence':str(record.seq)}
    deplex_df = pd.DataFrame(deplex_dict).T
    dup_seqid = deplex_df.index.duplicated()
    if dup_seqid.any():
        raise ValueError(f'Duplicate seqids generated by dada2: {deplex_df.index[dup_seqid]}')
    return deplex_df

def get_hap_df(dada_table, demult_dir, samples_manifest):

    deplex_df = get_deplex_df(demult_dir)

    logging.info('combining dada output with deplexing info')

    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    dada_deplex_df = pd.merge(dada_df, deplex_df, left_index=True, right_index=True)
    dada_deplex_df.index.name = 'dada2_id'
    assert dada_deplex_df.shape[0] == dada_df.shape[0] == deplex_df.shape[0], \
        'Lost some sequences in combining deplexing and original DADA2 data'

    # melt wide per-sample read counts into long format,
    # one row per haplotype per sample
    hap_df = pd.melt(dada_deplex_df.reset_index(),
        id_vars=['dada2_id','sequence','target','trimmed_sequence'],
        var_name='sample_id',
        value_name='reads')
    hap_df = hap_df[hap_df.reads > 0].copy()
    hap_df['target'] = hap_df.target.astype(str)
    hap_df.rename(columns={
        'sequence':'untrimmed_sequence',
        'trimmed_sequence':'consensus'
        }, inplace=True)

    return hap_df

def prep_dada2(args):

    setup_logging(verbose=args.verbose)

    logging.info('ANOSPP data prep started')

    cutadapt_deplex(args.dada_table, args.adapters, CUTADAPT_ARGS, args.work_dir)

    hap_df = get_hap_df(args.dada_table, args.work_dir, args.manifest)

    hap_df[[
        'sample_id',
        'target',
        'consensus',
        'reads'
        ]].to_csv(args.out_haps, sep='\t', index=False)

    logging.info('ANOSPP data prep complete')

def main(cmd):

    parser = argparse.ArgumentParser(description='Prepare ANOSPP data for analysis')
    parser.add_argument('-t', '--dada_table', help='DADA2 table tsv file', required=True)
    parser.add_argument('-a', '--adapters', help='adapters fasta file for deplexing with cutadapt', required=True)
    parser.add_argument('-m', '--manifest', help='ANOSPP samples manifest', required=True)
    parser.add_argument('-u', '--out_haps', help='output haplotypes file', default='haps.tsv')
    parser.add_argument('-w', '--work_dir', help='working directory for intermediate files',
        default='work')
    parser.add_argument('-v', '--verbose',
        help='Include INFO level log messages', action='store_true')

    args = parser.parse_args(cmd)

    args.work_dir = args.work_dir.rstrip('/')
    for fn in (args.adapters, args.dada_table, args.manifest):
        assert os.path.isfile(fn), f'{fn} not found'

    prep_dada2(args)

if __name__ == '__main__':
    main(sys.argv[1:])
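
For reference, a hypothetical invocation of the new prep script, following the wrapper-script pattern from the README (the `test_data/` paths are placeholders, the flags come from the argparse definition above):
```
python anospp_analysis/prep.py \
    --dada_table test_data/dada_table.tsv \
    --adapters test_data/adapters.fasta \
    --manifest test_data/samples.csv \
    --out_haps test_data/haps.tsv \
    --work_dir test_data/work
```
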
17 changes: 17 additions & 0 deletions environment.yml
@@ -0,0 +1,17 @@
name: anospp_analysis
channels:
  - conda-forge
  - bioconda
  - nodefaults
dependencies:
  # non-python requirements
  - mamba
  - blast=2.12.0
  - fasttree=2.1.10
  - mafft=7.490
  # python requirements should go to pyproject.toml
  # this section only deals with basic poetry installation
  - python=3.10.*
  - pip
  - pip:
      - "poetry>=1.2"