Merge pull request #3 from malariagen/dev_prep
data prep script dev
amakunin authored Mar 28, 2023
2 parents 274eb81 + 6355486 commit 18d546b
Showing 6 changed files with 449 additions and 127 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
.DS_Store
test_data/
*.code-workspace

# Byte-compiled / optimized / DLL files
__pycache__/
66 changes: 51 additions & 15 deletions README.md
@@ -1,36 +1,72 @@
# anospp-analysis
Python package for ANOSPP data analysis

## Installation

TODO

## Documentation

TODO

## Development

### Setup

Installation is hybrid: conda provides the non-Python dependencies and a base Python, while poetry manages the Python packages:
```
git clone git@github.com:malariagen/anospp-analysis.git
cd anospp-analysis
mamba env create -f environment.yml
conda activate anospp_analysis
poetry install
```
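
Here conda supplies Python itself plus the non-Python tools, and `poetry install` then resolves and installs the package's Python deps (and the package itself) into that same environment.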

Activate development environment:
```
poetry shell
```

### Usage & testing

The code in this repository can be accessed via wrapper scripts:
```
python anospp_analysis/qc.py \
    --haplotypes test_data/haplotypes.tsv \
    --samples test_data/samples.csv \
    --stats test_data/stats.tsv \
    --outdir test_data/qc
```

In addition, individual functions are available as an API; see the sketch below.
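
For instance, a minimal API usage sketch, assuming the package modules are importable in the active environment (the function names are taken from `anospp_analysis/prep.py` in this commit; the input paths are placeholders):
```
from anospp_analysis.prep import cutadapt_deplex, get_hap_df

# deplex ASVs by target, then build the long-format haplotypes dataframe
# (dada_table.tsv, adapters.fasta and samples.csv are placeholder paths)
cutadapt_deplex('dada_table.tsv', 'adapters.fasta', work_dir='work')
hap_df = get_hap_df('dada_table.tsv', 'work', 'samples.csv')
print(hap_df.head())
```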

### Adding Python deps

Python dependencies should be introduced via poetry:
```
poetry add package_name
```
This updates both the `pyproject.toml` and `poetry.lock` files.

If the package is needed in the development environment only, use
```
poetry add package_name --dev
```
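
With poetry >= 1.2 (as pinned in `environment.yml`), the equivalent dependency group syntax is `poetry add package_name --group dev`.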

### Adding non-Python deps

Non-Python dependencies should be introduced via conda: edit `environment.yml`,
then re-create the conda environment and re-install the poetry deps:
```
mamba env create -f environment.yml
conda activate anospp_analysis
poetry install
```

Changes in the conda environment might also alter the Python installation,
in which case the poetry lock file should be updated:
```
poetry lock
```
136 changes: 136 additions & 0 deletions anospp_analysis/prep.py
@@ -0,0 +1,136 @@
import pandas as pd
import argparse
import subprocess
import glob
import sys
import os

from util import *

# optimised cutadapt args:
# -O 10 requires at least 10 bp of adapter overlap,
# --match-read-wildcards lets IUPAC wildcards in reads match the adapters
CUTADAPT_ARGS = '-O 10 --match-read-wildcards'

def cutadapt_deplex(dada_table, adapters, cutadapt_args=CUTADAPT_ARGS, work_dir='work'):

    logging.info('cutadapt deplexing started')

    os.makedirs(work_dir, exist_ok=True)

    # generate input fasta from DADA2 table
    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    fasta = f'{work_dir}/input_seqs.fasta'

    with open(fasta, 'w') as outfile:
        for seqid, seq in dada_df['sequence'].items():
            outfile.write(f'>{seqid}\n{seq}\n')

    # demultiplex ASVs by target: cutadapt writes one fasta per adapter name
    cmd = f"cutadapt {cutadapt_args} -g file:{adapters} "
    cmd += f"-o {work_dir}/ASV_{{name}}.fa {fasta}"

    process = subprocess.run(cmd.split(), capture_output=True, text=True)
    process.check_returncode()

    logging.info('cutadapt deplexing complete')

    return 0

def get_deplex_df(deplex_dir):
    '''
    Read demultiplexed fasta into dataframe with
    seqid, target, trimmed sequence
    '''
    logging.info('parsing deplexed sequences')

    deplex_dict = dict()
    # iterate over deplexed fasta files
    for fa in sorted(glob.glob(f'{deplex_dir}/ASV_*.fa')):
        target = fa.split('/')[-1].split('.')[0].split('_')[1]
        if target not in CUTADAPT_TARGETS:
            logging.warning(f'Target {target} not recognised for {fa}, skipping')
            continue
        # basic parser, assumes sequences are single-line as written by cutadapt
        with open(fa) as f:
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    seqid = line[1:]
                else:
                    deplex_dict[seqid] = {'target':target,'trimmed_sequence':line}
        # proper parser
        # for record in SeqIO.parse(fa, format='fasta'):
        #     deplex_dict[record.name] = {'target':target,'trimmed_sequence':str(record.seq)}
    deplex_df = pd.DataFrame(deplex_dict).T
    dup_seqid = deplex_df.index.duplicated()
    if dup_seqid.any():
        raise ValueError(f'Duplicate seqids generated by dada2: {deplex_df.index[dup_seqid]}')
    return deplex_df

def get_hap_df(dada_table, demult_dir, samples_manifest):

    deplex_df = get_deplex_df(demult_dir)

    logging.info('combining dada output with deplexing info')

    dada_df = pd.read_csv(dada_table, sep='\t', index_col=0)

    dada_deplex_df = pd.merge(dada_df, deplex_df, left_index=True, right_index=True)
    dada_deplex_df.index.name = 'dada2_id'
    assert dada_deplex_df.shape[0] == dada_df.shape[0] == deplex_df.shape[0], \
        'Lost some sequences in combining deplexing and original DADA2 data'

    # melt wide per-sample read counts into long format,
    # one row per haplotype per sample
    hap_df = pd.melt(dada_deplex_df.reset_index(),
        id_vars=['dada2_id','sequence','target','trimmed_sequence'],
        var_name='sample_id',
        value_name='reads')
    hap_df = hap_df[hap_df.reads > 0].copy()
    hap_df['target'] = hap_df.target.astype(str)
    hap_df.rename(columns={
        'sequence':'untrimmed_sequence',
        'trimmed_sequence':'consensus'
        }, inplace=True)

    return hap_df

def prep_dada2(args):

    setup_logging(verbose=args.verbose)

    logging.info('ANOSPP data prep started')

    cutadapt_deplex(args.dada_table, args.adapters, CUTADAPT_ARGS, args.work_dir)

    hap_df = get_hap_df(args.dada_table, args.work_dir, args.manifest)

    hap_df[[
        'sample_id',
        'target',
        'consensus',
        'reads'
        ]].to_csv(args.out_haps, sep='\t', index=False)

    logging.info('ANOSPP data prep complete')

def main(cmd):

    parser = argparse.ArgumentParser(description='Prepare ANOSPP data for analysis')
    parser.add_argument('-t', '--dada_table', help='DADA2 table tsv file', required=True)
    parser.add_argument('-a', '--adapters', help='adapters fasta file for deplexing with cutadapt', required=True)
    parser.add_argument('-m', '--manifest', help='ANOSPP samples manifest', required=True)
    parser.add_argument('-u', '--out_haps', help='output haplotypes file', default='haps.tsv')
    parser.add_argument('-w', '--work_dir', help='working directory for intermediate files',
        default='work')
    parser.add_argument('-v', '--verbose',
        help='Include INFO level log messages', action='store_true')

    args = parser.parse_args(cmd)

    args.work_dir = args.work_dir.rstrip('/')
    for fn in (args.adapters, args.dada_table, args.manifest):
        assert os.path.isfile(fn), f'{fn} not found'

    prep_dada2(args)

if __name__ == '__main__':
    main(sys.argv[1:])
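
For reference, a hypothetical invocation of the new prep script, following the wrapper-script pattern from the README (the `test_data/` paths are placeholders, the flags come from the argparse definition above):
```
python anospp_analysis/prep.py \
    --dada_table test_data/dada_table.tsv \
    --adapters test_data/adapters.fasta \
    --manifest test_data/samples.csv \
    --out_haps test_data/haps.tsv \
    --work_dir test_data/work
```
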
17 changes: 17 additions & 0 deletions environment.yml
@@ -0,0 +1,17 @@
name: anospp_analysis
channels:
  - conda-forge
  - bioconda
  - nodefaults
dependencies:
  # non-python requirements
  - mamba
  - blast=2.12.0
  - fasttree=2.1.10
  - mafft=7.490
  # python requirements should go to pyproject.toml
  # this section only deals with basic poetry installation
  - python=3.10.*
  - pip
  - pip:
      - "poetry>=1.2"