Merge pull request #8 from opentargets/js29
Changes for genetics R8
Jeremy37 authored Mar 23, 2022
2 parents 3504fb4 + 0a3fe0d commit ed71469
Showing 23 changed files with 435 additions and 159 deletions.
3 changes: 3 additions & 0 deletions 0_plink_extract.sh
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

plink --bfile $1 --out $2 --make-bed --allow-extra-chr --extract range <(echo -e "$3")
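
plink's --extract range expects a file of "chrom start end set_label" rows, and the <(echo -e "$3") process substitution supplies one without writing a temporary file. A minimal sketch of driving this wrapper from Python, mirroring the call in 0_split_ld_reference.py below; the paths are placeholders:

import subprocess as sp

# 'range1' is just the set label plink requires as the fourth field
range_str = '1 0 3000000 range1'
cmd = "/bin/bash 0_plink_extract.sh ref/chr1 ref/chr1.0_3000000 '{}'".format(range_str)
cp = sp.run(cmd, shell=True)  # /bin/bash is needed for <(...) to work
if cp.returncode != 0:
    print('plink failed:\n' + cmd)
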
68 changes: 68 additions & 0 deletions 0_split_ld_reference.py
@@ -0,0 +1,68 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Splits the LD reference panel into multiple subfiles, to greatly speed up
# GCTA-cojo. Each subfile is a 3-Mb window, so that for any top_loci variant,
# we can use a subfile that has the top_loci variant +- 1 Mb.

import pandas as pd
import os
import subprocess as sp
import argparse


def main():

# Parse args
args = parse_args()

window_size = int(3e6)
window_spacing = int(1e6)

chrom_lengths = pd.read_csv('configs/grch38_chrom_lengths.tsv', sep='\t')

# Loop through each chromosome and use plink to split the ref panel into
# overlapping windows
for index, row in chrom_lengths.iterrows():
chr_ld_path = args.path.format(chrom=row['chrom'])
print(chr_ld_path)

window_start = int(0)
while (window_start + window_size - window_spacing) < row['length']:
# Define window and output path for subfile of main LD file
window_end = window_start + window_size
out_ld_path = chr_ld_path + '.{:d}_{:d}'.format(window_start, window_end)
print("chr_ld_path:" + chr_ld_path)
print("out_ld_path:" + out_ld_path)
# plink requires a file to define the range to extract
# We don't want a temp file, so we use bash process substitution <(...)
range_str = ' '.join([str(row['chrom']), str(window_start), str(window_end), 'range1'])
cmd = '/bin/bash 0_plink_extract.sh {} {} \'{}\''.format(chr_ld_path, out_ld_path, range_str)
print(cmd)

            # Run plink once via subprocess so the exit code can be checked
            cp = sp.run(cmd, shell=True, stderr=sp.STDOUT)
if cp.returncode != 0:
print('Failed on plink command:\n{}'.format(cmd))
#return cp.returncode

window_start = window_start + window_spacing

return 0


def parse_args():
""" Load command line args """
parser = argparse.ArgumentParser()
parser.add_argument('--path',
metavar="<string>",
help='Path to LD reference; {chrom} in place of each chromosome name',
type=str,
required=True)
args = parser.parse_args()
return args


if __name__ == '__main__':
main()
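
The header comment's windowing scheme implies a simple lookup rule for consumers of these subfiles: any position is covered with at least 1 Mb of flank by the window starting one spacing below its megabase bin. A minimal sketch under that assumption; window_for_variant is a hypothetical helper, not part of this commit:

def window_for_variant(chr_ld_path, pos, chrom_length,
                       spacing=int(1e6), size=int(3e6)):
    # Window starts are multiples of `spacing`; the window starting one
    # spacing below the variant's megabase bin keeps pos +- 1 Mb inside it.
    # Clamp to the last window the loop above actually creates.
    last_start = ((chrom_length - (size - spacing) - 1) // spacing) * spacing
    window_start = min(max(0, (pos // spacing - 1) * spacing), last_start)
    return '{}.{:d}_{:d}'.format(chr_ld_path, window_start, window_start + size)

# e.g. a variant at 5,500,000 on a 248,956,422-bp chromosome maps to the
# subfile '<chr_ld_path>.4000000_7000000', giving 4.5-6.5 Mb of context.
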
4 changes: 2 additions & 2 deletions 1_scan_input_parquets.py
@@ -31,8 +31,8 @@ def main():
gwas_pval_threshold = 5e-8

# Paths
gwas_pattern = '/home/js29/genetics-finemapping/data/filtered/significant_window_2mb/*/gwas/*.parquet'
mol_pattern = '/home/js29/genetics-finemapping/data/filtered/significant_window_2mb/*/molecular_trait/*.parquet'
gwas_pattern = '/home/js29/genetics-finemapping/data/filtered/significant_window_2mb/gwas/*.parquet'
mol_pattern = '/home/js29/genetics-finemapping/data/filtered/significant_window_2mb/molecular_trait/*.parquet'
out_path = '/home/js29/genetics-finemapping/tmp/filtered_input'

# Load GWAS dfs
19 changes: 3 additions & 16 deletions 2_make_manifest.py
@@ -5,38 +5,25 @@
#

import json
import os
from pprint import pprint
import pandas as pd
from numpy import nan
from glob import glob
import gzip

def main():

# Args
input_pattern = '/home/ubuntu/results/finemapping/tmp/filtered_input/*.json.gz'
out_json = 'configs/manifest.json.gz'
valid_chrom = set([str(chrom) for chrom in range(1, 23)])
method = 'conditional'

# Path patterns (local)
#root = '/Users/em21/Projects/genetics-finemapping'
# out_path = root + '/output/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
# log_path = root + '/logs/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
# tmp_path = root + '/tmp/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
# ld_ref = '/Users/em21/Projects/reference_data/uk10k_2019Feb/3_liftover_to_GRCh38/output/{chrom}.ALSPAC_TWINSUK.maf01.beagle.csq.shapeit.20131101'

# Path patterns (server)
# root = '/home/ubuntu/results/finemapping'
# ld_ref = '/home/ubuntu/data/genotypes/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
root = '/home/js29/genetics-finemapping'
input_pattern = root + '/tmp/filtered_input/*.json.gz'
out_path = root + '/output/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
log_path = root + '/logs/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
tmp_path = root + '/tmp/study_id={0}/phenotype_id={1}/bio_feature={2}/chrom={3}'
#ld_ref = root + '/data/1000Genomes_phase3/EUR/EUR.{chrom}.1000Gp3.20130502'
ld_ref = root + '/data/ukb_v3_downsampled10k/ukb_v3_chr{chrom}.downsampled10k'

# In the base folder rather than genetics-finemapping, for sharing with the coloc pipeline
ld_ref = '/home/js29/data/ukb_v3_downsampled10k/ukb_v3_chr{chrom}.downsampled10k'

# Create manifest
manifest = []
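
Judging by the path patterns above and the fields consumed in 3_make_commands.py below, each manifest record presumably carries at least the following keys; a purely illustrative example, with every value a placeholder:

manifest_record = {
    'study_id': 'STUDY_0001',
    'phenotype_id': 'ENSG00000123456',  # molecular traits only
    'bio_feature': 'BLOOD',
    'chrom': '1',
    'in_pq': '<input parquet>',                 # passed as --pq downstream
    'in_ld': '<ld_ref with {chrom} resolved>',  # passed as --ld downstream
}
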
1 change: 1 addition & 0 deletions 3_make_commands.py
@@ -45,6 +45,7 @@ def main():
os.path.abspath(script),
'--pq', os.path.abspath(rec['in_pq']),
'--ld', os.path.abspath(rec['in_ld']),
'--split_ld',
'--config_file', os.path.abspath(analysis_config),
'--study_id', rec['study_id'],
'--phenotype_id', rec['phenotype_id'],
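
With --split_ld threaded through, a fully assembled command presumably resembles the sketch below; the script path and all values are placeholders:

cmd = ' '.join([
    '/abs/path/to/finemap_single_study.py',  # placeholder for `script`
    '--pq', '/abs/path/to/sumstats.parquet',
    '--ld', '/abs/path/to/ukb_v3_chr1.downsampled10k',
    '--split_ld',                            # new in this commit
    '--config_file', '/abs/path/to/analysis.config.yaml',
    '--study_id', 'STUDY_0001',
    '--phenotype_id', 'ENSG00000123456',
])
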
6 changes: 2 additions & 4 deletions 5_combine_results.py
@@ -13,12 +13,12 @@
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *
import os
from shutil import copyfile
from glob import glob
import yaml
import subprocess as sp


def main():

# Make spark session
@@ -37,9 +37,7 @@ def main():
root = '/home/js29/genetics-finemapping'
in_top_loci_pattern = root + '/output/study_id=*/phenotype_id=*/bio_feature=*/chrom=*/top_loci.json.gz'
in_credset_pattern = root + '/output/study_id=*/phenotype_id=*/bio_feature=*/chrom=*/credible_set.json.gz'
#in_top_loci_pattern = root + '/output/study_id=*/phenotype_id=*/bio_feature=*/chrom=21/top_loci.json.gz'
#in_credset_pattern = root + '/output/study_id=*/phenotype_id=*/bio_feature=*/chrom=21/credible_set.json.gz'


out_top_loci = root + '/results/top_loci'
out_credset = root + '/results/credset'

7 changes: 4 additions & 3 deletions 6_copy_results_to_gcs.sh
@@ -1,11 +1,12 @@
#!/usr/bin/env bash
#

version_date=`date +%y%m%d`
version_date=$1

# Copy results
gsutil -m rsync -r $HOME/genetics-finemapping/results/ gs://genetics-portal-dev-staging/finemapping/$version_date

# Tar the logs and copy over
tar -zcvf logs.tar.gz $HOME/genetics-finemapping/logs
gsutil -m cp logs.tar.gz gs://genetics-portal-dev-staging/finemapping/$version_date/logs.tar.gz
# This can take a very long time, so you may not want to keep the logs at all
#tar -zcvf logs.tar.gz $HOME/genetics-finemapping/logs
#gsutil -m cp logs.tar.gz gs://genetics-portal-dev-staging/finemapping/$version_date/logs.tar.gz
106 changes: 102 additions & 4 deletions 7_merge_finemap_results.py
@@ -3,8 +3,12 @@
#
# Jeremy Schwartzentruber
#
import os
import argparse
import pandas as pd
import pyspark.sql
from pyspark.sql.functions import *
import subprocess as sp

def main():
# Don't merge top_loci in this way, because it causes all loci to have the same
@@ -25,6 +29,7 @@ def main():
# compression='gzip',
# double_precision=15
#)
args = parse_args()

spark = (
pyspark.sql.SparkSession.builder
@@ -33,18 +38,111 @@
.getOrCreate()
)

in_credset_pattern = 'finemapping_to_merge/*/credset'
out_credset = 'finemapping_merged/credset'
credset = spark.read.json(in_credset_pattern)
# Read in the top loci
top_loci_prev = spark.read.json(os.path.join(args.prev_results, 'top_loci.json.gz'))
top_loci_new = spark.read.json(os.path.join(args.new_results, 'top_loci.json.gz'))

nrows_prev_start = top_loci_prev.count()
print(f'Rows in previous top_loci: {nrows_prev_start}')
print('Rows in new top_loci: {0}'.format(top_loci_new.count()))

if args.remove_previous_finngen:
# Remove FinnGen rows from the previous results
top_loci_prev = top_loci_prev.filter(~col('study_id').contains('FINNGEN'))
nrows_prev_end = top_loci_prev.count()
print('FinnGen rows removed from previous top_loci: {0}'.format(nrows_prev_start - nrows_prev_end))

top_loci = top_loci_prev.unionByName(top_loci_new, allowMissingColumns=True)

# Write out the top loci
print('Writing top_loci rows: {0}'.format(top_loci.count()))
top_loci_folder = os.path.join(args.output, 'top_loci')
(
top_loci
.write.json(top_loci_folder,
compression='gzip',
mode='overwrite')
)
cmd = 'zcat {0}/part*.json.gz | gzip > {0}.json.gz'.format(top_loci_folder)
cp = sp.run(cmd, shell=True, stderr=sp.STDOUT)

# copyfile(
# glob(os.path.join(args.output, 'top_loci') + '/part-*.json.gz')[0],
# os.path.join(args.output, 'top_loci.json.gz')
# )

# Read in the credsets
credset_prev = spark.read.json(os.path.join(args.prev_results, 'credset'))
credset_new = spark.read.json(os.path.join(args.new_results, 'credset'))

nrows_prev_start = credset_prev.count()
print(f'Rows in previous credsets: {nrows_prev_start}')
print('Rows in new credsets: {0}'.format(credset_new.count()))

if args.remove_previous_finngen:
# Remove FinnGen rows from the previous results
credset_prev = credset_prev.filter(~col('study_id').contains('FINNGEN'))
nrows_prev_end = credset_prev.count()
print('FinnGen rows removed from previous credsets: {0}'.format(nrows_prev_start - nrows_prev_end))

credset = credset_prev.unionByName(credset_new, allowMissingColumns=True)

# Write out the credsets
print('Writing credset rows: {0}'.format(credset.count()))
(
credset
.repartitionByRange('lead_chrom', 'lead_pos')
.sortWithinPartitions('lead_chrom', 'lead_pos')
.write.json(out_credset,
.write.json(os.path.join(args.output, 'credset'),
compression='gzip',
mode='overwrite')
)

# in_credset_pattern = 'finemapping_to_merge/*/credset'
# out_credset = 'finemapping_merged/credset'
# credset = spark.read.json(in_credset_pattern)
# (
# credset
# .repartitionByRange('lead_chrom', 'lead_pos')
# .sortWithinPartitions('lead_chrom', 'lead_pos')
# .write.json(out_credset,
# compression='gzip',
# mode='overwrite')
# )


def parse_args():
''' Load command line args
'''
p = argparse.ArgumentParser()

# Add input files
p.add_argument('--prev_results',
metavar="<file>",
help=("Input: previous fine-mapping results folder"),
type=str,
required=True)

p.add_argument('--new_results',
metavar="<file>",
help=("Input: new fine-mapping results folder"),
type=str,
required=True)

p.add_argument('--output',
metavar="<file>",
help=("Output root folder"),
type=str,
required=True)

p.add_argument('--remove_previous_finngen',
help=("If set, remove FinnGen rows from the previous results"),
action='store_true')

args = p.parse_args()

return args


if __name__ == '__main__':

    main()
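
The merge hinges on unionByName(..., allowMissingColumns=True), available in Spark 3.1 and later: columns are matched by name, and a column present in only one of the two releases is null-filled in the other rather than raising an error. A self-contained sketch with made-up column names:

import pyspark.sql

spark = pyspark.sql.SparkSession.builder.getOrCreate()
prev = spark.createDataFrame([('GCST001', 10)], ['study_id', 'num_tags'])
new = spark.createDataFrame([('FINNGEN_R6_AB1', 12, 0.99)],
                            ['study_id', 'num_tags', 'postprob'])
# Rows from `prev` get null in the 'postprob' column
merged = prev.unionByName(new, allowMissingColumns=True)
merged.show()

# Typical invocation of the script above (placeholder paths):
#   python 7_merge_finemap_results.py --prev_results prev_results/ \
#       --new_results new_results/ --output merged/ --remove_previous_finngen
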
2 changes: 2 additions & 0 deletions 7a_credset_remove_finngen.py
@@ -3,6 +3,8 @@
#
# Jeremy Schwartzentruber
#
# This code was used for genetics R6 and is no longer needed.
#
import pandas as pd
import pyspark.sql

26 changes: 26 additions & 0 deletions configs/grch38_chrom_lengths.tsv
@@ -0,0 +1,26 @@
chrom length
1 248956422
2 242193529
3 198295559
4 190214555
5 181538259
6 170805979
7 159345973
8 145138636
9 138394717
10 133797422
11 135086622
12 133275309
13 114364328
14 107043718
15 101991189
16 90338345
17 83257441
18 80373285
19 58617616
20 64444167
21 46709983
22 50818468
X 156040895
Y 57227415
MT 16569
1 change: 1 addition & 0 deletions environment.yaml
@@ -11,3 +11,4 @@ dependencies:
- numpy
- python-snappy
- pyspark
- jq
7 changes: 5 additions & 2 deletions finemapping/credible_set.py
@@ -23,7 +23,8 @@ def run_credible_set_for_locus(
cojo_collinear,
pp_threshold,
method='conditional',
logger=None):
logger=None,
split_ld=False):
    ''' Run credible set analysis at a given locus (specified by index_info)
Args:
index_info (dict): index locus information
@@ -71,7 +72,9 @@
cond_list,
cojo_window,
cojo_collinear,
logger=logger
logger=logger,
split_ld=split_ld,
var_pos=index_info['pos']
)
else:
sumstat_wind = sumstats
(Diffs for the remaining 11 changed files are not shown.)
