Skip to content

Commit

Permalink
vcf_to_csv.py changed
Browse files Browse the repository at this point in the history
  • Loading branch information
kubranarci committed Jul 8, 2024
1 parent 81ff2a1 commit 3deb09d
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 30 deletions.
126 changes: 104 additions & 22 deletions bin/vcf_to_csv.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,117 @@
#!/usr/bin/env python
# Copyright 2022 - Barcelona Supercomputing Center
# Author: Rodrigo Martin
# BSC Dual License

# Copyright 2024 - GHGA
# Author: Kuebra Narci
'''
Generates a CSV file from an input VCF file.
Expected usage:
$ python vcf_to_csv.py <vcf_file> <output>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser
import re

import csv

def parse_info_field(info):
    """Parse the INFO column of a VCF record into a dictionary.

    Args:
        info: Raw INFO string: semicolon-separated ``KEY=VALUE`` pairs and
            bare ``FLAG`` keys.

    Returns:
        A dict mapping each key to its value string, or to ``True`` for
        flag entries (those without ``=``). An empty string or the VCF
        missing-value marker ``.`` yields an empty dict.
    """
    info_dict = {}
    if info == '.':
        # '.' is the VCF marker for "no INFO present", not a flag named '.'.
        return info_dict
    for entry in info.split(';'):
        if not entry:
            # A stray or trailing ';' produces empty entries that carry no key.
            continue
        # Split on the first '=' only, so a value that itself contains '='
        # is kept intact instead of being mistaken for a flag.
        key, sep, value = entry.partition('=')
        info_dict[key] = value if sep else True
    return info_dict

def extract_gt_from_sample(sample, format_field):
    """Return the genotype (GT) value of a single sample column.

    Args:
        sample: Colon-separated per-sample values from a VCF record.
        format_field: Colon-separated FORMAT keys describing ``sample``.

    Returns:
        The value at the position of ``GT`` in ``format_field``, or
        ``'./.'`` when FORMAT has no ``GT`` key or the sample column has
        fewer values than the GT position requires.
    """
    format_keys = format_field.split(':')
    try:
        gt_index = format_keys.index('GT')
    except ValueError:
        return './.'  # Default GT value if not present

    sample_values = sample.split(':')
    if gt_index >= len(sample_values):
        # Truncated sample column: fall back to the same default instead of
        # raising IndexError.
        return './.'
    return sample_values[gt_index]

def vcf_to_csv(vcf_file, csv_file):
    """Convert a VCF file to a CSV file.

    The CSV contains the first seven VCF columns of the header line,
    followed by those of the INFO keys SUPP_VEC, SUPP, type_inferred,
    SVTYPE and SVLEN that occur in at least one record (in that order),
    followed by one ``<sample>_GT`` column per sample named in the header.

    The whole VCF is read before anything is written, because the set of
    optional INFO columns is only known once every record has been seen.

    Args:
        vcf_file: Path of the tab-separated VCF file to read.
        csv_file: Path of the CSV file to write (overwritten if present).
    """
    # Order here defines the order of the optional columns in the CSV.
    optional_info_keys = ('SUPP_VEC', 'SUPP', 'type_inferred', 'SVTYPE', 'SVLEN')

    headers = []
    sample_headers = []
    records = []
    seen_info_keys = set()

    with open(vcf_file, 'r') as vcf:
        for line in vcf:
            if line.startswith('##'):
                continue  # Skip meta-information lines
            if line.startswith('#'):
                headers = line[1:].strip().split('\t')
                sample_headers = headers[9:]  # The sample headers start from the 10th column
                continue

            stripped = line.strip()
            if not stripped:
                continue  # Blank lines carry no record

            row = stripped.split('\t')
            # Guard against short rows instead of raising IndexError.
            info_dict = parse_info_field(row[7]) if len(row) > 7 else {}
            seen_info_keys.update(key for key in optional_info_keys if key in info_dict)
            records.append((row, info_dict))

    info_columns = [key for key in optional_info_keys if key in seen_info_keys]

    # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER from the fixed columns.
    headers_to_write = headers[:7]
    headers_to_write.extend(info_columns)
    headers_to_write.extend([f'{sample}_GT' for sample in sample_headers])

    with open(csv_file, 'w', newline='') as csvf:
        csv_writer = csv.writer(csvf)
        csv_writer.writerow(headers_to_write)

        for row, info_dict in records:
            row_to_write = row[:7]
            row_to_write.extend(info_dict.get(key, '') for key in info_columns)
            # Sites-only records have no FORMAT/sample columns; emit no GT
            # values for them rather than failing on row[8].
            if len(row) > 9:
                format_field = row[8]
                row_to_write.extend(
                    extract_gt_from_sample(sample, format_field) for sample in row[9:]
                )
            csv_writer.writerow(row_to_write)

if __name__ == '__main__':
    # Command-line entry point: vcf_to_csv.py <vcf_file> <output>
    #
    # A single parser with exactly the two documented positionals is built;
    # the conversion is done once, by vcf_to_csv().

    # Parse arguments
    parser = ArgumentParser(description='Generates a CSV file from a VCF')
    parser.add_argument('vcf_file', help='VCF file')
    parser.add_argument('output', help='Output CSV file')
    # Still accepted so invocations that pass -f do not fail; the value is
    # not used by the conversion.
    parser.add_argument('-f', '--fasta-ref', help='FASTA reference file (ignored)')
    args = parser.parse_args()

    print(f'Reading VCF file: {args.vcf_file}')
    vcf_to_csv(args.vcf_file, args.output)
6 changes: 1 addition & 5 deletions modules/local/vcf_to_csv/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ process VCF_TO_CSV {

input:
tuple val(meta), path(input)
tuple val(meta2), path(fasta)
tuple val(meta3), path(fasta_fai)

output:
tuple val(meta), path("*.csv") , emit: output
Expand All @@ -25,9 +23,7 @@ process VCF_TO_CSV {
"""
vcf_to_csv.py \\
$input \\
${prefix}.csv \\
-f $fasta
${prefix}.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 1 addition & 3 deletions subworkflows/local/compare_benchmark_results.nf
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,7 @@ workflow COMPARE_BENCHMARK_RESULTS {
merged_vcfs = merged_vcfs.mix(SURVIVOR_MERGE.out.vcf)

VCF_TO_CSV(
merged_vcfs,
fasta,
fai
merged_vcfs
)
versions = versions.mix(VCF_TO_CSV.out.versions)

Expand Down

0 comments on commit 3deb09d

Please sign in to comment.