forked from nf-core/variantbenchmarking
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
81ff2a1
commit 3deb09d
Showing
3 changed files
with
106 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,35 +1,117 @@ | ||
#!/usr/bin/env python | ||
# Copyright 2022 - Barcelona Supercomputing Center | ||
# Author: Rodrigo Martin | ||
# BSC Dual License | ||
|
||
# Copyright 2024 - GHGA | ||
# Author: Kuebra Narci | ||
''' | ||
Generates a CSV file from an input VCF file | ||
Generates a CSV file from a VCF | ||
Expected usage: | ||
$ python vcf_to_csv.py <vcf_file> <output_file> | ||
$ python vcf_to_csv.py <vcf_file> <output> | ||
Use --help for more information. | ||
''' | ||
import sys | ||
from argparse import ArgumentParser | ||
import re | ||
|
||
import csv | ||
|
||
def parse_info_field(info): | ||
"""Parse the INFO field of a VCF line into a dictionary.""" | ||
info_dict = {} | ||
for entry in info.split(';'): | ||
key_value = entry.split('=') | ||
if len(key_value) == 2: | ||
info_dict[key_value[0]] = key_value[1] | ||
else: | ||
info_dict[key_value[0]] = True | ||
return info_dict | ||
|
||
def extract_gt_from_sample(sample, format_field): | ||
"""Extract GT value from the sample field.""" | ||
format_fields = format_field.split(':') | ||
sample_values = sample.split(':') | ||
if 'GT' in format_fields: | ||
gt_index = format_fields.index('GT') | ||
return sample_values[gt_index] | ||
return './.' # Default GT value if not present | ||
|
||
def vcf_to_csv(vcf_file, csv_file): | ||
"""Convert a VCF file to a CSV file.""" | ||
with open(vcf_file, 'r') as vcf: | ||
headers = [] | ||
sample_headers = [] | ||
records = [] | ||
include_supp_vec = False | ||
include_supp = False | ||
include_type_inferred = False | ||
include_svtype = False | ||
include_svlen = False | ||
|
||
for line in vcf: | ||
if line.startswith('##'): | ||
continue # Skip meta-information lines | ||
elif line.startswith('#'): | ||
headers = line[1:].strip().split('\t') | ||
sample_headers = headers[9:] # The sample headers start from the 10th column | ||
else: | ||
row = line.strip().split('\t') | ||
info_dict = parse_info_field(row[7]) | ||
|
||
# Check for SUPP_VEC, SUPP, type_inferred, SVTYPE, and SVLEN in the INFO field | ||
if 'SUPP_VEC' in info_dict: | ||
include_supp_vec = True | ||
if 'SUPP' in info_dict: | ||
include_supp = True | ||
if 'type_inferred' in info_dict: | ||
include_type_inferred = True | ||
if 'SVTYPE' in info_dict: | ||
include_svtype = True | ||
if 'SVLEN' in info_dict: | ||
include_svlen = True | ||
|
||
records.append((row, info_dict)) | ||
|
||
# Write the header with optional fields | ||
headers_to_write = headers[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER | ||
if include_supp_vec: | ||
headers_to_write.append("SUPP_VEC") | ||
if include_supp: | ||
headers_to_write.append("SUPP") | ||
if include_type_inferred: | ||
headers_to_write.append("type_inferred") | ||
if include_svtype: | ||
headers_to_write.append("SVTYPE") | ||
if include_svlen: | ||
headers_to_write.append("SVLEN") | ||
headers_to_write.extend([f'{sample}_GT' for sample in sample_headers]) | ||
|
||
with open(csv_file, 'w', newline='') as csvf: | ||
csv_writer = csv.writer(csvf) | ||
csv_writer.writerow(headers_to_write) | ||
|
||
for row, info_dict in records: | ||
row_to_write = row[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER | ||
if include_supp_vec: | ||
row_to_write.append(info_dict.get('SUPP_VEC', '')) | ||
if include_supp: | ||
row_to_write.append(info_dict.get('SUPP', '')) | ||
if include_type_inferred: | ||
row_to_write.append(info_dict.get('type_inferred', '')) | ||
if include_svtype: | ||
row_to_write.append(info_dict.get('SVTYPE', '')) | ||
if include_svlen: | ||
row_to_write.append(info_dict.get('SVLEN', '')) | ||
format_field = row[8] | ||
gt_values = [extract_gt_from_sample(sample, format_field) for sample in row[9:]] | ||
row_to_write.extend(gt_values) | ||
|
||
csv_writer.writerow(row_to_write) | ||
|
||
if __name__ == '__main__': | ||
import os | ||
import sys | ||
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/') | ||
from variant_extractor import VariantExtractor | ||
|
||
# Parse arguments | ||
parser = ArgumentParser(description='Generate CSV file from a VCF file') | ||
parser = ArgumentParser(description='Generates a CSV file from a VCF') | ||
parser.add_argument('vcf_file', help='VCF file') | ||
parser.add_argument('output_file', help='Output file') | ||
parser.add_argument('-f', '--fasta-ref', help='FASTA reference file') | ||
parser.add_argument('output', help='Output CSV file') | ||
args = parser.parse_args() | ||
|
||
variants = [] | ||
|
||
print(f'Reading VCF file: {args.vcf_file}') | ||
extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta_ref) | ||
df = extractor.to_dataframe() | ||
# Insert id column in the first position | ||
df.insert(0, 'id', '') | ||
df['id'] = df['variant_record_obj'].apply(lambda x: x.id) | ||
df.drop(['variant_record_obj'], axis=1, inplace=True) | ||
df.to_csv(f'{args.output_file}', index=False) | ||
vcf_to_csv(args.vcf_file, args.output) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters