vcf_to_csv.py changed

EladH1 · Jul 8, 2024 · 3deb09d
1 parent 81ff2a1
commit 3deb09d
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 30 deletions.
diff --git a/bin/vcf_to_csv.py b/bin/vcf_to_csv.py
@@ -1,35 +1,117 @@
 #!/usr/bin/env python
-# Copyright 2022 - Barcelona Supercomputing Center
-# Author: Rodrigo Martin
-# BSC Dual License
+
+# Copyright 2024 - GHGA
+# Author: Kuebra Narci
 '''
-Generates a CSV file from an input VCF file
+Generates a CSV file from a VCF
 Expected usage:
-    $ python vcf_to_csv.py <vcf_file> <output_file>
+    $ python vcf_to_csv.py <vcf_file> <output>
 Use --help for more information.
 '''
+import sys
 from argparse import ArgumentParser
+import re
+
+import csv
+
+def parse_info_field(info):
+    """Parse the INFO field of a VCF line into a dictionary.
+
+    Args:
+        info: Raw INFO string: ';'-separated entries that are either
+            ``KEY=VALUE`` pairs or bare flag names.
+
+    Returns:
+        A dict mapping each key to its value string, or to ``True`` for
+        entries without '='. Empty entries and the '.' placeholder are
+        ignored.
+    """
+    info_dict = {}
+    for entry in info.split(';'):
+        if not entry or entry == '.':
+            continue  # stray ';' or an empty INFO column
+        # Split on the first '=' only, so a value that itself contains '='
+        # is kept intact instead of being mistaken for a flag.
+        key, sep, value = entry.partition('=')
+        info_dict[key] = value if sep else True
+    return info_dict
+
+def extract_gt_from_sample(sample, format_field):
+    """Extract the GT value from a sample field.
+
+    Args:
+        sample: ':'-separated sample column of a VCF record.
+        format_field: ':'-separated FORMAT column naming the sub-fields.
+
+    Returns:
+        The sample sub-field at the position of 'GT' in FORMAT, or './.'
+        when FORMAT has no GT or the sample column is too short to hold it.
+    """
+    try:
+        gt_index = format_field.split(':').index('GT')
+    except ValueError:
+        return './.'  # Default GT value if not present
+    sample_values = sample.split(':')
+    if gt_index >= len(sample_values):
+        return './.'  # sample column has fewer sub-fields than FORMAT
+    return sample_values[gt_index]
+
+def vcf_to_csv(vcf_file, csv_file):
+    """Convert a VCF file to a CSV file.
+
+    The CSV holds the first seven VCF columns, then whichever of the INFO
+    keys SUPP_VEC, SUPP, type_inferred, SVTYPE and SVLEN occur in at least
+    one record, then one ``<sample>_GT`` column per sample.
+
+    Args:
+        vcf_file: Path of the VCF file to read.
+        csv_file: Path of the CSV file to write.
+
+    Raises:
+        ValueError: If a record has fewer than eight tab-separated columns.
+    """
+    # INFO keys that may become CSV columns, in output order.
+    optional_keys = ('SUPP_VEC', 'SUPP', 'type_inferred', 'SVTYPE', 'SVLEN')
+
+    headers = []
+    sample_headers = []
+    records = []
+    seen_keys = set()
+
+    # Records are buffered because the set of optional columns is only
+    # known once every record has been inspected.
+    with open(vcf_file, 'r') as vcf:
+        for line_number, line in enumerate(vcf, start=1):
+            if line.startswith('##'):
+                continue  # Skip meta-information lines
+            if line.startswith('#'):
+                headers = line[1:].strip().split('\t')
+                sample_headers = headers[9:]  # The sample headers start from the 10th column
+                continue
+            if not line.strip():
+                continue  # Tolerate blank lines instead of crashing on them
+
+            row = line.strip().split('\t')
+            if len(row) < 8:
+                raise ValueError(
+                    f'{vcf_file}: line {line_number} has {len(row)} columns, expected at least 8'
+                )
+            info_dict = parse_info_field(row[7])
+            seen_keys.update(key for key in optional_keys if key in info_dict)
+            records.append((row, info_dict))
+
+    # Write the header with optional fields
+    included_keys = [key for key in optional_keys if key in seen_keys]
+    headers_to_write = headers[:7]  # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER
+    headers_to_write.extend(included_keys)
+    headers_to_write.extend([f'{sample}_GT' for sample in sample_headers])
+
+    with open(csv_file, 'w', newline='') as csvf:
+        csv_writer = csv.writer(csvf)
+        csv_writer.writerow(headers_to_write)
+
+        for row, info_dict in records:
+            row_to_write = row[:7]  # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER
+            row_to_write.extend(info_dict.get(key, '') for key in included_keys)
+            # A sites-only record stops after INFO: no FORMAT, no samples.
+            format_field = row[8] if len(row) > 8 else ''
+            row_to_write.extend(
+                extract_gt_from_sample(sample, format_field) for sample in row[9:]
+            )
+            csv_writer.writerow(row_to_write)
 
 if __name__ == '__main__':
-    import os
-    import sys
-    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/')
-    from variant_extractor import VariantExtractor
 
     # Parse arguments
-    parser = ArgumentParser(description='Generate CSV file from a VCF file')
+    parser = ArgumentParser(description='Generates a CSV file from a VCF')
     parser.add_argument('vcf_file', help='VCF file')
-    parser.add_argument('output_file', help='Output file')
-    parser.add_argument('-f', '--fasta-ref', help='FASTA reference file')
+    parser.add_argument('output', help='Output CSV file')
     args = parser.parse_args()
 
-    variants = []
-
-    print(f'Reading VCF file: {args.vcf_file}')
-    extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta_ref)
-    df = extractor.to_dataframe()
-    # Insert id column in the first position
-    df.insert(0, 'id', '')
-    df['id'] = df['variant_record_obj'].apply(lambda x: x.id)
-    df.drop(['variant_record_obj'], axis=1, inplace=True)
-    df.to_csv(f'{args.output_file}', index=False)
+    # Run the conversion; errors are not caught here, so they propagate.
+    vcf_to_csv(args.vcf_file, args.output)
diff --git a/modules/local/vcf_to_csv/main.nf b/modules/local/vcf_to_csv/main.nf
@@ -9,8 +9,6 @@ process VCF_TO_CSV {
 
     input:
     tuple val(meta), path(input)
-    tuple val(meta2), path(fasta)
-    tuple val(meta3), path(fasta_fai)
 
     output:
     tuple val(meta), path("*.csv")   , emit: output
@@ -25,9 +23,7 @@ process VCF_TO_CSV {
     """
     vcf_to_csv.py \\
         $input \\
-        ${prefix}.csv \\
-        -f $fasta
-
+        ${prefix}.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":

diff --git a/subworkflows/local/compare_benchmark_results.nf b/subworkflows/local/compare_benchmark_results.nf
@@ -66,9 +66,7 @@ workflow COMPARE_BENCHMARK_RESULTS {
     merged_vcfs = merged_vcfs.mix(SURVIVOR_MERGE.out.vcf)
 
     VCF_TO_CSV(
-        merged_vcfs,
-        fasta,
-        fai
+        merged_vcfs
     )
     versions = versions.mix(VCF_TO_CSV.out.versions)