Skip to content

Commit

Permalink
feat: extract-vars writes thinned-out SCV infos (#240) (#258)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Sep 6, 2024
1 parent 66671cd commit 19ecf18
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 13 deletions.
46 changes: 44 additions & 2 deletions clinvar_data/extract_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
import tqdm

from clinvar_data.pbs.clinvar_public import Allele, ClassifiedRecord, VariationArchive
from clinvar_data.pbs.clinvar_public_pb2 import AggregateClassificationSet
from clinvar_data.pbs.clinvar_public_pb2 import (
AggregateClassificationSet,
ClinicalAssertion,
)
from clinvar_data.pbs.extracted_vars import (
ExtractedRcvRecord,
ExtractedVcvRecord,
Expand Down Expand Up @@ -93,6 +96,43 @@ def thin_out_aggregate_classification_set(
return result


def thin_out_clinical_assertions(
    clinical_assertions: typing.Iterable[ClinicalAssertion],
) -> list[ClinicalAssertion]:
    """Build reduced copies of the given clinical assertions.

    Every input message is copied into a fresh ``ClinicalAssertion`` via
    ``CopyFrom``; the fields listed below are then cleared on the copy only.
    When the copy has a ``clinvar_accession`` sub-message, its
    ``submitter_identifiers`` field is cleared; when it has a
    ``classifications`` sub-message, its ``comments`` field is cleared.

    :param clinical_assertions: The clinical assertions to thin out.
    :return: One thinned-out copy per input assertion, in input order.
    """
    # Top-level fields removed from every copied assertion.
    dropped_fields = (
        "clinvar_submission_id",
        "additional_submitters",
        "record_status",
        "attributes",
        "observed_ins",
        "simple_allele",
        "haplotype",
        "genotype",
        "trait_set",
        "citations",
        "study_name",
        "study_description",
        "comments",
        "submission_names",
        "date_created",
        "date_last_updated",
        "submission_date",
        "id",
        "fda_recognized_database",
    )

    def _thinned(source: ClinicalAssertion) -> ClinicalAssertion:
        # Work on a copy so that clearing fields does not touch ``source``.
        slim = ClinicalAssertion()
        slim.CopyFrom(source)
        for field_name in dropped_fields:
            slim.ClearField(field_name)
        # Only reach into the sub-messages when they are actually set.
        if slim.HasField("clinvar_accession"):
            slim.clinvar_accession.ClearField("submitter_identifiers")
        if slim.HasField("classifications"):
            slim.classifications.ClearField("comments")
        return slim

    return [_thinned(assertion) for assertion in clinical_assertions]


def run(path_input: str, output_dir: str, gzip_output: bool):
"""Execute the variant extraction."""
os.makedirs(output_dir, exist_ok=True)
Expand Down Expand Up @@ -142,7 +182,6 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
for gene in classified_record.simple_allele.genes
if gene.HasField("hgnc_id")
]

for location in simple_allele.locations or []:
for sequence_location in location.sequence_locations or []:
record = ExtractedVcvRecord(
Expand All @@ -153,6 +192,9 @@ def run(path_input: str, output_dir: str, gzip_output: bool):
classifications=(
thin_out_aggregate_classification_set(classified_record.classifications)
),
clinical_assertions=(
thin_out_clinical_assertions(classified_record.clinical_assertions)
),
sequence_location=sequence_location,
hgnc_ids=hgnc_ids,
)
Expand Down
1 change: 0 additions & 1 deletion clinvar_data/pbs/class_by_freq_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion clinvar_data/pbs/clinvar_public_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 4 additions & 5 deletions clinvar_data/pbs/extracted_vars_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions clinvar_data/pbs/extracted_vars_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
NAME_FIELD_NUMBER: builtins.int
VARIATION_TYPE_FIELD_NUMBER: builtins.int
CLASSIFICATIONS_FIELD_NUMBER: builtins.int
CLINICAL_ASSERTIONS_FIELD_NUMBER: builtins.int
SEQUENCE_LOCATION_FIELD_NUMBER: builtins.int
HGNC_IDS_FIELD_NUMBER: builtins.int
name: builtins.str
Expand All @@ -193,6 +194,14 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
def classifications(self) -> clinvar_data.pbs.clinvar_public_pb2.AggregateClassificationSet:
"""Classifications (thinned out)."""

@property
def clinical_assertions(
self,
) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[
clinvar_data.pbs.clinvar_public_pb2.ClinicalAssertion
]:
"""Clinical assertions (thinned out)."""

@property
def sequence_location(self) -> clinvar_data.pbs.clinvar_public_pb2.Location.SequenceLocation:
"""The sequence location on one reference."""
Expand All @@ -213,6 +222,9 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
classifications: (
clinvar_data.pbs.clinvar_public_pb2.AggregateClassificationSet | None
) = ...,
clinical_assertions: (
collections.abc.Iterable[clinvar_data.pbs.clinvar_public_pb2.ClinicalAssertion] | None
) = ...,
sequence_location: (
clinvar_data.pbs.clinvar_public_pb2.Location.SequenceLocation | None
) = ...,
Expand All @@ -236,6 +248,8 @@ class ExtractedVcvRecord(google.protobuf.message.Message):
b"accession",
"classifications",
b"classifications",
"clinical_assertions",
b"clinical_assertions",
"hgnc_ids",
b"hgnc_ids",
"name",
Expand Down
1 change: 0 additions & 1 deletion clinvar_data/pbs/gene_impact_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion clinvar_data/pbs/phenotype_link_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions protos/clinvar_data/pbs/extracted_vars.proto
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ message ExtractedVcvRecord {
VariationType variation_type = 4;
// Classifications (thinned out).
clinvar_data.pbs.clinvar_public.AggregateClassificationSet classifications = 5;
// Clinical assertions (thinned out).
repeated clinvar_data.pbs.clinvar_public.ClinicalAssertion clinical_assertions = 8;
// The sequence location on one reference.
clinvar_data.pbs.clinvar_public.Location.SequenceLocation sequence_location = 6;
// List of HGNC IDs.
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, "referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"assembly": "GRCh37", "chr": "CHROMOSOME_2", "accession": "NC_000002.11", "start": 143685263, "stop": 143685263, "displayStart": 143685263, "displayStop": 143685263, "variantLength": 1, "referenceAllele": "G", "alternateAllele": "C", "positionVcf": 143685263, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"], "clinicalAssertions": [{"clinvarAccession": {"accession": "SCV001433049", "version": 1, "dateUpdated": "2020-09-27T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z"}, "classifications": {"reviewStatus": "SUBMITTER_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "germlineClassification": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z"}, "assertion": "ASSERTION_VARIATION_TO_DISEASE"}]}
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"forDisplay": true, "assembly": "GRCh38", "chr": "CHROMOSOME_2", "accession": "NC_000002.12", "start": 142927694, "stop": 142927694, "displayStart": 142927694, "displayStop": 142927694, "variantLength": 1, "positionVcf": 142927694, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"]}
{"accession": {"accession": "VCV000978270", "version": 1}, "rcvs": [{"accession": {"accession": "RCV001256675", "version": 1}, "title": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser) AND Catel-Manzke syndrome", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": {"value": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "submissionCount": 1}}}}], "name": "NM_003937.3(KYNU):c.326G>C (p.Trp109Ser)", "variationType": "VARIATION_TYPE_SNV", "classifications": {"germlineClassification": {"reviewStatus": "AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "description": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z", "mostRecentSubmission": "2020-09-27T00:00:00Z", "numberOfSubmitters": 1, "numberOfSubmissions": 1}}, "sequenceLocation": {"forDisplay": true, "assembly": "GRCh38", "chr": "CHROMOSOME_2", "accession": "NC_000002.12", "start": 142927694, "stop": 142927694, "displayStart": 142927694, "displayStop": 142927694, "variantLength": 1, "positionVcf": 142927694, "referenceAlleleVcf": "G", "alternateAlleleVcf": "C"}, "hgncIds": ["HGNC:6469"], "clinicalAssertions": [{"clinvarAccession": {"accession": "SCV001433049", "version": 1, "dateUpdated": "2020-09-27T00:00:00Z", "dateCreated": "2020-09-27T00:00:00Z"}, "classifications": {"reviewStatus": "SUBMITTER_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER", "germlineClassification": "Pathogenic", "dateLastEvaluated": "2012-01-07T00:00:00Z"}, "assertion": "ASSERTION_VARIATION_TO_DISEASE"}]}

0 comments on commit 19ecf18

Please sign in to comment.