Skip to content

Commit

Permalink
WIP: update Haplotype notebook to 2.x
Browse files Browse the repository at this point in the history
  • Loading branch information
ahwagner committed Jan 30, 2024
1 parent bede107 commit ca3c638
Showing 1 changed file with 115 additions and 96 deletions.
211 changes: 115 additions & 96 deletions notebooks/Haplotypes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,28 +37,22 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Removing allOf attribute from AbsoluteCopyNumber to avoid python-jsonschema-objects error.\n",
"Removing allOf attribute from SequenceInterval to avoid python-jsonschema-objects error.\n",
"Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.\n",
"/Users/kls035/src/vrs-python/venv/3.9/lib/python3.9/site-packages/python_jsonschema_objects/__init__.py:49: UserWarning: Schema version http://json-schema.org/draft-07/schema not recognized. Some keywords and features may not be supported.\n",
" warnings.warn(\n"
]
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.722521Z",
"start_time": "2024-01-30T19:58:14.505437Z"
}
],
},
"outputs": [],
"source": [
"from ga4gh.vrs import models, vrs_deref, vrs_enref\n",
"from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest, ga4gh_deref\n",
"\n",
"import json\n",
"\n",
"def ppo(o, indent=2):\n",
" \"\"\"pretty print object as json\"\"\"\n",
" print(json.dumps(o.as_dict(), sort_keys=True, indent=indent))\n",
" print(json.dumps(o.dict(exclude_none=True), sort_keys=True, indent=indent))\n",
" \n"
]
},
Expand All @@ -73,7 +67,12 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.725813Z",
"start_time": "2024-01-30T19:58:14.723142Z"
}
},
"outputs": [],
"source": [
"# NC_000019.10 (GRCh38 chr 19 primary assembly) sequence id\n",
Expand All @@ -84,43 +83,56 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.728545Z",
"start_time": "2024-01-30T19:58:14.725830Z"
}
},
"outputs": [],
"source": [
"locations = {\n",
" \"rs429358_38\": models.SequenceLocation(\n",
" sequence_id = sequence_id,\n",
" interval = models.SequenceInterval(start=models.Number(value=44908683, type=\"Number\"), \n",
" end=models.Number(value=44908684, type=\"Number\"), \n",
" type=\"SequenceInterval\"),\n",
" start = 44908683,\n",
" end = 44908684, \n",
" type=\"SequenceLocation\"),\n",
" \"rs7412_38\": models.SequenceLocation(\n",
" sequence_id = sequence_id,\n",
" interval=models.SequenceInterval(start=models.Number(value=44908821, type=\"Number\"), \n",
" end=models.Number(value=44908822, type=\"Number\"),\n",
" type=\"SequenceInterval\"),\n",
" start=44908821, \n",
" end=44908822,\n",
" type=\"SequenceLocation\")\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.731353Z",
"start_time": "2024-01-30T19:58:14.729608Z"
}
},
"outputs": [],
"source": [
"alleles = {\n",
" \"rs429358_38_C\": models.Allele(location=locations[\"rs429358_38\"], state=models.SequenceState(sequence=\"C\", type=\"SequenceState\"), type=\"Allele\"),\n",
" \"rs429358_38_T\": models.Allele(location=locations[\"rs429358_38\"], state=models.SequenceState(sequence=\"T\", type=\"SequenceState\"), type=\"Allele\"),\n",
" \"rs7412_38_C\": models.Allele(location=locations[\"rs7412_38\"], state=models.SequenceState(sequence=\"C\", type=\"SequenceState\"), type=\"Allele\"),\n",
" \"rs7412_38_T\": models.Allele(location=locations[\"rs7412_38\"], state=models.SequenceState(sequence=\"T\", type=\"SequenceState\"), type=\"Allele\"),\n",
" \"rs429358_38_C\": models.Allele(location=locations[\"rs429358_38\"], state=models.LiteralSequenceExpression(sequence=\"C\", type=\"LiteralSequenceExpression\"), type=\"Allele\"),\n",
" \"rs429358_38_T\": models.Allele(location=locations[\"rs429358_38\"], state=models.LiteralSequenceExpression(sequence=\"T\", type=\"LiteralSequenceExpression\"), type=\"Allele\"),\n",
" \"rs7412_38_C\": models.Allele(location=locations[\"rs7412_38\"], state=models.LiteralSequenceExpression(sequence=\"C\", type=\"LiteralSequenceExpression\"), type=\"Allele\"),\n",
" \"rs7412_38_T\": models.Allele(location=locations[\"rs7412_38\"], state=models.LiteralSequenceExpression(sequence=\"T\", type=\"LiteralSequenceExpression\"), type=\"Allele\"),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.735619Z",
"start_time": "2024-01-30T19:58:14.733439Z"
}
},
"outputs": [],
"source": [
"haplotypes = {\n",
Expand All @@ -135,7 +147,11 @@
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
"scrolled": true,
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.738312Z",
"start_time": "2024-01-30T19:58:14.736031Z"
}
},
"outputs": [
{
Expand All @@ -146,49 +162,30 @@
" \"members\": [\n",
" {\n",
" \"location\": {\n",
" \"interval\": {\n",
" \"end\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908684\n",
" },\n",
" \"start\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908683\n",
" },\n",
" \"type\": \"SequenceInterval\"\n",
" },\n",
" \"sequence_id\": \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n",
" \"end\": 44908684,\n",
" \"start\": 44908683,\n",
" \"type\": \"SequenceLocation\"\n",
" },\n",
" \"state\": {\n",
" \"sequence\": \"C\",\n",
" \"type\": \"SequenceState\"\n",
" \"type\": \"LiteralSequenceExpression\"\n",
" },\n",
" \"type\": \"Allele\"\n",
" },\n",
" {\n",
" \"location\": {\n",
" \"interval\": {\n",
" \"end\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908822\n",
" },\n",
" \"start\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908821\n",
" },\n",
" \"type\": \"SequenceInterval\"\n",
" },\n",
" \"sequence_id\": \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n",
" \"end\": 44908822,\n",
" \"start\": 44908821,\n",
" \"type\": \"SequenceLocation\"\n",
" },\n",
" \"state\": {\n",
" \"sequence\": \"T\",\n",
" \"type\": \"SequenceState\"\n",
" \"type\": \"LiteralSequenceExpression\"\n",
" },\n",
" \"type\": \"Allele\"\n",
" }\n",
" ]\n",
" ],\n",
" \"type\": \"Haplotype\"\n",
"}\n"
]
}
Expand All @@ -200,8 +197,22 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.742995Z",
"start_time": "2024-01-30T19:58:14.740239Z"
}
},
"outputs": [
{
"data": {
"text/plain": "'ga4gh:HT.oQfqyk1bh2QpfFsNCM6fwzHM3tzcjaNp'"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Generated a computed identifier for the Haplotype\n",
"ga4gh_identify(haplotypes[\"APOE-ε1\"])"
Expand All @@ -210,22 +221,26 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.766766Z",
"start_time": "2024-01-30T19:58:14.744418Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ga4gh:VH.XcRh22GN0SXmi0J7RyvQFw5cXb35Pesy\n",
"ga4gh:VH.gwpj5LNuNqwI9TZ-MNI6h7AZhjJztQ4O\n"
"ga4gh:HT.Asb-qa3Jb4mzhaVE_FNDIhfu_JaVtem4\n",
"ga4gh:HT.oQfqyk1bh2QpfFsNCM6fwzHM3tzcjaNp\n"
]
}
],
"source": [
"# The order of haplotype members does not change GA4GH Computed Identifier\n",
"# The order of haplotype members is meaningful in VRS 2.0! See VRS documentation for how to order.\n",
"apoe1_alleles = (alleles[\"rs7412_38_T\"], alleles[\"rs429358_38_C\"])\n",
"\n",
"#note: this should be the same, but currently is not?\n",
"print(ga4gh_identify(models.Haplotype(members=apoe1_alleles, type=\"Haplotype\")))\n",
"print(ga4gh_identify(models.Haplotype(members=tuple(reversed(apoe1_alleles)), type=\"Haplotype\")))\n",
"# assert (ga4gh_identify(models.Haplotype(members=apoe1_alleles)) ==\n",
Expand All @@ -235,7 +250,12 @@
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.767092Z",
"start_time": "2024-01-30T19:58:14.747525Z"
}
},
"outputs": [
{
"name": "stdout",
Expand All @@ -245,72 +265,71 @@
" \"members\": [\n",
" {\n",
" \"location\": {\n",
" \"interval\": {\n",
" \"end\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908684\n",
" },\n",
" \"start\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908683\n",
" },\n",
" \"type\": \"SequenceInterval\"\n",
" },\n",
" \"sequence_id\": \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n",
" \"end\": 44908684,\n",
" \"start\": 44908683,\n",
" \"type\": \"SequenceLocation\"\n",
" },\n",
" \"state\": {\n",
" \"sequence\": \"C\",\n",
" \"type\": \"SequenceState\"\n",
" \"type\": \"LiteralSequenceExpression\"\n",
" },\n",
" \"type\": \"Allele\"\n",
" },\n",
" {\n",
" \"location\": {\n",
" \"interval\": {\n",
" \"end\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908822\n",
" },\n",
" \"start\": {\n",
" \"type\": \"Number\",\n",
" \"value\": 44908821\n",
" },\n",
" \"type\": \"SequenceInterval\"\n",
" },\n",
" \"sequence_id\": \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n",
" \"end\": 44908822,\n",
" \"start\": 44908821,\n",
" \"type\": \"SequenceLocation\"\n",
" },\n",
" \"state\": {\n",
" \"sequence\": \"T\",\n",
" \"type\": \"SequenceState\"\n",
" \"type\": \"LiteralSequenceExpression\"\n",
" },\n",
" \"type\": \"Allele\"\n",
" }\n",
" ]\n",
" ],\n",
" \"type\": \"Haplotype\"\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/ahw001/git/vrs-python/venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n",
" Expected `Union[definition-ref, SequenceLocation]` but got `str` - serialized value may not be as expected\n",
" return self.__pydantic_serializer__.to_python(\n",
"/Users/ahw001/git/vrs-python/venv/lib/python3.10/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n",
" Expected `Union[Allele, definition-ref]` but got `str` - serialized value may not be as expected\n",
" Expected `Union[Allele, definition-ref]` but got `str` - serialized value may not be as expected\n",
" return self.__pydantic_serializer__.to_python(\n"
]
}
],
"source": [
"# Haplotype members may be referenced (rather than inline) for more concise representations\n",
"apoe1_haplotype_ref = vrs_enref(haplotypes[\"APOE-ε1\"])\n",
"ppo(apoe1_haplotype_ref)"
"ppo(haplotypes[\"APOE-ε1\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2024-01-30T19:58:14.767250Z",
"start_time": "2024-01-30T19:58:14.750808Z"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "vrs-python",
"name": "python3",
"language": "python",
"name": "vrs-python"
"display_name": "Python 3 (ipykernel)"
},
"language_info": {
"codemirror_mode": {
Expand Down

0 comments on commit ca3c638

Please sign in to comment.