-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
274 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bb4a4052", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import re\n", | ||
"import string\n", | ||
"from typing import Tuple, List\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"\n", | ||
"df = pd.read_csv(\"./hgvs_searches.csv\")\n", | ||
"non_resolve_mask = df[\"can_resolve\"] == False\n", | ||
"hgvs_errors_df = df[non_resolve_mask]\n", | ||
"\n", | ||
"hgvs_errors_df = hgvs_errors_df.sort_values(\"hgvs\")\n", | ||
" \n", | ||
"# dropping ALL duplicate values\n", | ||
"hgvs_errors_df.drop_duplicates(subset=\"hgvs\",\n", | ||
" keep=False, inplace=True)\n", | ||
"\n", | ||
"hgvs_errors_df.to_csv(\"hgvs_errors_uniq.csv\")\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a11dfb04", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"hgvs_errors_df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "8d610f24", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"\n", | ||
"\n", | ||
"pattern_kind_no_colon = re.compile(r\"(c|g|m|n|p)\\.(\\d+)\")\n", | ||
"pattern_kind_no_dot = re.compile(r\":(c|g|m|n|p)(\\d+)\")\n", | ||
"pattern_gene_symbol = re.compile(r\"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+\") # HGNC gene symbol - https://www.biostars.org/p/60118/#65063\n", | ||
"\n", | ||
"\n", | ||
"# Copy/pasted from pyhgvs\n", | ||
"# The RefSeq standard for naming contigs/transcripts/proteins:\n", | ||
"# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8\n", | ||
"REFSEQ_PREFIXES = [\n", | ||
" ('AC_', 'genomic',\n", | ||
" 'Complete genomic molecule, usually alternate assembly'),\n", | ||
" ('NC_', 'genomic',\n", | ||
" 'Complete genomic molecule, usually reference assembly'),\n", | ||
" ('NG_', 'genomic', 'Incomplete genomic region'),\n", | ||
" ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),\n", | ||
" ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),\n", | ||
" ('NS_', 'genomic', 'Environmental sequence'),\n", | ||
" ('NZ_', 'genomic', 'Unfinished WGS'),\n", | ||
" ('NM_', 'mRNA', ''),\n", | ||
" ('NR_', 'RNA', ''),\n", | ||
" ('XM_', 'mRNA', 'Predicted model'),\n", | ||
" ('XR_', 'RNA', 'Predicted model'),\n", | ||
" ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),\n", | ||
" ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),\n", | ||
" ('YP_', 'Protein', ''),\n", | ||
" ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),\n", | ||
" ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),\n", | ||
"]\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"def remove_non_printable_characters(hgvs_string):\n", | ||
" return re.sub(f'[^{re.escape(string.printable)}]', '', hgvs_string)\n", | ||
"\n", | ||
"def remove_whitespace(hgvs_string):\n", | ||
" \"\"\" This would be covered in remove_invalid_characters but this gives a nicer message \"\"\"\n", | ||
" return re.sub(\"\\s\", '', hgvs_string)\n", | ||
"\n", | ||
"def remove_invalid_characters(hgvs_string):\n", | ||
" return re.sub(\"[^A-Za-z0-9-_\\(\\)\\>=]\", '', hgvs_string)\n", | ||
"\n", | ||
"\n", | ||
"def clean_kind(hgvs_string):\n", | ||
" # Fix common typos\n", | ||
" \n", | ||
" # c, -> c. \n", | ||
" # ;c -> :c semicolon\n", | ||
" \n", | ||
" \n", | ||
" return hgvs_string\n", | ||
" \n", | ||
"\n", | ||
"def add_unmatched_brackets(hgvs_string):\n", | ||
" return hgvs_string\n", | ||
"\n", | ||
"def add_missing_colon(hgvs_string):\n", | ||
" # GLA c.\n", | ||
" # NM_001205293.2(CACNA1E):c.4165C>T'\n", | ||
" \n", | ||
" return hgvs_string\n", | ||
"\n", | ||
"def remove_duplicates(hgvs_string):\n", | ||
" hgvs_string = re.sub(\"::+\", \":\", hgvs_string)\n", | ||
" hgvs_string = re.sub(\"\\.\\.+\", \".\", hgvs_string)\n", | ||
" return hgvs_string\n", | ||
"\n", | ||
"\n", | ||
"def fix_allele_case(allele_string):\n", | ||
" allele_keyworks = [\n", | ||
" 'del',\n", | ||
" 'delins',\n", | ||
" 'dup',\n", | ||
" 'ins',\n", | ||
" 'inv',\n", | ||
" ]\n", | ||
" for ak in allele_keyworks:\n", | ||
" allele_string = re.sub(ak, ak, allele_string, flags=re.IGNORECASE)\n", | ||
" return allele_string\n", | ||
" \n", | ||
"\n", | ||
"GLOBAL_CLEAN = {\n", | ||
" \"remove_non_printable_characters\": remove_non_printable_characters,\n", | ||
" \"remove_whitespace\": remove_whitespace,\n", | ||
" \"remove_invalid_characters\", remove_invalid_characters,\n", | ||
" \"remove duplicates\": remove_duplicates,\n", | ||
"}\n", | ||
"\n", | ||
"\n", | ||
" # Optional - remove gene symbol - (for clingen and biocommons HGVS) \n", | ||
" # \"remove_gene_symbol\": remove_gene_symbol,\n", | ||
"\n", | ||
"# \"clean_kind\": clean_kind,\n", | ||
"# \"add_unmatched_brackets\": add_unmatched_brackets,\n", | ||
"# \"add_missing_colon\": add_missing_colon,\n", | ||
"\n", | ||
"\n", | ||
"test_hgvs = [\n", | ||
" \"c.4165C>T\", # This should fail as it has no transcript/gene\n", | ||
" \"CACNA1E:c.4165C>T'\", # gene name - it's resolution that is trick here\n", | ||
" \"CACNA1E c.4165C>T'\", # extra space, missing colon \n", | ||
" \"CACNA1Ec.4165C>T'\", # missing colon\n", | ||
" \"NM_001205293.2 :c.4165C>T'\", # whitespace\n", | ||
" \"NM_001205293.2(CACNA1E):c.4165C>T'\", # \n", | ||
" \"NM_001205293.2 :c.4165C>T'\", # whitespace\n", | ||
"]\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:\n", | ||
" hgvs_string = original_hgvs_string\n", | ||
" clean_messages = []\n", | ||
"\n", | ||
" for clean_method_desc, clean_hgvs_func in GLOBAL_CLEAN.items():\n", | ||
" cleaned_hgvs_string = clean_hgvs_func(hgvs_string) # hgvs_method)\n", | ||
" if cleaned_hgvs_string != hgvs_string:\n", | ||
" clean_messages.append(clean_method_desc)\n", | ||
" hgvs_string = cleaned_hgvs_string\n", | ||
"\n", | ||
"\n", | ||
" # Now we split it up into reference/kind/allele\n", | ||
" \n", | ||
" \n", | ||
" return hgvs_string, clean_messages\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d30b8846", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# I think we should first break it up into \n", | ||
"# reference / kind / allele\n", | ||
"# \n", | ||
"\n", | ||
"\n", | ||
"original_hgvs_string = \"GLA c.1277_1278delAA\"\n", | ||
"hgvs_string, clean_messages = clean_hgvs(original_hgvs_string)\n", | ||
"print(f\"{original_hgvs_string} -> {hgvs_string} \")\n", | ||
"for msg in clean_messages:\n", | ||
" print(msg)\n", | ||
" \n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ba35d85d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# This is from VariantGrid code\n", | ||
"\n", | ||
"\n", | ||
" BAD_HGVS = [\n", | ||
" \"NM_000038.6;c.4332A>T\" # Semicolon instead\n", | ||
" \"NM_205768 c.44A>G\", # Missing colon (no version)\n", | ||
" \"NM_005629.3:c1403A>C\", # Missing dot after kind\n", | ||
" \"NM_001101.4 c.95C>G\", # Missing colon\n", | ||
" \"NM_00380.3: c.648_649delGA\", # space after colon\n", | ||
" \"NC_000023.10:g. 31496384G>A\",\n", | ||
" \"NM_004245: :c.337G>T\", # Double colon\n", | ||
" \"NC_000017.10:g.21085664 G>C\", # Space after numbers\n", | ||
" \"NC_000023.10:g. 133547943G>A\", # Space after g.\n", | ||
" # Missing transcript underscore, Missing colon, Missing dot after g\n", | ||
" # Space between position and reference base\n", | ||
" \"NC000002.10g39139341 C>T\",\n", | ||
" # Unbalanced brackets\n", | ||
" \"NM_001754.5):c.557T>A\",\n", | ||
" \"(NM_004991.4:c.2577+4A>T\",\n", | ||
" # Good brackets HGVS (just testing gene symbol)\n", | ||
" \"NM_001754.5(RUNX1):c.1415T>C\",\n", | ||
" \"NM_032638:c.1126_1133DUP\", # Case\n", | ||
" \"NM_001754.5:557T>A\", # Missing \"c.\"\n", | ||
" \"NC_000007.13:117199563G>T\", # Missing \"g.\"\n", | ||
" ]\n", | ||
"\n", | ||
" for bad_hgvs in BAD_HGVS:\n", | ||
" try:\n", | ||
" HGVSName(bad_hgvs)\n", | ||
" self.fail(f\"Expected '{bad_hgvs}' to fail!\")\n", | ||
" except:\n", | ||
" pass # Expected\n", | ||
"\n", | ||
" fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]\n", | ||
" HGVSName(fixed_hgvs)\n", | ||
"\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "86c509cb", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |