Skip to content

Commit

Permalink
Paper notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Mar 3, 2023
1 parent c8aee43 commit 7489b15
Showing 1 changed file with 274 additions and 0 deletions.
274 changes: 274 additions & 0 deletions paper/HGVS cleaning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "bb4a4052",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import string\n",
"from typing import Tuple, List\n",
"import pandas as pd\n",
"\n",
"\n",
"df = pd.read_csv(\"./hgvs_searches.csv\")\n",
"non_resolve_mask = df[\"can_resolve\"] == False\n",
"hgvs_errors_df = df[non_resolve_mask]\n",
"\n",
"hgvs_errors_df = hgvs_errors_df.sort_values(\"hgvs\")\n",
" \n",
"# dropping ALL duplicate values\n",
"hgvs_errors_df.drop_duplicates(subset=\"hgvs\",\n",
" keep=False, inplace=True)\n",
"\n",
"hgvs_errors_df.to_csv(\"hgvs_errors_uniq.csv\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a11dfb04",
"metadata": {},
"outputs": [],
"source": [
"\n",
"hgvs_errors_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d610f24",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"\n",
"pattern_kind_no_colon = re.compile(r\"(c|g|m|n|p)\\.(\\d+)\")\n",
"pattern_kind_no_dot = re.compile(r\":(c|g|m|n|p)(\\d+)\")\n",
"pattern_gene_symbol = re.compile(r\"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+\") # HGNC gene symbol - https://www.biostars.org/p/60118/#65063\n",
"\n",
"\n",
"# Copy/pasted from pyhgvs\n",
"# The RefSeq standard for naming contigs/transcripts/proteins:\n",
"# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8\n",
"REFSEQ_PREFIXES = [\n",
" ('AC_', 'genomic',\n",
" 'Complete genomic molecule, usually alternate assembly'),\n",
" ('NC_', 'genomic',\n",
" 'Complete genomic molecule, usually reference assembly'),\n",
" ('NG_', 'genomic', 'Incomplete genomic region'),\n",
" ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),\n",
" ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),\n",
" ('NS_', 'genomic', 'Environmental sequence'),\n",
" ('NZ_', 'genomic', 'Unfinished WGS'),\n",
" ('NM_', 'mRNA', ''),\n",
" ('NR_', 'RNA', ''),\n",
" ('XM_', 'mRNA', 'Predicted model'),\n",
" ('XR_', 'RNA', 'Predicted model'),\n",
" ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),\n",
" ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),\n",
" ('YP_', 'Protein', ''),\n",
" ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),\n",
" ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),\n",
"]\n",
"\n",
"\n",
"\n",
"\n",
"def remove_non_printable_characters(hgvs_string):\n",
" return re.sub(f'[^{re.escape(string.printable)}]', '', hgvs_string)\n",
"\n",
"def remove_whitespace(hgvs_string):\n",
" \"\"\" This would be covered in remove_invalid_characters but this gives a nicer message \"\"\"\n",
" return re.sub(\"\\s\", '', hgvs_string)\n",
"\n",
"def remove_invalid_characters(hgvs_string):\n",
" return re.sub(\"[^A-Za-z0-9-_\\(\\)\\>=]\", '', hgvs_string)\n",
"\n",
"\n",
"def clean_kind(hgvs_string):\n",
" # Fix common typos\n",
" \n",
" # c, -> c. \n",
" # ;c -> :c semicolon\n",
" \n",
" \n",
" return hgvs_string\n",
" \n",
"\n",
"def add_unmatched_brackets(hgvs_string):\n",
" return hgvs_string\n",
"\n",
"def add_missing_colon(hgvs_string):\n",
" # GLA c.\n",
" # NM_001205293.2(CACNA1E):c.4165C>T'\n",
" \n",
" return hgvs_string\n",
"\n",
"def remove_duplicates(hgvs_string):\n",
" hgvs_string = re.sub(\"::+\", \":\", hgvs_string)\n",
" hgvs_string = re.sub(\"\\.\\.+\", \".\", hgvs_string)\n",
" return hgvs_string\n",
"\n",
"\n",
"def fix_allele_case(allele_string):\n",
" allele_keyworks = [\n",
" 'del',\n",
" 'delins',\n",
" 'dup',\n",
" 'ins',\n",
" 'inv',\n",
" ]\n",
" for ak in allele_keyworks:\n",
" allele_string = re.sub(ak, ak, allele_string, flags=re.IGNORECASE)\n",
" return allele_string\n",
" \n",
"\n",
"GLOBAL_CLEAN = {\n",
" \"remove_non_printable_characters\": remove_non_printable_characters,\n",
" \"remove_whitespace\": remove_whitespace,\n",
" \"remove_invalid_characters\", remove_invalid_characters,\n",
" \"remove duplicates\": remove_duplicates,\n",
"}\n",
"\n",
"\n",
" # Optional - remove gene symbol - (for clingen and biocommons HGVS) \n",
" # \"remove_gene_symbol\": remove_gene_symbol,\n",
"\n",
"# \"clean_kind\": clean_kind,\n",
"# \"add_unmatched_brackets\": add_unmatched_brackets,\n",
"# \"add_missing_colon\": add_missing_colon,\n",
"\n",
"\n",
"test_hgvs = [\n",
" \"c.4165C>T\", # This should fail as it has no transcript/gene\n",
" \"CACNA1E:c.4165C>T'\", # gene name - it's resolution that is trick here\n",
" \"CACNA1E c.4165C>T'\", # extra space, missing colon \n",
" \"CACNA1Ec.4165C>T'\", # missing colon\n",
" \"NM_001205293.2 :c.4165C>T'\", # whitespace\n",
" \"NM_001205293.2(CACNA1E):c.4165C>T'\", # \n",
" \"NM_001205293.2 :c.4165C>T'\", # whitespace\n",
"]\n",
"\n",
"\n",
"\n",
"def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:\n",
" hgvs_string = original_hgvs_string\n",
" clean_messages = []\n",
"\n",
" for clean_method_desc, clean_hgvs_func in GLOBAL_CLEAN.items():\n",
" cleaned_hgvs_string = clean_hgvs_func(hgvs_string) # hgvs_method)\n",
" if cleaned_hgvs_string != hgvs_string:\n",
" clean_messages.append(clean_method_desc)\n",
" hgvs_string = cleaned_hgvs_string\n",
"\n",
"\n",
" # Now we split it up into reference/kind/allele\n",
" \n",
" \n",
" return hgvs_string, clean_messages\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d30b8846",
"metadata": {},
"outputs": [],
"source": [
"# I think we should first break it up into \n",
"# reference / kind / allele\n",
"# \n",
"\n",
"\n",
"original_hgvs_string = \"GLA c.1277_1278delAA\"\n",
"hgvs_string, clean_messages = clean_hgvs(original_hgvs_string)\n",
"print(f\"{original_hgvs_string} -> {hgvs_string} \")\n",
"for msg in clean_messages:\n",
" print(msg)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba35d85d",
"metadata": {},
"outputs": [],
"source": [
"# This is from VariantGrid code\n",
"\n",
"\n",
" BAD_HGVS = [\n",
" \"NM_000038.6;c.4332A>T\" # Semicolon instead\n",
" \"NM_205768 c.44A>G\", # Missing colon (no version)\n",
" \"NM_005629.3:c1403A>C\", # Missing dot after kind\n",
" \"NM_001101.4 c.95C>G\", # Missing colon\n",
" \"NM_00380.3: c.648_649delGA\", # space after colon\n",
" \"NC_000023.10:g. 31496384G>A\",\n",
" \"NM_004245: :c.337G>T\", # Double colon\n",
" \"NC_000017.10:g.21085664 G>C\", # Space after numbers\n",
" \"NC_000023.10:g. 133547943G>A\", # Space after g.\n",
" # Missing transcript underscore, Missing colon, Missing dot after g\n",
" # Space between position and reference base\n",
" \"NC000002.10g39139341 C>T\",\n",
" # Unbalanced brackets\n",
" \"NM_001754.5):c.557T>A\",\n",
" \"(NM_004991.4:c.2577+4A>T\",\n",
" # Good brackets HGVS (just testing gene symbol)\n",
" \"NM_001754.5(RUNX1):c.1415T>C\",\n",
" \"NM_032638:c.1126_1133DUP\", # Case\n",
" \"NM_001754.5:557T>A\", # Missing \"c.\"\n",
" \"NC_000007.13:117199563G>T\", # Missing \"g.\"\n",
" ]\n",
"\n",
" for bad_hgvs in BAD_HGVS:\n",
" try:\n",
" HGVSName(bad_hgvs)\n",
" self.fail(f\"Expected '{bad_hgvs}' to fail!\")\n",
" except:\n",
" pass # Expected\n",
"\n",
" fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]\n",
" HGVSName(fixed_hgvs)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86c509cb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 7489b15

Please sign in to comment.