From 7489b15d3c51c453899ddd1f51b1e4ecab643fe7 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Fri, 3 Mar 2023 17:47:40 +1030 Subject: [PATCH] Paper notebook --- paper/HGVS cleaning.ipynb | 274 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 paper/HGVS cleaning.ipynb diff --git a/paper/HGVS cleaning.ipynb b/paper/HGVS cleaning.ipynb new file mode 100644 index 0000000..8aa5b4a --- /dev/null +++ b/paper/HGVS cleaning.ipynb @@ -0,0 +1,274 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bb4a4052", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import string\n", + "from typing import Tuple, List\n", + "import pandas as pd\n", + "\n", + "\n", + "df = pd.read_csv(\"./hgvs_searches.csv\")\n", + "non_resolve_mask = df[\"can_resolve\"] == False\n", + "hgvs_errors_df = df[non_resolve_mask]\n", + "\n", + "hgvs_errors_df = hgvs_errors_df.sort_values(\"hgvs\")\n", + " \n", + "# dropping ALL duplicate values\n", + "hgvs_errors_df.drop_duplicates(subset=\"hgvs\",\n", + " keep=False, inplace=True)\n", + "\n", + "hgvs_errors_df.to_csv(\"hgvs_errors_uniq.csv\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a11dfb04", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "hgvs_errors_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d610f24", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "\n", + "pattern_kind_no_colon = re.compile(r\"(c|g|m|n|p)\\.(\\d+)\")\n", + "pattern_kind_no_dot = re.compile(r\":(c|g|m|n|p)(\\d+)\")\n", + "pattern_gene_symbol = re.compile(r\"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+\") # HGNC gene symbol - https://www.biostars.org/p/60118/#65063\n", + "\n", + "\n", + "# Copy/pasted from pyhgvs\n", + "# The RefSeq standard for naming contigs/transcripts/proteins:\n", + "# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8\n", + "REFSEQ_PREFIXES = [\n", + " ('AC_', 'genomic',\n", + " 'Complete genomic molecule, usually alternate assembly'),\n", + " ('NC_', 'genomic',\n", + " 'Complete genomic molecule, usually reference assembly'),\n", + " ('NG_', 'genomic', 'Incomplete genomic region'),\n", + " ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),\n", + " ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),\n", + " ('NS_', 'genomic', 'Environmental sequence'),\n", + " ('NZ_', 'genomic', 'Unfinished WGS'),\n", + " ('NM_', 'mRNA', ''),\n", + " ('NR_', 'RNA', ''),\n", + " ('XM_', 'mRNA', 'Predicted model'),\n", + " ('XR_', 'RNA', 'Predicted model'),\n", + " ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),\n", + " ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),\n", + " ('YP_', 'Protein', ''),\n", + " ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),\n", + " ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),\n", + "]\n", + "\n", + "\n", + "\n", + "\n", + "def remove_non_printable_characters(hgvs_string):\n", + " return re.sub(f'[^{re.escape(string.printable)}]', '', hgvs_string)\n", + "\n", + "def remove_whitespace(hgvs_string):\n", + " \"\"\" This would be covered in remove_invalid_characters but this gives a nicer message \"\"\"\n", + " return re.sub(\"\\s\", '', hgvs_string)\n", + "\n", + "def remove_invalid_characters(hgvs_string):\n", + " return re.sub(\"[^A-Za-z0-9-_\\(\\)\\>=]\", '', hgvs_string)\n", + "\n", + "\n", + "def clean_kind(hgvs_string):\n", + " # Fix common typos\n", + " \n", + " # c, -> c. \n", + " # ;c -> :c semicolon\n", + " \n", + " \n", + " return hgvs_string\n", + " \n", + "\n", + "def add_unmatched_brackets(hgvs_string):\n", + " return hgvs_string\n", + "\n", + "def add_missing_colon(hgvs_string):\n", + " # GLA c.\n", + " # NM_001205293.2(CACNA1E):c.4165C>T'\n", + " \n", + " return hgvs_string\n", + "\n", + "def remove_duplicates(hgvs_string):\n", + " hgvs_string = re.sub(\"::+\", \":\", hgvs_string)\n", + " hgvs_string = re.sub(\"\\.\\.+\", \".\", hgvs_string)\n", + " return hgvs_string\n", + "\n", + "\n", + "def fix_allele_case(allele_string):\n", + " allele_keyworks = [\n", + " 'del',\n", + " 'delins',\n", + " 'dup',\n", + " 'ins',\n", + " 'inv',\n", + " ]\n", + " for ak in allele_keyworks:\n", + " allele_string = re.sub(ak, ak, allele_string, flags=re.IGNORECASE)\n", + " return allele_string\n", + " \n", + "\n", + "GLOBAL_CLEAN = {\n", + " \"remove_non_printable_characters\": remove_non_printable_characters,\n", + " \"remove_whitespace\": remove_whitespace,\n", + " \"remove_invalid_characters\", remove_invalid_characters,\n", + " \"remove duplicates\": remove_duplicates,\n", + "}\n", + "\n", + "\n", + " # Optional - remove gene symbol - (for clingen and biocommons HGVS) \n", + " # \"remove_gene_symbol\": remove_gene_symbol,\n", + "\n", + "# \"clean_kind\": clean_kind,\n", + "# \"add_unmatched_brackets\": add_unmatched_brackets,\n", + "# \"add_missing_colon\": add_missing_colon,\n", + "\n", + "\n", + "test_hgvs = [\n", + " \"c.4165C>T\", # This should fail as it has no transcript/gene\n", + " \"CACNA1E:c.4165C>T'\", # gene name - it's resolution that is trick here\n", + " \"CACNA1E c.4165C>T'\", # extra space, missing colon \n", + " \"CACNA1Ec.4165C>T'\", # missing colon\n", + " \"NM_001205293.2 :c.4165C>T'\", # whitespace\n", + " \"NM_001205293.2(CACNA1E):c.4165C>T'\", # \n", + " \"NM_001205293.2 :c.4165C>T'\", # whitespace\n", + "]\n", + "\n", + "\n", + "\n", + "def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:\n", + " hgvs_string = original_hgvs_string\n", + " clean_messages = []\n", + "\n", + " for clean_method_desc, clean_hgvs_func in GLOBAL_CLEAN.items():\n", + " cleaned_hgvs_string = clean_hgvs_func(hgvs_string) # hgvs_method)\n", + " if cleaned_hgvs_string != hgvs_string:\n", + " clean_messages.append(clean_method_desc)\n", + " hgvs_string = cleaned_hgvs_string\n", + "\n", + "\n", + " # Now we split it up into reference/kind/allele\n", + " \n", + " \n", + " return hgvs_string, clean_messages\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d30b8846", + "metadata": {}, + "outputs": [], + "source": [ + "# I think we should first break it up into \n", + "# reference / kind / allele\n", + "# \n", + "\n", + "\n", + "original_hgvs_string = \"GLA c.1277_1278delAA\"\n", + "hgvs_string, clean_messages = clean_hgvs(original_hgvs_string)\n", + "print(f\"{original_hgvs_string} -> {hgvs_string} \")\n", + "for msg in clean_messages:\n", + " print(msg)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba35d85d", + "metadata": {}, + "outputs": [], + "source": [ + "# This is from VariantGrid code\n", + "\n", + "\n", + " BAD_HGVS = [\n", + " \"NM_000038.6;c.4332A>T\" # Semicolon instead\n", + " \"NM_205768 c.44A>G\", # Missing colon (no version)\n", + " \"NM_005629.3:c1403A>C\", # Missing dot after kind\n", + " \"NM_001101.4 c.95C>G\", # Missing colon\n", + " \"NM_00380.3: c.648_649delGA\", # space after colon\n", + " \"NC_000023.10:g. 31496384G>A\",\n", + " \"NM_004245: :c.337G>T\", # Double colon\n", + " \"NC_000017.10:g.21085664 G>C\", # Space after numbers\n", + " \"NC_000023.10:g. 133547943G>A\", # Space after g.\n", + " # Missing transcript underscore, Missing colon, Missing dot after g\n", + " # Space between position and reference base\n", + " \"NC000002.10g39139341 C>T\",\n", + " # Unbalanced brackets\n", + " \"NM_001754.5):c.557T>A\",\n", + " \"(NM_004991.4:c.2577+4A>T\",\n", + " # Good brackets HGVS (just testing gene symbol)\n", + " \"NM_001754.5(RUNX1):c.1415T>C\",\n", + " \"NM_032638:c.1126_1133DUP\", # Case\n", + " \"NM_001754.5:557T>A\", # Missing \"c.\"\n", + " \"NC_000007.13:117199563G>T\", # Missing \"g.\"\n", + " ]\n", + "\n", + " for bad_hgvs in BAD_HGVS:\n", + " try:\n", + " HGVSName(bad_hgvs)\n", + " self.fail(f\"Expected '{bad_hgvs}' to fail!\")\n", + " except:\n", + " pass # Expected\n", + "\n", + " fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]\n", + " HGVSName(fixed_hgvs)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86c509cb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}