Merge pull request #89 from leexgh/hgnc-2023-10

Update HGNC with prev_symbols
genome-nexus · Jan 9, 2024 · 1a12a21
2 parents e26b4b0 + d13f697
commit 1a12a21
Show file tree

Hide file tree

Showing 10 changed files with 156,858 additions and 150,274 deletions.
diff --git a/data/common_input/hgnc_complete_set_2023-10.txt b/data/common_input/hgnc_complete_set_2023-10.txt
diff --git a/data/grch37_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt b/data/grch37_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt
diff --git a/data/grch37_ensembl92/export/ensembl_biomart_transcripts.json.gz b/data/grch37_ensembl92/export/ensembl_biomart_transcripts.json.gz
diff --git a/data/grch38_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt b/data/grch38_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt
diff --git a/data/grch38_ensembl92/export/ensembl_biomart_transcripts.json.gz b/data/grch38_ensembl92/export/ensembl_biomart_transcripts.json.gz
diff --git a/data/grch38_ensembl92/export/oncokb_cancer_genes_list_from_API.json b/data/grch38_ensembl92/export/oncokb_cancer_genes_list_from_API.json
diff --git a/data/grch38_ensembl95/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt b/data/grch38_ensembl95/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt
diff --git a/data/grch38_ensembl95/export/ensembl_biomart_transcripts.json.gz b/data/grch38_ensembl95/export/ensembl_biomart_transcripts.json.gz
diff --git a/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py b/scripts/add_domains_hugo_ccds_refseq_exon_info_uniprot_to_ensembl_transcript.py
@@ -31,7 +31,7 @@ def get_hgnc_symbol(transcript_id, hgnc_dict):
 
     hgnc_dict = dict()
     for index, row in hgnc_df.iterrows():
-        for symbol in row['synonyms'].split('|'):
+        for symbol in row['prev_symbol'].split('|'):
             hgnc_dict[symbol] = index
 
     hgnc_symbol_list = transcripts.index.drop_duplicates().map(lambda transcript_id: get_hgnc_symbol(transcript_id, hgnc_dict))
@@ -213,7 +213,7 @@ def main(ensembl_biomart_transcripts,
     transcripts = add_ccds(transcripts, ccds, isoform_overrides_uniprot, isoform_overrides_mskcc)
 
     # Add nested HGNC, exons and PFAM domains
-    hgnc_df = pd.read_csv(hgnc_symbol_set, sep='\t', usecols = ['symbol', 'synonyms'], index_col=0).dropna()
+    hgnc_df = pd.read_csv(hgnc_symbol_set, sep='\t', usecols = ['symbol', 'prev_symbol'], index_col=0).dropna()
     transcripts = add_nested_hgnc(transcripts, hgnc_df)
     transcripts = add_nested_transcript_info(transcripts, transcript_info)
     transcripts = add_nested_pfam_domains(transcripts, pfam_domains)

diff --git a/scripts/make_one_canonical_transcript_per_gene.py b/scripts/make_one_canonical_transcript_per_gene.py
@@ -146,13 +146,18 @@ def main(ensembl_biomart_geneids_transcript_info,
     # Convert new column names to old stable column names. If this is not done properly, Genome Nexus and any other
     # downstream applications break
     # TODO: Update Genome Nexus to accept the latest HGNC column names so that remapping is not necessary.
-    column_name_mapping = {
+    column_name_mapping = {'name': 'approved_name',
                            'symbol': 'approved_symbol',
-                           'synonyms': 'synonyms',
+                           'prev_symbol': 'previous_symbols',
+                           'alias_symbol': 'synonyms',
                            'location': 'chromosome',
                            'entrez_id': 'entrez_gene_id',
+                           'ena': 'accession_numbers',
+                           'refseq_accession': 'refseq_ids',
+                           'uniprot_ids': 'uniprot_id',
                            'ensembl_id': 'ensembl_gene_id'}
     hgnc_df.rename(columns=column_name_mapping, inplace=True)
+    hgnc_df = hgnc_df[hgnc_df['approved_name'] != 'entry withdrawn'].copy()
     hugos = hgnc_df['approved_symbol'].unique()
     hgnc_df = hgnc_df.set_index('approved_symbol')
     # assume each row has approved symbol
@@ -167,8 +172,10 @@ def main(ensembl_biomart_geneids_transcript_info,
 
     # create hgnc_symbol to gene id mapping
     # ignore hugo symbols from ensembl data dump (includes prev symbols and synonyms)
-    synonyms = hgnc_df.synonyms.str.strip('"').str.split("|").dropna()
-    synonyms = set(itertools.chain.from_iterable(synonyms))
+    syns = hgnc_df.synonyms.str.strip('"').str.split("|").dropna()
+    syns = set(itertools.chain.from_iterable(syns))
+    previous_symbols = hgnc_df.previous_symbols.str.strip('"').str.split("|").dropna()
+    previous_symbols = set(itertools.chain.from_iterable(previous_symbols))
 
     # there is overlap between symbols, synonyms and previous symbols
     # therefore use logic in above order when querying
@@ -179,8 +186,8 @@ def main(ensembl_biomart_geneids_transcript_info,
     # all cancer genes and hugo symbols in ensembl data dump should be
     # contained in hgnc approved symbols, synonyms and previous symbols
     # c12orf9 is only in sanger's cancer gene census and has been withdrawn
-    assert(len(lowercase_set(set(cgs)) - set(['c12orf9']) - lowercase_set(set(hugos).union(synonyms))) == 0)
-    no_symbols_in_hgnc = lowercase_set(transcript_info_df.hgnc_symbol.dropna().unique()) - lowercase_set(set(hugos).union(synonyms))
+    assert(len(lowercase_set(set(cgs)) - set(['c12orf9']) - lowercase_set(set(hugos).union(syns).union(previous_symbols))) == 0)
+    no_symbols_in_hgnc = lowercase_set(transcript_info_df.hgnc_symbol.dropna().unique()) - lowercase_set(set(hugos).union(syns).union(previous_symbols))
     new_genes = ignore_certain_genes(ignore_rna_gene(no_symbols_in_hgnc),ignored_genes_file_name)
     if len(new_genes) != 0:
         print('------ New genes need to be added into ignored_genes.txt ------\n' +