Skip to content

Commit

Permalink
Merge pull request #89 from leexgh/hgnc-2023-10
Browse files Browse the repository at this point in the history
Update HGNC with prev_symbols
  • Loading branch information
leexgh authored Jan 9, 2024
2 parents e26b4b0 + d13f697 commit 1a12a21
Show file tree
Hide file tree
Showing 10 changed files with 156,858 additions and 150,274 deletions.
85,929 changes: 43,737 additions & 42,192 deletions data/common_input/hgnc_complete_set_2023-10.txt

Large diffs are not rendered by default.

85,929 changes: 43,737 additions & 42,192 deletions data/grch37_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

Binary file not shown.
44,229 changes: 22,372 additions & 21,857 deletions data/grch38_ensembl92/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

Binary file modified data/grch38_ensembl92/export/ensembl_biomart_transcripts.json.gz
Binary file not shown.
5,093 changes: 3,260 additions & 1,833 deletions data/grch38_ensembl92/export/oncokb_cancer_genes_list_from_API.json

Large diffs are not rendered by default.

85,929 changes: 43,737 additions & 42,192 deletions data/grch38_ensembl95/export/ensembl_biomart_canonical_transcripts_per_hgnc.txt

Large diffs are not rendered by default.

Binary file modified data/grch38_ensembl95/export/ensembl_biomart_transcripts.json.gz
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def get_hgnc_symbol(transcript_id, hgnc_dict):

hgnc_dict = dict()
for index, row in hgnc_df.iterrows():
for symbol in row['synonyms'].split('|'):
for symbol in row['prev_symbol'].split('|'):
hgnc_dict[symbol] = index

hgnc_symbol_list = transcripts.index.drop_duplicates().map(lambda transcript_id: get_hgnc_symbol(transcript_id, hgnc_dict))
Expand Down Expand Up @@ -213,7 +213,7 @@ def main(ensembl_biomart_transcripts,
transcripts = add_ccds(transcripts, ccds, isoform_overrides_uniprot, isoform_overrides_mskcc)

# Add nested HGNC, exons and PFAM domains
hgnc_df = pd.read_csv(hgnc_symbol_set, sep='\t', usecols = ['symbol', 'synonyms'], index_col=0).dropna()
hgnc_df = pd.read_csv(hgnc_symbol_set, sep='\t', usecols = ['symbol', 'prev_symbol'], index_col=0).dropna()
transcripts = add_nested_hgnc(transcripts, hgnc_df)
transcripts = add_nested_transcript_info(transcripts, transcript_info)
transcripts = add_nested_pfam_domains(transcripts, pfam_domains)
Expand Down
19 changes: 13 additions & 6 deletions scripts/make_one_canonical_transcript_per_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,18 @@ def main(ensembl_biomart_geneids_transcript_info,
# Convert new column names to old stable column names. If this is not done properly, Genome Nexus and any other
# downstream applications break
# TODO: Update Genome Nexus to accept the latest HGNC column names so that remapping is not necessary.
column_name_mapping = {
column_name_mapping = {'name': 'approved_name',
'symbol': 'approved_symbol',
'synonyms': 'synonyms',
'prev_symbol': 'previous_symbols',
'alias_symbol': 'synonyms',
'location': 'chromosome',
'entrez_id': 'entrez_gene_id',
'ena': 'accession_numbers',
'refseq_accession': 'refseq_ids',
'uniprot_ids': 'uniprot_id',
'ensembl_id': 'ensembl_gene_id'}
hgnc_df.rename(columns=column_name_mapping, inplace=True)
hgnc_df = hgnc_df[hgnc_df['approved_name'] != 'entry withdrawn'].copy()
hugos = hgnc_df['approved_symbol'].unique()
hgnc_df = hgnc_df.set_index('approved_symbol')
# assume each row has approved symbol
Expand All @@ -167,8 +172,10 @@ def main(ensembl_biomart_geneids_transcript_info,

# create hgnc_symbol to gene id mapping
# ignore hugo symbols from ensembl data dump (includes prev symbols and synonyms)
synonyms = hgnc_df.synonyms.str.strip('"').str.split("|").dropna()
synonyms = set(itertools.chain.from_iterable(synonyms))
syns = hgnc_df.synonyms.str.strip('"').str.split("|").dropna()
syns = set(itertools.chain.from_iterable(syns))
previous_symbols = hgnc_df.previous_symbols.str.strip('"').str.split("|").dropna()
previous_symbols = set(itertools.chain.from_iterable(previous_symbols))

# there is overlap between symbols, synonyms and previous symbols
# therefore use logic in above order when querying
Expand All @@ -179,8 +186,8 @@ def main(ensembl_biomart_geneids_transcript_info,
# all cancer genes and hugo symbols in ensembl data dump should be
# contained in hgnc approved symbols, synonyms and previous symbols
# c12orf9 is only in sanger's cancer gene census and has been withdrawn
assert(len(lowercase_set(set(cgs)) - set(['c12orf9']) - lowercase_set(set(hugos).union(synonyms))) == 0)
no_symbols_in_hgnc = lowercase_set(transcript_info_df.hgnc_symbol.dropna().unique()) - lowercase_set(set(hugos).union(synonyms))
assert(len(lowercase_set(set(cgs)) - set(['c12orf9']) - lowercase_set(set(hugos).union(syns).union(previous_symbols))) == 0)
no_symbols_in_hgnc = lowercase_set(transcript_info_df.hgnc_symbol.dropna().unique()) - lowercase_set(set(hugos).union(syns).union(previous_symbols))
new_genes = ignore_certain_genes(ignore_rna_gene(no_symbols_in_hgnc),ignored_genes_file_name)
if len(new_genes) != 0:
print('------ New genes need to be added into ignored_genes.txt ------\n' +
Expand Down

0 comments on commit 1a12a21

Please sign in to comment.