Skip to content

Commit

Permalink
added core-187 demo script; fixed minor issue in ensembl-fetch
Browse files Browse the repository at this point in the history
  • Loading branch information
reece committed Aug 28, 2015
1 parent 1bfe67f commit 94bed76
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 4 deletions.
4 changes: 2 additions & 2 deletions loading/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,7 @@ dump-%: dumps/%.pgd.gz dumps/%.pgd.gz.sha1;
#=> dumps/%.pgd.gz -- create dump of named schema (e.g., uta_20140210)
.PRECIOUS: dumps/%.pgd.gz
dumps/%.pgd.gz:
# Takes ~5 minutes
# expect ~5 minutes
(time pg_dump -U uta_admin -h localhost -d uta_dev -n $* | gzip) >$@.tmp 2>$@.log
mv "$@.tmp" "$@"

Expand All @@ -401,7 +401,7 @@ push-dev-%: logs/uta.biocommons.org/uta_dev/load-%.log;

.PRECIOUS: logs/uta.biocommons.org/uta_dev/load-%.log
logs/uta.biocommons.org/uta_dev/load-%.log: dumps/%.pgd.gz
# Takes ~5 minutes
# expect ~1.5h
@mkdir -pv ${@D}
(gzip -cdq $< | time psql -h uta.biocommons.org -U uta_admin -d uta_dev -aeE) >$@.tmp 2>&1

Expand Down
6 changes: 4 additions & 2 deletions sbin/ensembl-fetch
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ sub fetch_Gene_by_name($$);
my $root = "$FindBin::RealBin/..";

my $opts = {
'primary-only' => 0,
'npartitions' => 1000,
'divisor' => undef,
'modulus' => undef,
Expand Down Expand Up @@ -178,6 +179,7 @@ GetOptions($opts,
'port|p=s',
'prefix=s',
'user|u=s',
'primary-only+',
)
|| die("$0: you got usage issues, homey\n");

Expand Down Expand Up @@ -264,7 +266,7 @@ sub process_genes($@) {
my @subgenes = @genes[$s..$e];
my $msg = sprintf("subset: $pfx: %d/%d (%.1f%%) [%d, %d] = [%s,%s]...",
($i+1), $npart, ($i+1)/$npart*100, $s, $e,
'a', 'b'); # $subgenes[0]->external_name(), $subgenes[$#subgenes]->external_name());
$subgenes[0]->external_name(), $subgenes[$#subgenes]->external_name());
if (-d $pfx) {
$logger->info($msg . "$pfx already exists; skipping");
} else {
Expand Down Expand Up @@ -350,7 +352,7 @@ sub process1($$$$) {
next;
}
my $ac = $name_to_ac{$srn};
if ($ac !~ m/^NC_/) {
if ($opts->{primary_only} and $ac !~ m/^NC_/) {
$logger->warn(sprintf("gene %s, tx %s (%s): on non-chromosomal sequence %s (%s); skipping",
$hgnc, $tx->display_id(), $g->biotype(), $ac, $srn));
next;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python

"""CORE-187 suggested that we were missing a transcript,
ENST00000576892, for FAM58A. This script demonstrates the cause: that
transcript is on a patch region.
"""


import logging
import sys

import requests

logger = logging.getLogger(__name__)

base_uri = "http://grch37.rest.ensembl.org"

def _fetch_json(path, timeout=60):
    """Fetch `path` from the server at `base_uri` and return the decoded JSON.

    Arguments:
    path -- request path, appended verbatim to the module-level `base_uri`
    timeout -- seconds to wait for the server before giving up; handed
               straight to `requests.get`. Without a timeout a stalled
               server would block this script forever.

    Raises whatever `requests` raises: an HTTP error from
    `raise_for_status()` on an unsuccessful status code, or a timeout
    error when the server does not answer within `timeout` seconds.
    """
    uri = base_uri + path
    r = requests.get(uri,
                     headers={"Content-Type": "application/json"},
                     timeout=timeout)
    # Fail loudly on an error status before trying to decode the body.
    r.raise_for_status()
    logger.info('fetched ' + uri)
    return r.json()

def fetch_gene(hgnc):
    """Return the decoded JSON of the homo_sapiens symbol xref query for `hgnc`.

    `hgnc` is appended as-is to the xrefs/symbol path; the request itself
    is delegated to `_fetch_json`.
    """
    xref_path = "/xrefs/symbol/homo_sapiens/" + hgnc
    return _fetch_json(xref_path)

def lookup(id):
    """Return the decoded JSON of the lookup/id query for `id`.

    The query is issued with expand=1; the request itself is delegated
    to `_fetch_json`.
    """
    path_template = "/lookup/id/{id}?expand=1"
    return _fetch_json(path_template.format(id=id))


if __name__ == "__main__":
    # Demo driver: list every transcript of each FAM58A gene record together
    # with the sequence region it lies on, so transcripts that are not on the
    # gene's primary region stand out.
    logging.basicConfig(level=logging.INFO)

    # Keep only the xref hits whose id starts with "ENSG".
    ensgs = [r['id'] for r in fetch_gene('FAM58A') if r['id'].startswith("ENSG")]

    for ensg in ensgs:
        gene = lookup(ensg)
        txs = gene['Transcript']
        print("* {l[display_name]} {l[version]} {l[seq_region_name]} {l[object_type]}; {n} transcripts ".format(l=gene, n=len(txs)))
        for tx in txs:
            # Count the transcript's exons. len(tx) would count the keys of
            # the transcript record itself, not its exons. The expanded
            # lookup is expected to list exons under 'Exon'; fall back to 0
            # if the key is absent.
            n_exons = len(tx.get('Exon', []))
            print(" {tx[display_name]} {tx[id]} {tx[version]} {tx[seq_region_name]}; {n} exons".format(tx=tx, n=n_exons))

0 comments on commit 94bed76

Please sign in to comment.