Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature simbad query tap #2856

Merged
merged 23 commits into from
Jan 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
eb79ed7
FEAT: Add a query_tap method to SimbadClass
ManonMarchand Oct 11, 2023
e6580a8
docs: reorganize sections in simbad docs
ManonMarchand Oct 12, 2023
ab7c7d2
docs: add query TAP section
ManonMarchand Oct 12, 2023
f003c5d
feat: add helper functions to explore SIMBAD tables
ManonMarchand Oct 19, 2023
fca6d15
docs: add documentation for query_tap and its helper methods
ManonMarchand Oct 19, 2023
14fe47d
test: add tests for _adql_parameter and _adql_name
ManonMarchand Oct 19, 2023
9076760
docs: add changelog entry
ManonMarchand Oct 19, 2023
6d31b1f
test: add try/except on pyvo version for DALOverflowWarning
ManonMarchand Oct 20, 2023
564517a
fix: exchange order of BaseVOQuery and SimbadBaseQuery
ManonMarchand Nov 8, 2023
96777c1
refactor: rename tables and columns into list_tables and list_columns
ManonMarchand Nov 8, 2023
67aab66
style: make optional kwargs kwarg only
ManonMarchand Nov 9, 2023
cc3aeab
docs: remove links to issues, rephrase adql example
ManonMarchand Nov 9, 2023
21ccd2b
docs: remove doctest ignore on first TAP example
ManonMarchand Nov 9, 2023
744262f
feat: add caching to query_tap and hardlimit
ManonMarchand Nov 10, 2023
821aec7
docs: add an example of Simbad.query_tap with an uploaded table
ManonMarchand Nov 10, 2023
8ff6d73
fix: remove use of functools.cache_property for python 3.7 support
ManonMarchand Nov 10, 2023
4983ab6
fix: switch simbad.mirror list into simbad.conf.server_list
ManonMarchand Nov 13, 2023
79d4010
docs: fix english issues
ManonMarchand Nov 22, 2023
56c8e1d
docs: increase font size in simbad graph
ManonMarchand Nov 22, 2023
b99ad4c
fix: updated datatype in doctest outputs
ManonMarchand Jan 26, 2024
d2b2bc4
edit intersphinx links
ManonMarchand Jan 26, 2024
1304824
fix: simplify IN statement
ManonMarchand Jan 26, 2024
ccf93cb
fix: remove unused adql_name function
ManonMarchand Jan 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ esa.hubble
- New methods to download single files ``download_file`` and download FITS associated to an observation ``download_fits_files``. [#2797]
- New function to retrieve all the files associated to an observation. [#2797]

simbad
^^^^^^

- new ``query_tap`` method to access SIMBAD. This comes with additional methods to explore SIMBAD's tables and
their links: ``Simbad.list_tables``, ``Simbad.list_columns``, and ``Simbad.list_linked_tables``. [#2856]

solarsystem.neodys
^^^^^^^^^^^^^^^^^^

Expand Down
5 changes: 4 additions & 1 deletion astroquery/simbad/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ class Conf(_config.ConfigNamespace):
"""
Configuration parameters for `astroquery.simbad`.
"""
# the first item is the default configuration
servers_list = ['simbad.cds.unistra.fr', 'simbad.harvard.edu']

server = _config.ConfigItem(
['simbad.cds.unistra.fr', 'simbad.harvard.edu'],
servers_list,
'Name of the SIMBAD mirror to use.')

timeout = _config.ConfigItem(
Expand Down
320 changes: 318 additions & 2 deletions astroquery/simbad/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
from collections import namedtuple
from io import BytesIO
from functools import lru_cache
import warnings
import astropy.units as u
from astropy.utils import isiterable
Expand All @@ -17,9 +18,11 @@
from astropy.table import Table
import astropy.io.votable as votable

from astroquery.query import BaseQuery
from astroquery.query import BaseQuery, BaseVOQuery
from astroquery.utils import commons, async_to_sync
from astroquery.exceptions import TableParseError, LargeQueryWarning, BlankResponseWarning

from pyvo.dal import TAPService
from . import conf


Expand Down Expand Up @@ -90,6 +93,24 @@ def strip_field(field, keep_filters=False):
return field


def _adql_parameter(entry: str):
    """Double every single quote found in a string.

    Meant for values that are inserted between single quotes in an ADQL
    query, so that names or titles containing apostrophes can be searched.
    This is only quote escaping: it is not a protection against SQL injection.

    Parameters
    ----------
    entry : str
        The text to escape.

    Returns
    -------
    str
        ``entry`` where each ``'`` has become ``''``.
    """
    # cutting the text at each quote and gluing the pieces back together with
    # a doubled quote is the same as replacing every quote by two quotes
    pieces = entry.split("'")
    return "''".join(pieces)


error_regex = re.compile(r'(?ms)\[(?P<line>\d+)\]\s?(?P<msg>.+?)(\[|\Z)')
SimbadError = namedtuple('SimbadError', ('line', 'msg'))
VersionInfo = namedtuple('VersionInfo', ('major', 'minor', 'micro', 'patch'))
Expand Down Expand Up @@ -260,7 +281,7 @@ def _request(self, *args, **kwargs):


@async_to_sync
class SimbadClass(SimbadBaseQuery):
class SimbadClass(BaseVOQuery, SimbadBaseQuery):
"""
The class for querying the Simbad web service.

Expand Down Expand Up @@ -300,6 +321,50 @@ class SimbadClass(SimbadBaseQuery):
def __init__(self):
    # let the parent query classes initialise their own state first
    super().__init__()
    # no TAP service yet: the ``tap`` property builds it on first access
    self._tap = None
    # start on the mirror selected in the configuration
    self._server = conf.server
    # work on a copy so that later edits do not alter the object it came from
    self._VOTABLE_FIELDS = self._VOTABLE_FIELDS.copy()

@property
def server(self):
    """Name of the Simbad mirror currently in use."""
    return self._server

@server.setter
def server(self, server: str):
    """Select the Simbad mirror used by this instance.

    Parameters
    ----------
    server : str
        It should be one of `~astroquery.simbad.conf.servers_list`.

    Raises
    ------
    ValueError
        If ``server`` is not listed in ``conf.servers_list``.
    """
    # guard clause: anything that is not a known mirror is rejected
    if server not in conf.servers_list:
        raise ValueError(f"'{server}' does not correspond to a Simbad server, "
                         f"the two existing ones are {conf.servers_list}.")
    self._server = server

@property
def tap(self):
    """A `~pyvo.dal.TAPService` service for Simbad."""
    expected_url = f"https://{self.server}/simbad/sim-tap"
    # reuse the stored service as long as it still targets the selected mirror
    if self._tap and self._tap.baseurl == expected_url:
        return self._tap
    # first access, or the server property changed since the last getter call
    self._tap = TAPService(baseurl=expected_url, session=self._session)
    return self._tap

@property
@lru_cache(1)
def hardlimit(self):
"""The maximum number of lines for Simbad's output.

This property is cached to avoid calls to simbad's capability
webpage each time the getter is called.
"""
# replace stack of property and lru_cache by functools.cache_property when
# astroquery drops python 3.7 support
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh, reminder that we should indeed drop support.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I keep the comment to help when the support will be dropped, or is it noise?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do keep comments like these. Whenever we drop support, I usually go around and grep for things like 3.7, etc., so I will be more likely to find this for cleanup because of the comment.

return self.tap.hardlimit

def list_wildcards(self):
"""
Expand Down Expand Up @@ -936,6 +1001,257 @@ def query_objectids_async(self, object_name, *, cache=True,

return response

def list_tables(self, *, get_adql=False):
    """List the name and description of the tables of SIMBAD's public schema.

    Parameters
    ----------
    get_adql : bool, optional
        When True, the ADQL string is returned and SIMBAD is not queried.

    Returns
    -------
    `~astropy.table.Table` or str
        The result of the query, or the ADQL string if ``get_adql`` is True.
    """
    adql = " ".join(("SELECT table_name, description",
                     "FROM TAP_SCHEMA.tables",
                     "WHERE schema_name = 'public'"))
    if not get_adql:
        return self.query_tap(adql)
    return adql

def list_columns(self, *tables: str, keyword=None, get_adql=False):
"""
Get the list of SIMBAD columns.

Add tables names to restrict to some tables. Call the function without
any parameter to get all columns names from all tables. The keyword argument
looks for columns in the selected Simbad tables that contain the
given keyword. The keyword search is not case-sensitive.

Parameters
----------
*tables : str, optional
Add tables names as strings to restrict to these tables columns.
keyword : str, optional
A keyword to look for in column names, table names, or descriptions.
get_adql : bool, optional
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we may need to call this get_query_payload to be consistent with the rest of the astroquery API, though your name is more precise and makes sense.

What do you think @keflavich, is it time to cleanup what get_query_payload does and return the sql/adql where it makes more sense than returning the GET/POST parameters?

Copy link
Member Author

@ManonMarchand ManonMarchand Jan 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for bringing this up, I forgot to share my concerns about this argument.

I was also thinking about a get_raw_votable (or get_votable) option. The use cases I see are:

But I did not want to fall into the boolean trap, because these two arguments (get_adql/get_query_payload and get_votable) would be mutually exclusive.

Then my idea was to maybe deviate from the API in place and introduce an output_format argument that'd accept ['astropy_table', 'adql_string', 'astropy_votable', 'raw_votable'] and would default to astropy_table. This could be extended too.

What are your thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was also thinking about a get_raw_votable (or get_votable) option.

Yes, those would be great and I vaguely recall that it came up multiple times in the past and there should be issues for it already either here or in pyvo.

Then my idea was to maybe deviate from the API in place and introduce an output_format argument that'd accept ['astropy_table', 'adql_string', 'astropy_votable', 'raw_votable'] and would default to astropy_table. This could be extended too.
What are your thoughts?

Let's factor out this discussion into a separate ticket and pull other interested parties into the discussion (I think Adrian will definitely be interested, maybe the ESA folks, too, even though they don't use pyvo (yet)). I might use a different arg name that makes it clearer that it's a debug option, and may not support all 4 options you listed, but overall it looks useful to have more tools for easy troubleshooting purposes.

Returns the ADQL string instead of querying SIMBAD.

Examples
--------
>>> from astroquery.simbad import Simbad
>>> Simbad.list_columns("ids", "ident") # doctest: +REMOTE_DATA
<Table length=4>
table_name column_name datatype ... unit ucd
object object object ... object object
---------- ----------- -------- ... ------ -------
ident id VARCHAR ... meta.id
ident oidref BIGINT ...
ids ids VARCHAR ... meta.id
ids oidref BIGINT ...


>>> from astroquery.simbad import Simbad
>>> Simbad.list_columns(keyword="filter") # doctest: +REMOTE_DATA
<Table length=5>
table_name column_name datatype ... unit ucd
object object object ... object object
----------- ----------- ----------- ... ------ ----------------------
filter description UNICODECHAR ... meta.note;instr.filter
filter filtername VARCHAR ... instr.filter
filter unit VARCHAR ... meta.unit
flux filter VARCHAR ... instr.filter
mesDiameter filter CHAR ... instr.filter

>>> from astroquery.simbad import Simbad
>>> Simbad.list_columns("basic", keyword="object") # doctest: +REMOTE_DATA
<Table length=4>
table_name column_name datatype ... unit ucd
object object object ... object object
---------- ----------- -------- ... ------ -------------------
basic main_id VARCHAR ... meta.id;meta.main
basic otype_txt VARCHAR ... src.class
basic oid BIGINT ... meta.record;meta.id
basic otype VARCHAR ... src.class
"""
query = ("SELECT table_name, column_name, datatype, description, unit, ucd"
" FROM TAP_SCHEMA.columns"
" WHERE table_name NOT LIKE 'TAP_SCHEMA.%'")
# select the tables
if len(tables) == 1:
query += f" AND table_name = '{tables[0]}'"
elif len(tables) > 1:
query += f" AND table_name IN {tables}"
# add the keyword condition
if keyword is not None:
condition = f"LIKE LOWERCASE('%{_adql_parameter(keyword)}%')"
query += (f" AND ( (LOWERCASE(column_name) {condition})"
f" OR (LOWERCASE(description) {condition})"
f" OR (LOWERCASE(table_name) {condition}))")
query += " ORDER BY table_name, principal DESC, column_name"
if get_adql:
return query
return self.query_tap(query)

def list_linked_tables(self, table: str, *, get_adql=False):
    """
    List the non-obvious joins that are possible with the given table.

    Only the links where the column names differ between the two tables are
    listed. For example every ``oidref`` column of any table can be joined with
    any other ``oidref``, and the same goes for every ``otype`` column, even if
    these links are not returned by this method.

    Parameters
    ----------
    table : str
        The name of one of SIMBAD's tables.
    get_adql : bool, optional
        When True, the ADQL string is returned and SIMBAD is not queried.

    Returns
    -------
    `~astropy.table.Table`
        The information necessary to join the given table to another one.
        If ``get_adql`` is True, the ADQL string is returned instead.

    Examples
    --------
    >>> from astroquery.simbad import Simbad
    >>> Simbad.list_linked_tables("otypes") # doctest: +REMOTE_DATA
    <Table length=2>
    from_table from_column target_table target_column
    object object object object
    ---------- ----------- ------------ -------------
    otypedef otype otypes otype
    otypes oidref basic oid
    """
    # the table name appears twice in the query: escape its quotes only once
    escaped = _adql_parameter(table)
    adql = ("SELECT from_table, from_column, target_table, target_column"
            " FROM TAP_SCHEMA.key_columns JOIN TAP_SCHEMA.keys USING (key_id)"
            f" WHERE (from_table = '{escaped}')"
            f" OR (target_table = '{escaped}')")
    if not get_adql:
        return self.query_tap(adql)
    return adql

@lru_cache(256)
def _cached_query_tap(self, query: str, *, maxrec=10000):
    """Run an ADQL query through the TAP service and memoize the result.

    ``query_tap`` delegates to this private method when it is called without
    any ``uploads`` keyword argument. The split exists because
    `~astropy.table.Table` objects are not hashable, so they cannot be part
    of the arguments of a function decorated with lru_cache.

    Parameters
    ----------
    query : str
        A string containing the query written in the
        Astronomical Data Query Language (ADQL).
    maxrec : int, optional
        The number of records to be returned. Its maximum value is 2000000.

    Returns
    -------
    `~astropy.table.Table`
        The response returned by Simbad.
    """
    service = self.tap
    results = service.run_async(query, maxrec=maxrec)
    return results.to_table()

def query_tap(self, query: str, *, maxrec=10000, **uploads):
    """
    Query Simbad TAP service.

    Parameters
    ----------
    query : str
        A string containing the query written in the
        Astronomical Data Query Language (ADQL).
    maxrec : int, default: 10000
        The number of records to be returned. Its maximum value is given by
        `~astroquery.simbad.SimbadClass.hardlimit`.
    uploads : `~astropy.table.Table` | `~astropy.io.votable.tree.VOTableFile` | `~pyvo.dal.DALResults`
        Any number of local tables to be used in the *query*. In the *query*, these tables
        are referred as *TAP_UPLOAD.table_alias* where *TAP_UPLOAD* is imposed and *table_alias*
        is the keyword name you chose. The maximum number of lines for the uploaded tables is 200000.

    Returns
    -------
    `~astropy.table.Table`
        The response returned by Simbad.

    Raises
    ------
    ValueError
        If ``maxrec`` is larger than the ``hardlimit`` of this instance, or if
        ``query`` contains an odd number of single quotes.

    Notes
    -----
    A TAP (Table Access Protocol) service allows querying data tables with
    queries written in ADQL (Astronomical Data Query Language), a flavor
    of the more general SQL (Structured Query Language).
    For more documentation about writing ADQL queries, you can read its official
    documentation (`ADQL documentation <https://ivoa.net/documents/ADQL/index.html>`__)
    or the `Simbad ADQL cheat sheet <http://simbad.cds.unistra.fr/simbad/tap/help/adqlHelp.html>`__.
    See also: a `graphic representation of Simbad's tables and their relations
    <http://simbad.cds.unistra.fr/simbad/tap/tapsearch.html>`__.

    Queries without uploaded tables go through a cached private method, so
    repeating the same query with the same ``maxrec`` does not contact the
    service again.

    See also
    --------
    list_tables : The list of SIMBAD's tables.
    list_columns : SIMBAD's columns list, can be restricted to some tables and some keyword.
    list_linked_tables : Given a table, expose non-obvious possible joins with other tables.

    Examples
    --------

    To see the five oldest papers referenced in Simbad

    >>> from astroquery.simbad import Simbad
    >>> Simbad.query_tap("SELECT top 5 bibcode, title "
    ...                  "FROM ref ORDER BY bibcode") # doctest: +REMOTE_DATA
    <Table length=5>
    bibcode ...
    object ...
    ------------------- ...
    1850CDT..1784..227M ...
    1857AN.....45...89S ...
    1861MNRAS..21...68B ...
    1874MNRAS..34...75S ...
    1877AN.....89...13W ...

    Get the type for a list of objects

    >>> from astroquery.simbad import Simbad
    >>> Simbad.query_tap("SELECT main_id, otype"
    ...                  " FROM basic WHERE main_id IN ('m10', 'm13')") # doctest: +REMOTE_DATA
    <Table length=2>
    main_id otype
    object object
    ------- ------
    M 10 GlC
    M 13 GlC

    Upload a table to use in a query

    >>> from astroquery.simbad import Simbad
    >>> from astropy.table import Table
    >>> letters_table = Table([["a", "b", "c"]], names=["alphabet"])
    >>> Simbad.query_tap("SELECT TAP_UPLOAD.my_table_name.* from TAP_UPLOAD.my_table_name",
    ...                  my_table_name=letters_table) # doctest: +REMOTE_DATA
    <Table length=3>
    alphabet
    object
    --------
    a
    b
    c
    """
    # Validate against the limit of the instance in use (and thus of the
    # mirror it points to) rather than against the module-level ``Simbad``
    # singleton, which may be configured differently.
    hardlimit = self.hardlimit
    if maxrec > hardlimit:
        raise ValueError(f"The maximum number of records cannot exceed {hardlimit}.")
    # an odd count means that a string literal is left open in the query
    if query.count("'") % 2:
        raise ValueError("Query string contains an odd number of single quotes."
                         " Escape the unpaired single quote by doubling it.\n"
                         "ex: 'Barnard's galaxy' -> 'Barnard''s galaxy'.")
    # uploaded tables are not hashable: only upload-free queries can be cached
    if not uploads:
        return self._cached_query_tap(query, maxrec=maxrec)
    return self.tap.run_async(query, maxrec=maxrec, uploads=uploads).to_table()

def _get_query_header(self, get_raw=False):
# if get_raw is set then don't fetch as votable
if get_raw:
Expand Down
Loading
Loading