diff --git a/CHANGELOG.md b/CHANGELOG.md
index e148d2f3..b7091244 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,15 @@ Formatted as described on [https://keepachangelog.com](https://keepachangelog.co
- `ewatercycle.esmvaltool.search.search_esgf` can now be used to find climate model ensembles on ESGF that have the required input variables for generating forcing data ([#422](https://github.com/eWaterCycle/ewatercycle/pull/422)).
- `ewatercycle.observation.caravan.get_caravan_data()` ([#432](https://github.com/eWaterCycle/ewatercycle/issues/432))
+### Fixed
+
+- `get_usgs_data()` throws error ([#414](https://github.com/eWaterCycle/ewatercycle/issues/414))
+- `get_usgs_data()` and 1get_grdc_data()` both return xarray.Dataset ([#253](https://github.com/eWaterCycle/ewatercycle/issues/253))
+
+### Removed
+
+- Caching mechanism from `get_usgs_data()` ([#240](https://github.com/eWaterCycle/ewatercycle/issues/240))
+
## [2.1.1] (2024-06-03)
### Added
diff --git a/README.md b/README.md
index df3ed8f5..d286c86a 100644
--- a/README.md
+++ b/README.md
@@ -93,12 +93,12 @@ cfg_file, cfg_dir = model.setup(
model.initialize(cfg_file)
-observations_df, station_info = ewatercycle.observation.grdc.get_grdc_data(
+observations_df = ewatercycle.observation.grdc.get_grdc_data(
station_id=4147380,
start_time=model.start_time_as_isostr,
end_time=model.end_time_as_isostr,
column='observation',
-)
+).observation.to_dataframe()
simulated_discharge = []
timestamps = []
diff --git a/docs/observations.rst b/docs/observations.rst
index 430fb555..6177d315 100644
--- a/docs/observations.rst
+++ b/docs/observations.rst
@@ -6,7 +6,7 @@ The eWaterCycle platform supports observations relevant for calibrating and vali
USGS
----
-The `U.S. Geological Survey Water Services `_ provides public discharge data for a large number of US based stations. In eWaterCycle we make use of the `USGS web service `_ to automatically retrieve this data.
+The `U.S. Geological Survey Water Services `_ provides public discharge data for a large number of US-based stations. In eWaterCycle (:py:func:`ewatercycle.observation.usgs.get_usgs_data`) we make use of the `USGS web service `_ to automatically retrieve this data.
The Discharge timestamp is corrected to the UTC timezone. Units are converted from cubic feet per second to cubic meter per second.
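+
+For example, to retrieve discharge observations for a single station:
+
+.. code-block:: python
+
+    from ewatercycle.observation.usgs import get_usgs_data
+
+    ds = get_usgs_data(
+        station_id="03109500",
+        start_time="2000-01-01T00:00:00Z",
+        end_time="2000-12-31T00:00:00Z",
+    )
+    ds.streamflow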
GRDC
diff --git a/docs/user_guide/03_models_obs_analysis.ipynb b/docs/user_guide/03_models_obs_analysis.ipynb
index d0a5472d..38ac64e6 100644
--- a/docs/user_guide/03_models_obs_analysis.ipynb
+++ b/docs/user_guide/03_models_obs_analysis.ipynb
@@ -605,14 +605,14 @@
"source": [
"grdc_station_id = \"6335020\"\n",
"\n",
- "observations, metadata = ewatercycle.observation.grdc.get_grdc_data(\n",
+ "observations = ewatercycle.observation.grdc.get_grdc_data(\n",
" station_id=grdc_station_id,\n",
" start_time=\"1990-01-01T00:00:00Z\", # or: model_instance.start_time_as_isostr\n",
" end_time=\"1990-12-15T00:00:00Z\",\n",
" column=\"GRDC\",\n",
")\n",
"\n",
- "observations.head()"
+ "observations.GRDC.to_dataframe().head()"
]
},
{
@@ -639,7 +639,7 @@
}
],
"source": [
- "print(metadata)"
+ "print(observations.attrs)"
]
},
{
diff --git a/src/ewatercycle/observation/grdc.py b/src/ewatercycle/observation/grdc.py
index 534a19e0..a649f923 100644
--- a/src/ewatercycle/observation/grdc.py
+++ b/src/ewatercycle/observation/grdc.py
@@ -1,9 +1,12 @@
"""Global Runoff Data Centre module."""
+
import logging
import os
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Union
import pandas as pd
+import xarray as xr
+from numpy import nan
from ewatercycle import CFG
from ewatercycle.util import get_time, to_absolute_path
@@ -17,15 +20,18 @@ def get_grdc_data(
station_id: str,
start_time: str,
end_time: str,
- parameter: str = "Q",
data_home: Optional[str] = None,
column: str = "streamflow",
-) -> Tuple[pd.core.frame.DataFrame, MetaDataType]:
+) -> xr.Dataset:
"""Get river discharge data from Global Runoff Data Centre (GRDC).
Requires the GRDC daily data files in a local directory. The GRDC daily data
- files can be ordered at
- https://www.bafg.de/GRDC/EN/02_srvcs/21_tmsrs/riverdischarge_node.html
+ NetCDF file can be downloaded at
+ https://www.bafg.de/GRDC/EN/02_srvcs/21_tmsrs/riverdischarge_node.html .
+ The downloaded zip file contains a file named GRDC-Daily.nc.
+
+    This function first tries to read the data from the GRDC-Daily.nc file in the ``data_home`` directory.
+    If that fails, it looks for a GRDC Export (ASCII text) formatted file, for example ``6435060_Q_Day.Cmd.txt``.
Args:
station_id: The station id to get. The station id can be found in the
@@ -35,55 +41,50 @@ def get_grdc_data(
'YYYY-MM-DDTHH:MM:SSZ'.
end_time: End time of model in UTC and ISO format string e.g.
'YYYY-MM-DDTHH:MM:SSZ'.
- parameter: optional. The parameter code to get, e.g. ('Q') discharge,
- cubic meters per second.
data_home : optional. The directory where the daily grdc data is
located. If left out will use the grdc_location in the eWaterCycle
configuration file.
column: optional. Name of column in dataframe. Default: "streamflow".
Returns:
- grdc data in a dataframe and metadata.
+        GRDC data in an xarray dataset, shaped like a filtered version of the GRDC daily NetCDF file.
+
+ Raises:
+        ValueError: If no data could be found for the requested station id and period.
Examples:
+
.. code-block:: python
from ewatercycle.observation.grdc import get_grdc_data
- df, meta = get_grdc_data('6335020',
- '2000-01-01T00:00Z',
- '2001-01-01T00:00Z')
- df.describe()
- streamflow
- count 4382.000000
- mean 2328.992469
- std 1190.181058
- min 881.000000
- 25% 1550.000000
- 50% 2000.000000
- 75% 2730.000000
- max 11300.000000
-
- meta
- {'grdc_file_name': '/home/myusername/git/eWaterCycle/ewatercycle/6335020_Q_Day.Cmd.txt',
- 'id_from_grdc': 6335020,
- 'file_generation_date': '2019-03-27',
- 'river_name': 'RHINE RIVER',
- 'station_name': 'REES',
- 'country_code': 'DE',
- 'grdc_latitude_in_arc_degree': 51.756918,
- 'grdc_longitude_in_arc_degree': 6.395395,
- 'grdc_catchment_area_in_km2': 159300.0,
- 'altitude_masl': 8.0,
- 'dataSetContent': 'MEAN DAILY DISCHARGE (Q)',
- 'units': 'm³/s',
- 'time_series': '1814-11 - 2016-12',
- 'no_of_years': 203,
- 'last_update': '2018-05-24',
- 'nrMeasurements': 'NA',
- 'UserStartTime': '2000-01-01T00:00Z',
- 'UserEndTime': '2001-01-01T00:00Z',
- 'nrMissingData': 0}
+ ds = get_grdc_data('6435060',
+ '2000-01-01T00:00Z',
+ '2001-01-01T00:00Z')
+ ds
+            <xarray.Dataset> Size: 5kB
+ Dimensions: (time: 367)
+ Coordinates:
+ * time (time) datetime64[ns] 3kB 2000-01-01 ... 2001-01-01
+ id int32 4B 6435060
+ Data variables:
+ streamflow (time) float32 1kB ...
+ area float32 4B ...
+                country       ...
diff --git a/src/ewatercycle/observation/usgs.py b/src/ewatercycle/observation/usgs.py
--- a/src/ewatercycle/observation/usgs.py
+++ b/src/ewatercycle/observation/usgs.py
+def _xml_to_xarray(waterml_data: str) -> xr.Dataset:
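+    """Convert a raw WaterML 1.1 response into an xarray.Dataset with a single ``streamflow`` variable in m3/s."""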
+ # Convert the raw data to an xarray
+ data = WaterML11ToPaegan(waterml_data).feature
+
+ # We expect only 1 station
+ if len(data.elements) == 0:
+ raise ValueError("Data does not contain any station data")
+
+ station = data.elements[0]
+
+    # Unit conversion from cubic feet per second to cubic meters per second
+ values = np.array(
+ [float(point.members[0]["value"]) / 35.315 for point in station.elements],
+ dtype=np.float32,
+ )
+ # Convert the time to a numpy array of datetime64 without timezone
+ times = pd.to_datetime([point.time for point in station.elements]).to_numpy(
+ dtype="datetime64[ns]"
+ )
+ attrs = {"units": "m3/s"}
+
+ # Create the xarray dataset
+ ds = xr.Dataset({"streamflow": (["time"], values, attrs)}, coords={"time": times})
+
+ # Set some nice attributes
+ ds.attrs["title"] = "USGS Data from streamflow data"
+ ds.attrs["station"] = station.name
+ ds.attrs["stationid"] = station.get_uid()
+ ds.attrs["location"] = (station.location.y, station.location.x)
+
+ return ds
+
+
+def _download_usgs_data(
+ station_id: str,
+ start_time: str,
+ end_time: str,
+):
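+    """Fetch raw WaterML 1.1 data for discharge (USGS parameter 00060) from the USGS REST web service."""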
+ discharge_parameter = "00060"
+ collector = UsgsRest()
+ collector.filter(
+ start=get_time(start_time),
+ end=get_time(end_time),
+ variables=[discharge_parameter],
+ features=[station_id],
+ )
+ return collector.raw()
+
-def get_usgs_data(station_id, start_date, end_date, parameter="00060", cache_dir=None):
+def get_usgs_data(
+ station_id: str,
+ start_time: str,
+ end_time: str,
+) -> xr.Dataset:
"""Get river discharge data from the USGS REST web service.
See `U.S. Geological Survey Water Services
`_ (USGS)
- Parameters
- ----------
- station_id : str
- The station id to get
- start_date : str
- String for start date in the format: 'YYYY-MM-dd', e.g. '1980-01-01'
- end_date : str
- String for start date in the format: 'YYYY-MM-dd', e.g. '2018-12-31'
- parameter : str
- The parameter code to get, e.g. ('00060') discharge, cubic feet per second
- cache_dir : str
- Directory where files retrieved from the web service are cached.
- If set to None then USGS_DATA_HOME env var will be used as cache directory.
-
- Examples
- --------
- >>> from ewatercycle.observation.usgs import get_usgs_data
- >>> data = get_usgs_data('03109500', '2000-01-01', '2000-12-31', cache_dir='.')
- >>> data
-    <xarray.Dataset>
+ Args:
+ station_id: The station id to get
+ start_time: Start time of model in UTC and ISO format string e.g.
+ 'YYYY-MM-DDTHH:MM:SSZ'.
+ end_time: End time of model in UTC and ISO format string e.g.
+ 'YYYY-MM-DDTHH:MM:SSZ'.
+
+ Returns:
+ Xarray dataset with the streamflow data
+ with unit and other metadata in the variable and global attributes.
+
+ Examples:
+
+    To get observations from Little Beaver Creek:
+
+ >>> from ewatercycle.observation.usgs import get_usgs_data
+ >>> data = get_usgs_data('03109500', '2000-01-01T00:00:00Z', '2000-12-31T00:00:00Z')
+ >>> data
+    <xarray.Dataset> Size: 96kB
Dimensions: (time: 8032)
Coordinates:
- * time (time) datetime64[ns] 2000-01-04T05:00:00 ... 2000-12-23T04:00:00
+ * time (time) datetime64[ns] 64kB 2000-01-04T05:00:00 ... 2000-12-23...
Data variables:
- Streamflow (time) float32 8.296758 10.420501 ... 10.647034 11.694747
+ streamflow (time) float32 32kB 8.297 10.42 17.58 ... 8.552 10.65 11.69
Attributes:
title: USGS Data from streamflow data
station: Little Beaver Creek near East Liverpool OH
stationid: 03109500
- location: (40.6758974, -80.5406244)
+ location: (np.float64(40.6758974), np.float64(-80.5406244))
""" # noqa: E501
- if cache_dir is None:
- cache_dir = os.environ["USGS_DATA_HOME"]
-
- # Check if we have the netcdf data
- netcdf = os.path.join(
- cache_dir,
- "USGS_"
- + station_id
- + "_"
- + parameter
- + "_"
- + start_date
- + "_"
- + end_date
- + ".nc",
- )
- if os.path.exists(netcdf):
- return xr.open_dataset(netcdf)
-
- # Download the data if needed
- out = os.path.join(
- cache_dir,
- "USGS_"
- + station_id
- + "_"
- + parameter
- + "_"
- + start_date
- + "_"
- + end_date
- + ".wml",
- )
- if not os.path.exists(out):
- collector = UsgsRest()
- collector.filter(
- start=datetime.strptime(start_date, "%Y-%m-%d"),
- end=datetime.strptime(end_date, "%Y-%m-%d"),
- variables=[parameter],
- features=[station_id],
- )
- data = collector.raw()
- with open(out, "w") as file:
- file.write(data)
- collector.clear()
- else:
- with open(out, "r") as file:
- data = file.read()
-
- # Convert the raw data to an xarray
- data = WaterML11ToPaegan(data).feature
-
- # We expect only 1 station
- if len(data.elements) == 0:
- raise ValueError("Data does not contain any station data")
- else:
- station = data.elements[0]
-
- # Unit conversion from cubic feet to cubic meter per second
- values = np.array(
- [float(point.members[0]["value"]) / 35.315 for point in station.elements],
- dtype=np.float32,
- )
- times = [point.time for point in station.elements]
-
- attrs = {
- "units": "cubic meters per second",
- }
-
- # Create the xarray dataset
- ds = xr.Dataset(
- {"streamflow": (["time"], values, attrs)}, coords={"time": times}
- )
-
- # Set some nice attributes
- ds.attrs["title"] = "USGS Data from streamflow data"
- ds.attrs["station"] = station.name
- ds.attrs["stationid"] = station.get_uid()
- ds.attrs["location"] = (station.location.y, station.location.x)
-
- ds.to_netcdf(netcdf)
-
- return ds
+ wml_data = _download_usgs_data(station_id, start_time, end_time)
+ return _xml_to_xarray(wml_data)
diff --git a/tests/src/observation/test_grdc.py b/tests/src/observation/test_grdc.py
index 1f2e5488..7922e2c9 100644
--- a/tests/src/observation/test_grdc.py
+++ b/tests/src/observation/test_grdc.py
@@ -1,9 +1,10 @@
from datetime import datetime
+from pathlib import Path
import numpy as np
-import pandas as pd
import pytest
-from pandas.testing import assert_frame_equal
+import xarray as xr
+from xarray.testing import assert_allclose
from ewatercycle import CFG
from ewatercycle.observation.grdc import get_grdc_data
@@ -58,55 +59,123 @@ def sample_grdc_file(tmp_path):
@pytest.fixture
-def expected_results(tmp_path, sample_grdc_file):
- data = pd.DataFrame(
- {"streamflow": [123.0, 456.0, np.nan]},
- index=[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+def expected_results():
+ return xr.Dataset.from_dict(
+ {
+ "coords": {
+ "time": {
+ "dims": ("time",),
+ "attrs": {"long_name": "time"},
+ "data": [
+ datetime(2000, 1, 1, 0, 0),
+ datetime(2000, 1, 2, 0, 0),
+ datetime(2000, 1, 3, 0, 0),
+ ],
+ },
+ "id": {
+ "dims": (),
+ "attrs": {"long_name": "grdc number"},
+ "data": 42424242,
+ },
+ },
+ "attrs": {
+ "title": "MEAN DAILY DISCHARGE (Q)",
+ "Conventions": "CF-1.7",
+ "references": "grdc.bafg.de",
+ "institution": "GRDC",
+ "history": "Converted from 42424242_Q_Day.Cmd.txt of 2000-02-02 to netcdf by eWaterCycle Python package",
+ "missing_value": "-999.000",
+ },
+ "dims": {"time": 3},
+ "data_vars": {
+ "streamflow": {
+ "dims": ("time",),
+ "attrs": {"units": "m3/s", "long_name": "Mean daily discharge (Q)"},
+ "data": [123.0, 456.0, np.nan],
+ },
+ "area": {
+ "dims": (),
+ "attrs": {"units": "km2", "long_name": "catchment area"},
+ "data": 4242.0,
+ },
+ "country": {
+ "dims": (),
+ "attrs": {
+ "long_name": "country name",
+ "iso2": "ISO 3166-1 alpha-2 - two-letter country code",
+ },
+ "data": "NA",
+ },
+ "geo_x": {
+ "dims": (),
+ "attrs": {
+ "units": "degree_east",
+ "long_name": "station longitude (WGS84)",
+ },
+ "data": 4.955153,
+ },
+ "geo_y": {
+ "dims": (),
+ "attrs": {
+ "units": "degree_north",
+ "long_name": "station latitude (WGS84)",
+ },
+ "data": 52.356154,
+ },
+ "geo_z": {
+ "dims": (),
+ "attrs": {
+ "units": "m",
+ "long_name": "station altitude (m above sea level)",
+ },
+ "data": 8.0,
+ },
+ "owneroforiginaldata": {
+ "dims": (),
+ "attrs": {"long_name": "Owner of original data"},
+ "data": "Unknown",
+ },
+ "river_name": {
+ "dims": (),
+ "attrs": {"long_name": "river name"},
+ "data": "SOME RIVER",
+ },
+ "station_name": {
+ "dims": (),
+ "attrs": {"long_name": "station name"},
+ "data": "SOME",
+ },
+ "timezone": {
+ "dims": (),
+ "attrs": {
+ "units": "00:00",
+ "long_name": "utc offset, in relation to the national capital",
+ },
+ "data": np.nan,
+ },
+ },
+ }
)
- data.index.rename("time", inplace=True)
- metadata = {
- "altitude_masl": 8.0,
- "country_code": "NA",
- "dataSetContent": "MEAN DAILY DISCHARGE (Q)",
- "file_generation_date": "2000-02-02",
- "grdc_catchment_area_in_km2": 4242.0,
- "grdc_file_name": str(tmp_path / sample_grdc_file),
- "grdc_latitude_in_arc_degree": 52.356154,
- "grdc_longitude_in_arc_degree": 4.955153,
- "id_from_grdc": 42424242,
- "last_update": "2000-02-01",
- "no_of_years": 1,
- "nrMeasurements": 3,
- "river_name": "SOME RIVER",
- "station_name": "SOME",
- "time_series": "2000-01 - 2000-01",
- "units": "m³/s",
- "UserEndTime": "2000-02-01T00:00Z",
- "UserStartTime": "2000-01-01T00:00Z",
- "nrMissingData": 1,
- }
- return data, metadata
-
-
-def test_get_grdc_data_with_datahome(tmp_path, expected_results):
- expected_data, expected_metadata = expected_results
- result_data, result_metadata = get_grdc_data(
+
+
+def test_get_grdc_data_with_datahome(
+ tmp_path, expected_results: xr.Dataset, sample_grdc_file
+):
+ result_data = get_grdc_data(
"42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z", data_home=str(tmp_path)
)
- assert_frame_equal(result_data, expected_data)
- assert result_metadata == expected_metadata
+ assert_allclose(result_data, expected_results)
-def test_get_grdc_data_with_cfg(expected_results, tmp_path):
+def test_get_grdc_data_with_cfg(
+ expected_results: xr.Dataset, tmp_path, sample_grdc_file
+):
CFG.grdc_location = tmp_path
- expected_data, expected_metadata = expected_results
- result_data, result_metadata = get_grdc_data(
- "42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z"
- )
+ result_data = get_grdc_data("42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z")
- assert_frame_equal(result_data, expected_data)
- assert result_metadata == expected_metadata
+ assert_allclose(result_data, expected_results)
def test_get_grdc_data_without_file(tmp_path):
@@ -119,13 +188,137 @@ def test_get_grdc_data_without_file(tmp_path):
)
-def test_get_grdc_dat_custom_column_name(expected_results, tmp_path):
- CFG.grdc_location = str(tmp_path)
- result_data, result_metadata = get_grdc_data(
+def test_get_grdc_data_custom_column_name(
+ expected_results: xr.Dataset, tmp_path: Path, sample_grdc_file
+):
+ CFG.grdc_location = tmp_path
+ result_data = get_grdc_data(
"42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z", column="observation"
)
- expected_default_data, expected_metadata = expected_results
- expected_data = expected_default_data.rename(columns={"streamflow": "observation"})
- assert_frame_equal(result_data, expected_data)
- assert result_metadata == expected_metadata
+ expected_data = expected_results.rename({"streamflow": "observation"})
+
+ assert_allclose(result_data, expected_data)
+
+
+@pytest.fixture
+def sample_nc_file(tmp_path):
+ fn = tmp_path / "GRDC-Daily.nc"
+ ds = xr.Dataset.from_dict(
+ {
+ "coords": {
+ "time": {
+ "dims": ("time",),
+ "attrs": {"long_name": "time"},
+ "data": [
+ datetime(2000, 1, 1, 0, 0),
+ datetime(2000, 1, 2, 0, 0),
+ datetime(2000, 1, 3, 0, 0),
+ ],
+ },
+ "id": {
+ "dims": ("id",),
+ "attrs": {"long_name": "grdc number"},
+ "data": [42424242],
+ },
+ },
+ "attrs": {
+ "title": "MEAN DAILY DISCHARGE (Q)",
+ "Conventions": "CF-1.7",
+ "references": "grdc.bafg.de",
+ "institution": "GRDC",
+ "history": "Converted from 42424242_Q_Day.Cmd.txt of 2000-02-02 to netcdf by eWaterCycle Python package",
+ "missing_value": "-999.000",
+ },
+ "dims": {"time": 3, "id": 1},
+ "data_vars": {
+ "runoff_mean": {
+ "dims": ("time", "id"),
+ "attrs": {"units": "m3/s", "long_name": "Mean daily discharge (Q)"},
+ "data": [[123.0], [456.0], [np.nan]],
+ },
+ "area": {
+ "dims": ("id",),
+ "attrs": {"units": "km2", "long_name": "catchment area"},
+ "data": [4242.0],
+ },
+ "country": {
+ "dims": ("id",),
+ "attrs": {
+ "long_name": "country name",
+ "iso2": "ISO 3166-1 alpha-2 - two-letter country code",
+ },
+ "data": ["NA"],
+ },
+ "geo_x": {
+ "dims": ("id",),
+ "attrs": {
+ "units": "degree_east",
+ "long_name": "station longitude (WGS84)",
+ },
+ "data": [4.955153],
+ },
+ "geo_y": {
+ "dims": ("id",),
+ "attrs": {
+ "units": "degree_north",
+ "long_name": "station latitude (WGS84)",
+ },
+ "data": [52.356154],
+ },
+ "geo_z": {
+ "dims": ("id",),
+ "attrs": {
+ "units": "m",
+ "long_name": "station altitude (m above sea level)",
+ },
+ "data": [8.0],
+ },
+ "owneroforiginaldata": {
+ "dims": ("id",),
+ "attrs": {"long_name": "Owner of original data"},
+ "data": ["Unknown"],
+ },
+ "river_name": {
+ "dims": ("id",),
+ "attrs": {"long_name": "river name"},
+ "data": ["SOME RIVER"],
+ },
+ "station_name": {
+ "dims": ("id",),
+ "attrs": {"long_name": "station name"},
+ "data": ["SOME"],
+ },
+ "timezone": {
+ "dims": ("id",),
+ "attrs": {
+ "units": "00:00",
+ "long_name": "utc offset, in relation to the national capital",
+ },
+ "data": [np.nan],
+ },
+ },
+ }
+ )
+ ds.to_netcdf(fn)
+ return str(tmp_path)
+
+
+def test_get_grdc_data_from_nc(sample_nc_file, expected_results: xr.Dataset):
+ result_data = get_grdc_data(
+ "42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z", data_home=sample_nc_file
+ )
+ assert_allclose(result_data, expected_results)
+
+
+def test_get_grdc_data_from_nc_missing_and_no_txtfile(tmp_path, sample_nc_file):
+ with pytest.raises(
+ ValueError,
+ match="The grdc station 42424243 is not in the .*/GRDC-Daily.nc file and .*/42424243_Q_Day.Cmd.txt does not exist!",
+ ):
+ get_grdc_data(
+ "42424243",
+ "2000-01-01T00:00Z",
+ "2000-02-01T00:00Z",
+ data_home=str(tmp_path),
+ )
diff --git a/tests/src/observation/test_usgs.py b/tests/src/observation/test_usgs.py
new file mode 100644
index 00000000..6f47b9a0
--- /dev/null
+++ b/tests/src/observation/test_usgs.py
@@ -0,0 +1,207 @@
+import datetime
+from textwrap import dedent
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from ewatercycle.observation.usgs import _xml_to_xarray
+
+
+@pytest.fixture
+def waterml_data():
+ """This was generated by running
+
+ ```python
+ from ewatercycle.observation.usgs import _download_usgs_data
+ print(_download_usgs_data("03109500", "2000-01-06T00:00:00", "2000-01-07T00:00:00"))
+ ```
+ """
+ return dedent(
+ """\
+
+
+
+
+ http://nwis.waterservices.usgs.gov/nwis/iv/startDT=2000-01-06T00%3A00&endDT=2000-01-07T00%3A00¶meterCd=00060&sites=03109500&format=waterml%2C1.1
+
+ [ALL:03109500]
+ [00060]
+
+ 2000-01-06T00:00:00.000
+ 2000-01-07T00:00:00.000
+
+
+ [ALL:03109500]
+ [mode=RANGE, modifiedSince=null]
+ interval={INTERVAL[2000-01-06T00:00:00.000Z/2000-01-07T00:00:00.000Z]}
+ methodIds=[ALL]
+ 2024-07-05T08:06:53.782Z
+ 8a673850-3aa5-11ef-9e48-4cd98f8df011
+ Provisional data are subject to revision. Go to
+ http://waterdata.usgs.gov/nwis/help/?provisional for more information.
+ nadww01
+
+
+
+ Little Beaver Creek near East Liverpool OH
+ 03109500
+
+
+
+
+
+
+ 40.6758974
+ -80.5406244
+
+
+ ST
+ 05030101
+ 39
+ 39029
+
+
+ 00060
+ Streamflow, ft³/s
+ Discharge, cubic feet per second
+ Derived Value
+
+ ft3/s
+
+
+
+
+ -999999.0
+
+
+ 1570
+ 1510
+ 1430
+ 1370
+ 1320
+ 1260
+ 1220
+ 1180
+ 1140
+ 1110
+ 1070
+ 1040
+ 1020
+ 995
+ 967
+ 947
+ 920
+ 901
+ 888
+ 868
+ 849
+ 831
+ 818
+ 806
+ 788
+
+ [91]
+ Returned when there is no matching qualifier.
+
+
+ A
+ Approved for publication -- Processing and review
+ completed.
+
+
+
+
+
+
+
+ """
+ )
+
+
+def test_xml_to_xarray(waterml_data: str):
+ result = _xml_to_xarray(waterml_data)
+ expected = xr.Dataset.from_dict(
+ {
+ "coords": {
+ "time": {
+ "dims": ("time",),
+ "attrs": {},
+ "data": [
+ datetime.datetime(2000, 1, 6, 5, 0),
+ datetime.datetime(2000, 1, 6, 6, 0),
+ datetime.datetime(2000, 1, 6, 7, 0),
+ datetime.datetime(2000, 1, 6, 8, 0),
+ datetime.datetime(2000, 1, 6, 9, 0),
+ datetime.datetime(2000, 1, 6, 10, 0),
+ datetime.datetime(2000, 1, 6, 11, 0),
+ datetime.datetime(2000, 1, 6, 12, 0),
+ datetime.datetime(2000, 1, 6, 13, 0),
+ datetime.datetime(2000, 1, 6, 14, 0),
+ datetime.datetime(2000, 1, 6, 15, 0),
+ datetime.datetime(2000, 1, 6, 16, 0),
+ datetime.datetime(2000, 1, 6, 17, 0),
+ datetime.datetime(2000, 1, 6, 18, 0),
+ datetime.datetime(2000, 1, 6, 19, 0),
+ datetime.datetime(2000, 1, 6, 20, 0),
+ datetime.datetime(2000, 1, 6, 21, 0),
+ datetime.datetime(2000, 1, 6, 22, 0),
+ datetime.datetime(2000, 1, 6, 23, 0),
+ datetime.datetime(2000, 1, 7, 0, 0),
+ datetime.datetime(2000, 1, 7, 1, 0),
+ datetime.datetime(2000, 1, 7, 2, 0),
+ datetime.datetime(2000, 1, 7, 3, 0),
+ datetime.datetime(2000, 1, 7, 4, 0),
+ datetime.datetime(2000, 1, 7, 5, 0),
+ ],
+ }
+ },
+ "attrs": {
+ "title": "USGS Data from streamflow data",
+ "station": "Little Beaver Creek near East Liverpool OH",
+ "stationid": "03109500",
+ "location": (np.float64(40.6758974), np.float64(-80.5406244)),
+ },
+ "dims": {"time": 25},
+ "data_vars": {
+ "streamflow": {
+ "dims": ("time",),
+ "attrs": {"units": "m3/s"},
+ "data": [
+ 44.45703125,
+ 42.758033752441406,
+ 40.49271011352539,
+ 38.7937126159668,
+ 37.37788391113281,
+ 35.678890228271484,
+ 34.546226501464844,
+ 33.4135627746582,
+ 32.28089904785156,
+ 31.4314022064209,
+ 30.29874038696289,
+ 29.449241638183594,
+ 28.882911682128906,
+ 28.174997329711914,
+ 27.382131576538086,
+ 26.815799713134766,
+ 26.051252365112305,
+ 25.51323890686035,
+ 25.145122528076172,
+ 24.57879066467285,
+ 24.040775299072266,
+ 23.531078338623047,
+ 23.162961959838867,
+ 22.823162078857422,
+ 22.313465118408203,
+ ],
+ }
+ },
+ }
+ )
+
+ xr.testing.assert_identical(result, expected)