diff --git a/CHANGELOG.md b/CHANGELOG.md index e148d2f3..b7091244 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,15 @@ Formatted as described on [https://keepachangelog.com](https://keepachangelog.co - `ewatercycle.esmvaltool.search.search_esgf` can now be used to find climate model ensembles on ESGF that have the required input variables for generating forcing data ([#422](https://github.com/eWaterCycle/ewatercycle/pull/422)). - `ewatercycle.observation.caravan.get_caravan_data()` ([#432](https://github.com/eWaterCycle/ewatercycle/issues/432)) +### Fixed + +- `get_usgs_data()` throws error ([#414](https://github.com/eWaterCycle/ewatercycle/issues/414)) +- `get_usgs_data()` and `get_grdc_data()` both return xarray.Dataset ([#253](https://github.com/eWaterCycle/ewatercycle/issues/253)) + +### Removed + +- Caching mechanism from `get_usgs_data()` ([#240](https://github.com/eWaterCycle/ewatercycle/issues/240)) + ## [2.1.1] (2024-06-03) ### Added diff --git a/README.md b/README.md index df3ed8f5..d286c86a 100644 --- a/README.md +++ b/README.md @@ -93,12 +93,12 @@ cfg_file, cfg_dir = model.setup( model.initialize(cfg_file) -observations_df, station_info = ewatercycle.observation.grdc.get_grdc_data( +observations_df = ewatercycle.observation.grdc.get_grdc_data( station_id=4147380, start_time=model.start_time_as_isostr, end_time=model.end_time_as_isostr, column='observation', -) +).observation.to_dataframe() simulated_discharge = [] timestamps = [] diff --git a/docs/observations.rst b/docs/observations.rst index 430fb555..6177d315 100644 --- a/docs/observations.rst +++ b/docs/observations.rst @@ -6,7 +6,7 @@ The eWaterCycle platform supports observations relevant for calibrating and vali USGS ---- -The `U.S. Geological Survey Water Services <https://waterservices.usgs.gov/>`_ provides public discharge data for a large number of US based stations. In eWaterCycle we make use of the `USGS web service `_ to automatically retrieve this data. +The `U.S. Geological Survey Water Services <https://waterservices.usgs.gov/>`_ provides public discharge data for a large number of US based stations. In eWaterCycle (:py:func:`ewatercycle.observation.usgs.get_usgs_data`) we make use of the `USGS web service `_ to automatically retrieve this data. The Discharge timestamp is corrected to the UTC timezone. Units are converted from cubic feet per second to cubic meter per second. 
GRDC diff --git a/docs/user_guide/03_models_obs_analysis.ipynb b/docs/user_guide/03_models_obs_analysis.ipynb index d0a5472d..38ac64e6 100644 --- a/docs/user_guide/03_models_obs_analysis.ipynb +++ b/docs/user_guide/03_models_obs_analysis.ipynb @@ -605,14 +605,14 @@ "source": [ "grdc_station_id = \"6335020\"\n", "\n", - "observations, metadata = ewatercycle.observation.grdc.get_grdc_data(\n", + "observations = ewatercycle.observation.grdc.get_grdc_data(\n", " station_id=grdc_station_id,\n", " start_time=\"1990-01-01T00:00:00Z\", # or: model_instance.start_time_as_isostr\n", " end_time=\"1990-12-15T00:00:00Z\",\n", " column=\"GRDC\",\n", ")\n", "\n", - "observations.head()" + "observations.GRDC.to_dataframe().head()" ] }, { @@ -639,7 +639,7 @@ } ], "source": [ - "print(metadata)" + "print(observations.attrs)" ] }, { diff --git a/src/ewatercycle/observation/grdc.py b/src/ewatercycle/observation/grdc.py index 534a19e0..a649f923 100644 --- a/src/ewatercycle/observation/grdc.py +++ b/src/ewatercycle/observation/grdc.py @@ -1,9 +1,12 @@ """Global Runoff Data Centre module.""" + import logging import os -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union import pandas as pd +import xarray as xr +from numpy import nan from ewatercycle import CFG from ewatercycle.util import get_time, to_absolute_path @@ -17,15 +20,18 @@ def get_grdc_data( station_id: str, start_time: str, end_time: str, - parameter: str = "Q", data_home: Optional[str] = None, column: str = "streamflow", -) -> Tuple[pd.core.frame.DataFrame, MetaDataType]: +) -> xr.Dataset: """Get river discharge data from Global Runoff Data Centre (GRDC). Requires the GRDC daily data files in a local directory. The GRDC daily data - files can be ordered at - https://www.bafg.de/GRDC/EN/02_srvcs/21_tmsrs/riverdischarge_node.html + NetCDF file can be downloaded at + https://www.bafg.de/GRDC/EN/02_srvcs/21_tmsrs/riverdischarge_node.html . + The downloaded zip file contains a file named GRDC-Daily.nc. + + This function will first try to read data from the GRDC-Daily.nc file in the ``data_home`` directory. + If that fails it will look for the GRDC Export (ASCII text) formatted file, for example ``6435060_Q_Day.Cmd.txt``. Args: station_id: The station id to get. The station id can be found in the @@ -35,55 +41,50 @@ 'YYYY-MM-DDTHH:MM:SSZ'. end_time: End time of model in UTC and ISO format string e.g. 'YYYY-MM-DDTHH:MM:SSZ'. - parameter: optional. The parameter code to get, e.g. ('Q') discharge, - cubic meters per second. data_home : optional. The directory where the daily grdc data is located. If left out will use the grdc_location in the eWaterCycle configuration file. column: optional. Name of column in dataframe. Default: "streamflow". Returns: - grdc data in a dataframe and metadata. + grdc data in an xarray dataset. Shaped like a filtered version of the GRDC daily NetCDF file. + + Raises: + ValueError: If data for the requested station id and period could not be found. Examples: + .. 
code-block:: python from ewatercycle.observation.grdc import get_grdc_data - df, meta = get_grdc_data('6335020', - '2000-01-01T00:00Z', - '2001-01-01T00:00Z') - df.describe() - streamflow - count 4382.000000 - mean 2328.992469 - std 1190.181058 - min 881.000000 - 25% 1550.000000 - 50% 2000.000000 - 75% 2730.000000 - max 11300.000000 - - meta - {'grdc_file_name': '/home/myusername/git/eWaterCycle/ewatercycle/6335020_Q_Day.Cmd.txt', - 'id_from_grdc': 6335020, - 'file_generation_date': '2019-03-27', - 'river_name': 'RHINE RIVER', - 'station_name': 'REES', - 'country_code': 'DE', - 'grdc_latitude_in_arc_degree': 51.756918, - 'grdc_longitude_in_arc_degree': 6.395395, - 'grdc_catchment_area_in_km2': 159300.0, - 'altitude_masl': 8.0, - 'dataSetContent': 'MEAN DAILY DISCHARGE (Q)', - 'units': 'm³/s', - 'time_series': '1814-11 - 2016-12', - 'no_of_years': 203, - 'last_update': '2018-05-24', - 'nrMeasurements': 'NA', - 'UserStartTime': '2000-01-01T00:00Z', - 'UserEndTime': '2001-01-01T00:00Z', - 'nrMissingData': 0} + ds = get_grdc_data('6435060', + '2000-01-01T00:00Z', + '2001-01-01T00:00Z') + ds + Size: 5kB + Dimensions: (time: 367) + Coordinates: + * time (time) datetime64[ns] 3kB 2000-01-01 ... 2001-01-01 + id int32 4B 6435060 + Data variables: + streamflow (time) float32 1kB ... + area float32 4B ... + country xr.Dataset: + # Convert the raw data to an xarray + data = WaterML11ToPaegan(waterml_data).feature + + # We expect only 1 station + if len(data.elements) == 0: + raise ValueError("Data does not contain any station data") + + station = data.elements[0] + + # Unit conversion from cubic feet per second to cubic meter per second + values = np.array( + [float(point.members[0]["value"]) / 35.315 for point in station.elements], + dtype=np.float32, + ) + # Convert the time to a numpy array of datetime64 without timezone + times = pd.to_datetime([point.time for point in station.elements]).to_numpy( + dtype="datetime64[ns]" + ) + attrs = {"units": "m3/s"} + + # Create the xarray dataset + ds = xr.Dataset({"streamflow": (["time"], values, attrs)}, coords={"time": times}) + + # Set some nice attributes + ds.attrs["title"] = "USGS Data from streamflow data" + ds.attrs["station"] = station.name + ds.attrs["stationid"] = station.get_uid() + ds.attrs["location"] = (station.location.y, station.location.x) + + return ds + + +def _download_usgs_data( + station_id: str, + start_time: str, + end_time: str, +): + discharge_parameter = "00060" + collector = UsgsRest() + collector.filter( + start=get_time(start_time), + end=get_time(end_time), + variables=[discharge_parameter], + features=[station_id], + ) + return collector.raw() + -def get_usgs_data(station_id, start_date, end_date, parameter="00060", cache_dir=None): +def get_usgs_data( + station_id: str, + start_time: str, + end_time: str, +) -> xr.Dataset: """Get river discharge data from the USGS REST web service. See `U.S. Geological Survey Water Services `_ (USGS) - Parameters - ---------- - station_id : str - The station id to get - start_date : str - String for start date in the format: 'YYYY-MM-dd', e.g. '1980-01-01' - end_date : str - String for start date in the format: 'YYYY-MM-dd', e.g. '2018-12-31' - parameter : str - The parameter code to get, e.g. ('00060') discharge, cubic feet per second - cache_dir : str - Directory where files retrieved from the web service are cached. - If set to None then USGS_DATA_HOME env var will be used as cache directory. 
- - Examples - -------- - >>> from ewatercycle.observation.usgs import get_usgs_data - >>> data = get_usgs_data('03109500', '2000-01-01', '2000-12-31', cache_dir='.') - >>> data - + Args: + station_id: The station id to get + start_time: Start time of model in UTC and ISO format string e.g. + 'YYYY-MM-DDTHH:MM:SSZ'. + end_time: End time of model in UTC and ISO format string e.g. + 'YYYY-MM-DDTHH:MM:SSZ'. + + Returns: + Xarray dataset with the streamflow data + with unit and other metadata in the variable and global attributes. + + Examples: + + To get observations from the Little Beaver Creek. + + >>> from ewatercycle.observation.usgs import get_usgs_data + >>> data = get_usgs_data('03109500', '2000-01-01T00:00:00Z', '2000-12-31T00:00:00Z') + >>> data + Size: 96kB Dimensions: (time: 8032) Coordinates: - * time (time) datetime64[ns] 2000-01-04T05:00:00 ... 2000-12-23T04:00:00 + * time (time) datetime64[ns] 64kB 2000-01-04T05:00:00 ... 2000-12-23... Data variables: - Streamflow (time) float32 8.296758 10.420501 ... 10.647034 11.694747 + streamflow (time) float32 32kB 8.297 10.42 17.58 ... 8.552 10.65 11.69 Attributes: title: USGS Data from streamflow data station: Little Beaver Creek near East Liverpool OH stationid: 03109500 - location: (40.6758974, -80.5406244) + location: (np.float64(40.6758974), np.float64(-80.5406244)) """ # noqa: E501 - if cache_dir is None: - cache_dir = os.environ["USGS_DATA_HOME"] - - # Check if we have the netcdf data - netcdf = os.path.join( - cache_dir, - "USGS_" - + station_id - + "_" - + parameter - + "_" - + start_date - + "_" - + end_date - + ".nc", - ) - if os.path.exists(netcdf): - return xr.open_dataset(netcdf) - - # Download the data if needed - out = os.path.join( - cache_dir, - "USGS_" - + station_id - + "_" - + parameter - + "_" - + start_date - + "_" - + end_date - + ".wml", - ) - if not os.path.exists(out): - collector = UsgsRest() - collector.filter( - start=datetime.strptime(start_date, "%Y-%m-%d"), - end=datetime.strptime(end_date, "%Y-%m-%d"), - variables=[parameter], - features=[station_id], - ) - data = collector.raw() - with open(out, "w") as file: - file.write(data) - collector.clear() - else: - with open(out, "r") as file: - data = file.read() - - # Convert the raw data to an xarray - data = WaterML11ToPaegan(data).feature - - # We expect only 1 station - if len(data.elements) == 0: - raise ValueError("Data does not contain any station data") - else: - station = data.elements[0] - - # Unit conversion from cubic feet to cubic meter per second - values = np.array( - [float(point.members[0]["value"]) / 35.315 for point in station.elements], - dtype=np.float32, - ) - times = [point.time for point in station.elements] - - attrs = { - "units": "cubic meters per second", - } - - # Create the xarray dataset - ds = xr.Dataset( - {"streamflow": (["time"], values, attrs)}, coords={"time": times} - ) - - # Set some nice attributes - ds.attrs["title"] = "USGS Data from streamflow data" - ds.attrs["station"] = station.name - ds.attrs["stationid"] = station.get_uid() - ds.attrs["location"] = (station.location.y, station.location.x) - - ds.to_netcdf(netcdf) - - return ds + wml_data = _download_usgs_data(station_id, start_time, end_time) + return _xml_to_xarray(wml_data) diff --git a/tests/src/observation/test_grdc.py b/tests/src/observation/test_grdc.py index 1f2e5488..7922e2c9 100644 --- a/tests/src/observation/test_grdc.py +++ b/tests/src/observation/test_grdc.py @@ -1,9 +1,10 @@ from datetime import datetime +from pathlib import Path import numpy as np 
-import pandas as pd import pytest -from pandas.testing import assert_frame_equal +import xarray as xr +from xarray.testing import assert_allclose from ewatercycle import CFG from ewatercycle.observation.grdc import get_grdc_data @@ -58,55 +59,123 @@ def sample_grdc_file(tmp_path): @pytest.fixture -def expected_results(tmp_path, sample_grdc_file): - data = pd.DataFrame( - {"streamflow": [123.0, 456.0, np.nan]}, - index=[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], +def expected_results(): + return xr.Dataset.from_dict( + { + "coords": { + "time": { + "dims": ("time",), + "attrs": {"long_name": "time"}, + "data": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 2, 0, 0), + datetime(2000, 1, 3, 0, 0), + ], + }, + "id": { + "dims": (), + "attrs": {"long_name": "grdc number"}, + "data": 42424242, + }, + }, + "attrs": { + "title": "MEAN DAILY DISCHARGE (Q)", + "Conventions": "CF-1.7", + "references": "grdc.bafg.de", + "institution": "GRDC", + "history": "Converted from 42424242_Q_Day.Cmd.txt of 2000-02-02 to netcdf by eWaterCycle Python package", + "missing_value": "-999.000", + }, + "dims": {"time": 3}, + "data_vars": { + "streamflow": { + "dims": ("time",), + "attrs": {"units": "m3/s", "long_name": "Mean daily discharge (Q)"}, + "data": [123.0, 456.0, np.nan], + }, + "area": { + "dims": (), + "attrs": {"units": "km2", "long_name": "catchment area"}, + "data": 4242.0, + }, + "country": { + "dims": (), + "attrs": { + "long_name": "country name", + "iso2": "ISO 3166-1 alpha-2 - two-letter country code", + }, + "data": "NA", + }, + "geo_x": { + "dims": (), + "attrs": { + "units": "degree_east", + "long_name": "station longitude (WGS84)", + }, + "data": 4.955153, + }, + "geo_y": { + "dims": (), + "attrs": { + "units": "degree_north", + "long_name": "station latitude (WGS84)", + }, + "data": 52.356154, + }, + "geo_z": { + "dims": (), + "attrs": { + "units": "m", + "long_name": "station altitude (m above sea level)", + }, + "data": 8.0, + }, + "owneroforiginaldata": { + "dims": (), + "attrs": {"long_name": "Owner of original data"}, + "data": "Unknown", + }, + "river_name": { + "dims": (), + "attrs": {"long_name": "river name"}, + "data": "SOME RIVER", + }, + "station_name": { + "dims": (), + "attrs": {"long_name": "station name"}, + "data": "SOME", + }, + "timezone": { + "dims": (), + "attrs": { + "units": "00:00", + "long_name": "utc offset, in relation to the national capital", + }, + "data": np.nan, + }, + }, + } ) - data.index.rename("time", inplace=True) - metadata = { - "altitude_masl": 8.0, - "country_code": "NA", - "dataSetContent": "MEAN DAILY DISCHARGE (Q)", - "file_generation_date": "2000-02-02", - "grdc_catchment_area_in_km2": 4242.0, - "grdc_file_name": str(tmp_path / sample_grdc_file), - "grdc_latitude_in_arc_degree": 52.356154, - "grdc_longitude_in_arc_degree": 4.955153, - "id_from_grdc": 42424242, - "last_update": "2000-02-01", - "no_of_years": 1, - "nrMeasurements": 3, - "river_name": "SOME RIVER", - "station_name": "SOME", - "time_series": "2000-01 - 2000-01", - "units": "m³/s", - "UserEndTime": "2000-02-01T00:00Z", - "UserStartTime": "2000-01-01T00:00Z", - "nrMissingData": 1, - } - return data, metadata - - -def test_get_grdc_data_with_datahome(tmp_path, expected_results): - expected_data, expected_metadata = expected_results - result_data, result_metadata = get_grdc_data( + + +def test_get_grdc_data_with_datahome( + tmp_path, expected_results: xr.Dataset, sample_grdc_file +): + result_data = get_grdc_data( "42424242", "2000-01-01T00:00Z", 
"2000-02-01T00:00Z", data_home=str(tmp_path) ) - assert_frame_equal(result_data, expected_data) - assert result_metadata == expected_metadata + print(result_data.to_dict()) + assert_allclose(result_data, expected_results) -def test_get_grdc_data_with_cfg(expected_results, tmp_path): +def test_get_grdc_data_with_cfg( + expected_results: xr.Dataset, tmp_path, sample_grdc_file +): CFG.grdc_location = tmp_path - expected_data, expected_metadata = expected_results - result_data, result_metadata = get_grdc_data( - "42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z" - ) + result_data = get_grdc_data("42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z") - assert_frame_equal(result_data, expected_data) - assert result_metadata == expected_metadata + assert_allclose(result_data, expected_results) def test_get_grdc_data_without_file(tmp_path): @@ -119,13 +188,137 @@ def test_get_grdc_data_without_file(tmp_path): ) -def test_get_grdc_dat_custom_column_name(expected_results, tmp_path): - CFG.grdc_location = str(tmp_path) - result_data, result_metadata = get_grdc_data( +def test_get_grdc_data_custom_column_name( + expected_results: xr.Dataset, tmp_path: Path, sample_grdc_file +): + CFG.grdc_location = tmp_path + result_data = get_grdc_data( "42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z", column="observation" ) - expected_default_data, expected_metadata = expected_results - expected_data = expected_default_data.rename(columns={"streamflow": "observation"}) - assert_frame_equal(result_data, expected_data) - assert result_metadata == expected_metadata + expected_data = expected_results.rename({"streamflow": "observation"}) + + assert_allclose(result_data, expected_data) + + +@pytest.fixture +def sample_nc_file(tmp_path): + fn = tmp_path / "GRDC-Daily.nc" + ds = xr.Dataset.from_dict( + { + "coords": { + "time": { + "dims": ("time",), + "attrs": {"long_name": "time"}, + "data": [ + datetime(2000, 1, 1, 0, 0), + datetime(2000, 1, 2, 0, 0), + datetime(2000, 1, 3, 0, 0), + ], + }, + "id": { + "dims": ("id",), + "attrs": {"long_name": "grdc number"}, + "data": [42424242], + }, + }, + "attrs": { + "title": "MEAN DAILY DISCHARGE (Q)", + "Conventions": "CF-1.7", + "references": "grdc.bafg.de", + "institution": "GRDC", + "history": "Converted from 42424242_Q_Day.Cmd.txt of 2000-02-02 to netcdf by eWaterCycle Python package", + "missing_value": "-999.000", + }, + "dims": {"time": 3, "id": 1}, + "data_vars": { + "runoff_mean": { + "dims": ("time", "id"), + "attrs": {"units": "m3/s", "long_name": "Mean daily discharge (Q)"}, + "data": [[123.0], [456.0], [np.nan]], + }, + "area": { + "dims": ("id",), + "attrs": {"units": "km2", "long_name": "catchment area"}, + "data": [4242.0], + }, + "country": { + "dims": ("id",), + "attrs": { + "long_name": "country name", + "iso2": "ISO 3166-1 alpha-2 - two-letter country code", + }, + "data": ["NA"], + }, + "geo_x": { + "dims": ("id",), + "attrs": { + "units": "degree_east", + "long_name": "station longitude (WGS84)", + }, + "data": [4.955153], + }, + "geo_y": { + "dims": ("id",), + "attrs": { + "units": "degree_north", + "long_name": "station latitude (WGS84)", + }, + "data": [52.356154], + }, + "geo_z": { + "dims": ("id",), + "attrs": { + "units": "m", + "long_name": "station altitude (m above sea level)", + }, + "data": [8.0], + }, + "owneroforiginaldata": { + "dims": ("id",), + "attrs": {"long_name": "Owner of original data"}, + "data": ["Unknown"], + }, + "river_name": { + "dims": ("id",), + "attrs": {"long_name": "river name"}, + "data": ["SOME RIVER"], + }, + 
"station_name": { + "dims": ("id",), + "attrs": {"long_name": "station name"}, + "data": ["SOME"], + }, + "timezone": { + "dims": ("id",), + "attrs": { + "units": "00:00", + "long_name": "utc offset, in relation to the national capital", + }, + "data": [np.nan], + }, + }, + } + ) + ds.to_netcdf(fn) + return str(tmp_path) + + +def test_get_grdc_data_from_nc(sample_nc_file, expected_results: xr.Dataset): + result_data = get_grdc_data( + "42424242", "2000-01-01T00:00Z", "2000-02-01T00:00Z", data_home=sample_nc_file + ) + assert_allclose(result_data, expected_results) + + +def test_get_grdc_data_from_nc_missing_and_no_txtfile(tmp_path, sample_nc_file): + with pytest.raises( + ValueError, + match="The grdc station 42424243 is not in the .*/GRDC-Daily.nc file and .*/42424243_Q_Day.Cmd.txt does not exist!", + ): + get_grdc_data( + "42424243", + "2000-01-01T00:00Z", + "2000-02-01T00:00Z", + data_home=str(tmp_path), + ) diff --git a/tests/src/observation/test_usgs.py b/tests/src/observation/test_usgs.py new file mode 100644 index 00000000..6f47b9a0 --- /dev/null +++ b/tests/src/observation/test_usgs.py @@ -0,0 +1,207 @@ +import datetime +from textwrap import dedent + +import numpy as np +import pytest +import xarray as xr + +from ewatercycle.observation.usgs import _xml_to_xarray + + +@pytest.fixture +def waterml_data(): + """This was generated by running + + ```python + from ewatercycle.observation.usgs import _download_usgs_data + print(_download_usgs_data("03109500", "2000-01-06T00:00:00", "2000-01-07T00:00:00")) + ``` + """ + return dedent( + """\ + + + + + http://nwis.waterservices.usgs.gov/nwis/iv/startDT=2000-01-06T00%3A00&endDT=2000-01-07T00%3A00&parameterCd=00060&sites=03109500&format=waterml%2C1.1 + + [ALL:03109500] + [00060] + + 2000-01-06T00:00:00.000 + 2000-01-07T00:00:00.000 + + + [ALL:03109500] + [mode=RANGE, modifiedSince=null] + interval={INTERVAL[2000-01-06T00:00:00.000Z/2000-01-07T00:00:00.000Z]} + methodIds=[ALL] + 2024-07-05T08:06:53.782Z + 8a673850-3aa5-11ef-9e48-4cd98f8df011 + Provisional data are subject to revision. Go to + http://waterdata.usgs.gov/nwis/help/?provisional for more information. + nadww01 + + + + Little Beaver Creek near East Liverpool OH + 03109500 + + + + + + + 40.6758974 + -80.5406244 + + + ST + 05030101 + 39 + 39029 + + + 00060 + Streamflow, ft³/s + Discharge, cubic feet per second + Derived Value + + ft3/s + + + + + -999999.0 + + + 1570 + 1510 + 1430 + 1370 + 1320 + 1260 + 1220 + 1180 + 1140 + 1110 + 1070 + 1040 + 1020 + 995 + 967 + 947 + 920 + 901 + 888 + 868 + 849 + 831 + 818 + 806 + 788 + + [91] + Returned when there is no matching qualifier. + + + A + Approved for publication -- Processing and review + completed. 
+ + + + + + + + """ + ) + + +def test_xml_to_xarray(waterml_data: str): + result = _xml_to_xarray(waterml_data) + expected = xr.Dataset.from_dict( + { + "coords": { + "time": { + "dims": ("time",), + "attrs": {}, + "data": [ + datetime.datetime(2000, 1, 6, 5, 0), + datetime.datetime(2000, 1, 6, 6, 0), + datetime.datetime(2000, 1, 6, 7, 0), + datetime.datetime(2000, 1, 6, 8, 0), + datetime.datetime(2000, 1, 6, 9, 0), + datetime.datetime(2000, 1, 6, 10, 0), + datetime.datetime(2000, 1, 6, 11, 0), + datetime.datetime(2000, 1, 6, 12, 0), + datetime.datetime(2000, 1, 6, 13, 0), + datetime.datetime(2000, 1, 6, 14, 0), + datetime.datetime(2000, 1, 6, 15, 0), + datetime.datetime(2000, 1, 6, 16, 0), + datetime.datetime(2000, 1, 6, 17, 0), + datetime.datetime(2000, 1, 6, 18, 0), + datetime.datetime(2000, 1, 6, 19, 0), + datetime.datetime(2000, 1, 6, 20, 0), + datetime.datetime(2000, 1, 6, 21, 0), + datetime.datetime(2000, 1, 6, 22, 0), + datetime.datetime(2000, 1, 6, 23, 0), + datetime.datetime(2000, 1, 7, 0, 0), + datetime.datetime(2000, 1, 7, 1, 0), + datetime.datetime(2000, 1, 7, 2, 0), + datetime.datetime(2000, 1, 7, 3, 0), + datetime.datetime(2000, 1, 7, 4, 0), + datetime.datetime(2000, 1, 7, 5, 0), + ], + } + }, + "attrs": { + "title": "USGS Data from streamflow data", + "station": "Little Beaver Creek near East Liverpool OH", + "stationid": "03109500", + "location": (np.float64(40.6758974), np.float64(-80.5406244)), + }, + "dims": {"time": 25}, + "data_vars": { + "streamflow": { + "dims": ("time",), + "attrs": {"units": "m3/s"}, + "data": [ + 44.45703125, + 42.758033752441406, + 40.49271011352539, + 38.7937126159668, + 37.37788391113281, + 35.678890228271484, + 34.546226501464844, + 33.4135627746582, + 32.28089904785156, + 31.4314022064209, + 30.29874038696289, + 29.449241638183594, + 28.882911682128906, + 28.174997329711914, + 27.382131576538086, + 26.815799713134766, + 26.051252365112305, + 25.51323890686035, + 25.145122528076172, + 24.57879066467285, + 24.040775299072266, + 23.531078338623047, + 23.162961959838867, + 22.823162078857422, + 22.313465118408203, + ], + } + }, + } + ) + + xr.testing.assert_identical(result, expected)
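With this change both `get_grdc_data()` and `get_usgs_data()` return an `xarray.Dataset` instead of a `(DataFrame, metadata)` tuple. As a quick orientation, below is a minimal usage sketch of how callers would consume the new return values once the diff is applied; it only reuses calls and arguments that appear above (the notebook cell and the new docstrings) and assumes GRDC data is available locally in the configured `grdc_location`/`data_home`, plus network access for the USGS web service.

```python
# Minimal usage sketch of the new xarray-based observation API (assumes the diff
# above is applied, local GRDC data, and network access for USGS).
import ewatercycle.observation.grdc
import ewatercycle.observation.usgs

# GRDC: station metadata now lives in the dataset attributes instead of a
# separate metadata dict; the discharge variable is named after `column`.
grdc_ds = ewatercycle.observation.grdc.get_grdc_data(
    station_id="6335020",
    start_time="1990-01-01T00:00:00Z",
    end_time="1990-12-15T00:00:00Z",
    column="GRDC",
)
print(grdc_ds.attrs)  # replaces the old `metadata` return value
observations_df = grdc_ds.GRDC.to_dataframe()  # replaces the old DataFrame return value

# USGS: no cache_dir argument anymore, ISO timestamps instead of plain dates,
# and the result is an xarray.Dataset with streamflow converted to m3/s.
usgs_ds = ewatercycle.observation.usgs.get_usgs_data(
    station_id="03109500",
    start_time="2000-01-01T00:00:00Z",
    end_time="2000-12-31T00:00:00Z",
)
print(usgs_ds["streamflow"].attrs["units"])  # "m3/s"
```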