Skip to content

Commit

Permalink
Merge pull request #101 from OrdnanceSurvey/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
JEPooley authored Oct 4, 2023
2 parents 6b14bfc + 27abb8e commit ea0138b
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 41 deletions.
21 changes: 14 additions & 7 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
# Changelog

## [1.2.8] - 2023/10/04

### Added
- Downloads API outputs a missing_files.[datetime].json file that details the specific files that didn't successfully download [JEPooley] [Amber Thorne]

### Fixed
- Downloads API now shows correct number of downloaded files e.g. when some have failed or are missing [JEPooley] [Amber Thorne]

## [1.2.7] - 2023/06/28

### Fixed
- NamesAPI no longer raises a KeyError if the response status code is 200 [JEPooley] [sam596]

## [1.2.6] - 2023/06/28

### Features
### Added
- Added check for chunk size when streaming data. Program should error if file download is incomplete [JEPooley]


### Changed
- Upgrade requests version in dependencies to 2.31.0 [gwionap]

Expand All @@ -20,7 +27,7 @@

- Import error for osdatahub.post in PlacesAPI [FHunt-OS] [JEPooley]

### Features
### Added

- Added support for the dataset parameter within the PlacesAPI wrapper [FHunt-OS] [JEPooley]

Expand All @@ -45,7 +52,7 @@
- Extents `from_radius` method no longer errors if the coordinate is not a tuple - now accepts any Iterable (issue 66) [JEPooley]
- Updated setup.cfg and tox.ini to reflect python3.11 compatibility (issue 68) [JEPooley] [dchirst]

### Features
### Added

- Added proxy support using the `osdatahub.set_proxies` method (issue 55) [JEPooley] [FHunt-OS] [abiddiscombe]

Expand All @@ -57,23 +64,23 @@

## [1.2.0] - 2022/11/07

### Features
### Added

- Added NGD API [dchirst] [BenDickens] [JEPooley]
- Fixed typos in Features and Places APIs [dchirst]
- Added NGD quick start to README [dchirst] [JEPooley]

## [1.1.0] - 2022/08/22

### Features
### Added

- Support the new Data Hub API v2 [dchirst]
- Allow filters to be joined using bitwise operators [E-Paine]
- Improved warnings when queries are too large (issue 25) [E-Paine]
- Allow any type of collection to be used to construct a bounding box (issue 22) [E-Paine]
- Warn when using EPSG:4329 with the features API (issue 29) [E-Paine]

### Bugs
### Fixed

- Error when `nearest` returned an empty feature set (issue 24) [E-Paine]

2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = osdatahub
version = 1.2.7
version = 1.2.8
author = OS Rapid Prototyping
author_email = [email protected]
classifiers =
Expand Down
132 changes: 100 additions & 32 deletions src/osdatahub/DownloadsAPI/downloads_api.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
import functools
import json
import logging
import os
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from http import HTTPStatus
from multiprocessing import cpu_count
from pathlib import Path
from typing import Union
from typing import List, Union

import osdatahub
import requests
from requests.exceptions import HTTPError
from tqdm import tqdm

import osdatahub

retries = 3


class _DownloadObj:
Expand Down Expand Up @@ -43,30 +49,73 @@ def download(self, output_dir: Union[str, Path], overwrite: bool = False, pbar:
f"Skipping download...")
return output_path

response = requests.get(self.url, stream=True, proxies=osdatahub.get_proxies())
response.raise_for_status()
expected_size = int(response.headers.get('content-length'))
current_size = 0
chunk_size = 1048576 # 1024 ** 2 -> 1MB
if response.status_code == 200:
with open(output_path, 'wb') as f:
if not pbar:
pbar = tqdm(total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True)
for chunk in response.iter_content(chunk_size=chunk_size):
current_size += len(chunk)
f.write(chunk)
f.flush()
pbar.update(chunk_size)

if expected_size != current_size:
deficit = expected_size - current_size
raise IOError(
f'incomplete read ({current_size} bytes read, {deficit} more expected)'
)
pbar.write(f"Finished downloading {self.file_name} to {output_path}")
for _ in range(retries):
try:
response = requests.get(
self.url, stream=True, proxies=osdatahub.get_proxies())
response.raise_for_status()
expected_size = int(response.headers.get('content-length'))
current_size = 0
chunk_size = 1048576 # 1024 ** 2 -> 1MB
if response.status_code == 200:
with open(output_path, 'wb') as f:
if not pbar:
pbar = tqdm(
total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True)
for chunk in response.iter_content(chunk_size=chunk_size):
current_size += len(chunk)
f.write(chunk)
f.flush()
pbar.update(chunk_size)
if expected_size != current_size:
deficit = expected_size - current_size
raise IOError(
f'incomplete read ({current_size} bytes read, {deficit} more expected)'
)
pbar.write(
f"Finished downloading {self.file_name} to {output_path}")
break

except HTTPError as exc:
if int(exc.response.status_code) == 429:
time.sleep(1)
continue
raise

return output_path


def remove_key(url: str) -> str:
    """Return *url* with any API-key query parameter removed.

    Splits the URL on ``&`` and drops every section that contains ``key``
    so the secret is not written into log or report files.

    Args:
        url: The download URL, possibly containing a ``key=...`` parameter.

    Returns:
        The URL with key-bearing query sections removed.
    """
    # Re-join with "&": the original joined with "" which mangled the
    # remaining query string (e.g. "?a=1&key=X&b=2" became "?a=1b=2").
    return "&".join(section for section in url.split("&") if "key" not in section)


def format_missing_files(missing_files: List[_DownloadObj]) -> dict:
    """Summarise failed downloads as a JSON-serialisable dictionary.

    Each download object's attributes are copied into a plain dict and the
    URL is sanitised (API key stripped) before being included in the report.

    Args:
        missing_files: Download objects whose downloads failed.

    Returns:
        A dict with ``missing_file_count`` (int) and ``missing_file_info``
        (list of per-file attribute dicts) keys.  The original annotation
        (``List[dict]``) was wrong — a single dict is returned.
    """
    file_info = []
    for download_obj in missing_files:
        # Copy the attribute dict: the original aliased __dict__ directly,
        # so sanitising the url mutated the live download object.
        info = dict(vars(download_obj))
        info['url'] = remove_key(info['url'])
        file_info.append(info)
    return {
        "missing_file_count": len(missing_files),
        "missing_file_info": file_info,
    }


def save_missing_files(missing_files: List[_DownloadObj], output_dir: Union[str, Path]) -> None:
    """Format failed downloads and write them to a timestamped JSON report.

    Writes ``missing_files.<YYYYmmddHHMMSS>.json`` into ``output_dir`` so the
    user can see exactly which files did not download.  Does nothing when
    there are no missing files.

    Args:
        missing_files: Download objects whose downloads failed.
        output_dir: Directory in which to create the report file.
    """
    if not missing_files:
        return
    data = format_missing_files(missing_files)
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    path = os.path.join(output_dir, f"missing_files.{timestamp}.json")
    # Context manager ensures the handle is closed even if dump raises;
    # the original used json.dump(data, open(path, "w")) and leaked it.
    with open(path, "w") as f:
        json.dump(data, f)


class _DownloadsAPIBase(ABC):
"""Parent class for Product and DataPackage classes as part of the DownloadsAPI
(https://osdatahub.os.uk/docs/downloads/overview)
Expand Down Expand Up @@ -102,7 +151,8 @@ def details(self) -> dict:
"""
Calls endpoint to return details about the product or data package
"""
response = osdatahub.get(self._endpoint(self._id), proxies=osdatahub.get_proxies())
response = osdatahub.get(self._endpoint(
self._id), proxies=osdatahub.get_proxies())
response.raise_for_status()
return response.json()

Expand All @@ -114,7 +164,8 @@ def all_products(cls, **kwargs) -> list:
Returns: list of dictionaries containing all products available to download
"""
response = osdatahub.get(cls._ENDPOINT, proxies=osdatahub.get_proxies())
response = osdatahub.get(
cls._ENDPOINT, proxies=osdatahub.get_proxies())
response.raise_for_status()
return response.json()

Expand Down Expand Up @@ -146,7 +197,8 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P
defaults to the machine's CPU count
"""
if isinstance(download_list, list) and len(download_list) == 0:
raise Exception("Argument \"download_list\" is empty. Please provide at least one DownloadObj to download")
raise Exception(
"Argument \"download_list\" is empty. Please provide at least one DownloadObj to download")
elif isinstance(download_list, list) and len(download_list) > 1 and not download_multiple:
raise Exception("Argument \"download_list\" contains more than 1 object to download, but argument "
"\"download_multiple\" is set to False. Please pass only 1 download or set "
Expand All @@ -162,16 +214,32 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P
with ThreadPoolExecutor(max_workers=processes) as executor:
pbar = tqdm(total=sum([d.size for d in download_list]), unit="B", unit_scale=True, leave=True,
desc=f"Downloaded 0/{len(download_list)} files from osdatahub")
results = list([executor.submit(p.download, output_dir, overwrite, pbar) for p in download_list])

processed_downloads = {}
num_downloads_completed = 0
for _ in as_completed(results):
num_downloads_completed += 1
pbar.set_description(
f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub")
results = []
missing_files = []

for p in download_list:
future = executor.submit(
p.download, output_dir, overwrite, pbar)
processed_downloads[future] = p

for future in as_completed(processed_downloads):
info = processed_downloads[future]
try:
results.append(future.result())
num_downloads_completed += 1
pbar.set_description(
f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub")
except Exception:
missing_files.append(info)

save_missing_files(missing_files, output_dir)
else:
# download single file
d = download_list[0] if isinstance(download_list, list) else download_list
d = download_list[0] if isinstance(
download_list, list) else download_list
results = [d.download(output_dir, overwrite)]

return results
2 changes: 1 addition & 1 deletion src/osdatahub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def set_proxies(proxies):
def get_proxies():
return json.loads(os.environ["_OSDATAHUB_PROXIES"])

__version__ = "1.2.7"
__version__ = "1.2.8"

from osdatahub.extent import Extent
from osdatahub.FeaturesAPI import FeaturesAPI
Expand Down

0 comments on commit ea0138b

Please sign in to comment.