From b71a24e478404c4886bd062f01a35a80ec9e97ab Mon Sep 17 00:00:00 2001
From: Alex Parsons
Date: Fri, 1 Nov 2024 09:13:59 +0000
Subject: [PATCH] Use API to automate download

---
 .github/workflows/build_and_publish.yml      |  8 +++-
 src/data_common                              |  2 +-
 src/parl_register_interests/official_data.py | 49 ++++++++++++++++++--
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build_and_publish.yml b/.github/workflows/build_and_publish.yml
index 70092ceb..be24bba3 100644
--- a/.github/workflows/build_and_publish.yml
+++ b/.github/workflows/build_and_publish.yml
@@ -26,7 +26,13 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: 'recursive'
-
+
+      - name: Create dotenv
+        env:
+          MYSOC_USER_AGENT: ${{ secrets.MYSOC_USER_AGENT }}
+        run: |
+          echo "MYSOC_USER_AGENT=$MYSOC_USER_AGENT" >> .env
+
       - name: Run tests and try and build project
         uses: mysociety/run-in-devcontainer@v1
         with:
diff --git a/src/data_common b/src/data_common
index d5aad462..f344ce93 160000
--- a/src/data_common
+++ b/src/data_common
@@ -1 +1 @@
-Subproject commit d5aad46259f4cd1aa39040d1a8870e60b40131f0
+Subproject commit f344ce934eac895d78656658b779805f5ffa6bae
diff --git a/src/parl_register_interests/official_data.py b/src/parl_register_interests/official_data.py
index c1583196..d9b3cfc2 100644
--- a/src/parl_register_interests/official_data.py
+++ b/src/parl_register_interests/official_data.py
@@ -5,11 +5,12 @@
 import pandas as pd
 from mysoc_validator import Popolo
 from mysoc_validator.models.popolo import Chamber, IdentifierScheme
+import os
+import zipfile
 
-RAW_DATA = Path("data", "raw", "external", "official_data")
+import httpx
 
-# ]date(2024, 9, 2)
-known_dates = [date(2024, 9, 30)]
+RAW_DATA = Path("data", "raw", "external", "official_data")
 
 
 def fix_snake_case(s: str) -> str:
@@ -143,8 +144,50 @@ def get_sort_tuple(row: pd.Series):
     collected_df.to_parquet(package_dir / "overall.parquet")
 
 
+def download_reg_on_date(register_date: date, force: bool = False):
+    date_str = register_date.strftime("%y%m%d")
+
+    url = f"https://publications.parliament.uk/pa/cm/cmregmem/{date_str}/{date_str}.zip"
+    zip_path = RAW_DATA / f"{date_str}.zip"
+    dest_folder = RAW_DATA / date_str
+
+    if not force and dest_folder.exists():
+        return
+    # use MYSOC_USER_AGENT env var to identify ourselves
+    headers = {"User-Agent": os.environ.get("MYSOC_USER_AGENT", "")}
+    print(f"Downloading {url} to {zip_path}")
+    with httpx.Client() as client:
+        response = client.get(url, headers=headers)
+        response.raise_for_status()
+
+        with open(zip_path, "wb") as f:
+            f.write(response.content)
+
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(dest_folder)
+
+    # remove the zip file
+    zip_path.unlink()
+
+
+def get_latest():
+    api_url = "https://interests-api.parliament.uk/api/v1/Registers?Take=20&Skip=0"
+
+    with httpx.Client() as client:
+        response = client.get(api_url)
+        response.raise_for_status()
+        data = response.json()
+
+    latest = data["items"][0]
+    published_date = latest["publishedDate"]
+    published_date = datetime.fromisoformat(published_date)
+    return published_date.date()
+
+
 def process_all_regmem():
+    known_dates = [get_latest()]
     for regmem_date in known_dates:
+        download_reg_on_date(regmem_date)
         process_register(regmem_date)
 
 