Skip to content

Commit

Permalink
Use API to automate download
Browse files Browse the repository at this point in the history
  • Loading branch information
ajparsons committed Nov 1, 2024
1 parent 34935d6 commit 7fe9b10
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 5 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/build_and_publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@ jobs:
uses: actions/checkout@v3
with:
submodules: 'recursive'


# Append the MYSOC_USER_AGENT repository secret to .env before the next step
# runs (presumably so the code inside the devcontainer can read it — confirm).
- name: Create dotenv
env:
MYSOC_USER_AGENT: ${{ secrets.MYSOC_USER_AGENT }}
run: |
echo "MYSOC_USER_AGENT=$MYSOC_USER_AGENT" >> .env
- name: Run tests and try and build project
uses: mysociety/run-in-devcontainer@v1
with:
Expand Down
49 changes: 46 additions & 3 deletions src/parl_register_interests/official_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
import pandas as pd
from mysoc_validator import Popolo
from mysoc_validator.models.popolo import Chamber, IdentifierScheme
import os
import zipfile

RAW_DATA = Path("data", "raw", "external", "official_data")
import httpx

# ]date(2024, 9, 2)
known_dates = [date(2024, 9, 30)]
RAW_DATA = Path("data", "raw", "external", "official_data")


def fix_snake_case(s: str) -> str:
Expand Down Expand Up @@ -143,8 +144,50 @@ def get_sort_tuple(row: pd.Series):
collected_df.to_parquet(package_dir / "overall.parquet")


def download_reg_on_date(register_date: date, force: bool = False):
    """
    Download and unpack the published register zip for a given date.

    The archive is fetched from publications.parliament.uk, written to
    ``RAW_DATA/<yymmdd>.zip``, extracted into ``RAW_DATA/<yymmdd>/`` and the
    zip file is then deleted.

    Args:
        register_date: Date of the register; formatted as ``%y%m%d`` to build
            both the remote URL and the local paths.
        force: When False (the default) nothing is downloaded if the
            destination folder already exists.

    Raises:
        httpx.HTTPStatusError: If the server responds with an error status.
    """
    date_str = register_date.strftime("%y%m%d")

    url = f"https://publications.parliament.uk/pa/cm/cmregmem/{date_str}/{date_str}.zip"
    zip_path = RAW_DATA / f"{date_str}.zip"
    dest_folder = RAW_DATA / date_str

    if not force and dest_folder.exists():
        return

    # use MYSOC_USER_AGENT env var to identify ourselves
    headers = {"User-Agent": os.environ.get("MYSOC_USER_AGENT", "")}
    print(f"Downloading {url} to {zip_path}")
    with httpx.Client() as client:
        response = client.get(url, headers=headers)
        response.raise_for_status()

    # A fresh checkout may not have the raw data directory yet; without it
    # writing the zip below would fail with FileNotFoundError.
    RAW_DATA.mkdir(parents=True, exist_ok=True)

    try:
        zip_path.write_bytes(response.content)
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(dest_folder)
    finally:
        # remove the zip file, even when writing or extraction failed, so a
        # broken download never lingers next to the extracted folders
        zip_path.unlink(missing_ok=True)


def get_latest():
    """
    Return the published date of the first register listed by the API.

    Queries the Parliament interests API for registers and converts the
    ``publishedDate`` of the first returned item into a ``date``.
    NOTE(review): this relies on the API listing the newest register
    first — confirm against the API documentation.
    """
    api_url = "https://interests-api.parliament.uk/api/v1/Registers?Take=20&Skip=0"

    with httpx.Client() as client:
        api_response = client.get(api_url)
        api_response.raise_for_status()
        payload = api_response.json()

    first_register = payload["items"][0]
    published = datetime.fromisoformat(first_register["publishedDate"])
    return published.date()


def process_all_regmem():
    """
    Download and process the register returned by ``get_latest``.

    Only that single register date is handled: it is downloaded (skipped
    when already present locally) and then passed to ``process_register``.
    """
    latest_date = get_latest()
    download_reg_on_date(latest_date)
    process_register(latest_date)


Expand Down

0 comments on commit 7fe9b10

Please sign in to comment.