Skip to content

Commit

Permalink
Merge pull request #6 from edsu/mypy
Browse files Browse the repository at this point in the history
Check types and style in CI
  • Loading branch information
edsu authored Jan 5, 2024
2 parents 702b2a4 + 83fd8a5 commit 4f21fff
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 86 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ jobs:
pip install poetry
poetry install
- name: Check formatting
run: poetry run black --check .
- name: Style check
uses: chartboost/ruff-action@v1

- name: Test with pytest
- name: Check types
run: poetry run mypy .

- name: Run tests
run: poetry run pytest -v
13 changes: 8 additions & 5 deletions marctable/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from collections.abc import Callable
from io import IOBase
from typing import BinaryIO, TextIO

import click

Expand Down Expand Up @@ -38,7 +40,7 @@ def rule_params(f: Callable) -> Callable:
@cli.command()
@io_params
@rule_params
def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None:
"""
Convert MARC to CSV.
"""
Expand All @@ -48,7 +50,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non
@cli.command()
@io_params
@rule_params
def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def parquet(infile: BinaryIO, outfile: IOBase, rules: list, batch: int) -> None:
"""
Convert MARC to Parquet.
"""
Expand All @@ -58,7 +60,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) ->
@cli.command()
@io_params
@rule_params
def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None:
def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None:
"""
Convert MARC to JSON Lines (JSONL)
"""
Expand All @@ -67,9 +69,10 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N

@cli.command()
@click.argument("outfile", type=click.File("w"), default="-")
def avram(outfile: click.File) -> None:
def avram(outfile: TextIO) -> None:
"""
Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website.
Generate Avram (YAML) from scraping the Library of Congress MARC
bibliographic website.
"""
marctable.marc.crawl(outfile=outfile)

Expand Down
92 changes: 53 additions & 39 deletions marctable/marc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
import re
import sys
from functools import cache
from typing import IO, Generator
from typing import IO, Generator, List, Optional, Type
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag


class Subfield:
Expand All @@ -28,16 +28,21 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None:
self.repeatable = repeatable

@classmethod
def from_dict(_, d: dict):
return Subfield(d.get("code"), d.get("label"), d.get("repeatable"))
def from_dict(cls: Type["Subfield"], d: dict) -> "Subfield":
return Subfield(d["code"], d["label"], d["repeatable"])

def to_dict(self) -> dict:
return {"code": self.code, "label": self.label, "repeatable": self.repeatable}


class Field:
def __init__(
self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None
self,
tag: str,
label: str,
subfields: list[Subfield],
repeatable: bool = False,
url: Optional[str] = None,
) -> None:
self.tag = tag
self.label = label
Expand All @@ -47,71 +52,66 @@ def __init__(

def __str__(self) -> str:
if len(self.subfields) > 0:
subfields = ": " + (",".join(self.subfields.keys()))
subfields = ": " + (",".join([sf.code for sf in self.subfields]))
else:
subfields = ""
return (
f"{self.tag} {self.label}: {'R' if self.repeatable else 'NR'} {subfields}"
)

@classmethod
def from_dict(klass, d: dict):
def from_dict(cls: Type["Field"], d: dict) -> "Field":
return Field(
tag=d.get("tag"),
label=d.get("label"),
repeatable=d.get("repeatable"),
tag=d["tag"],
label=d["label"],
repeatable=d["repeatable"],
url=d.get("url"),
subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()],
)

def to_dict(self) -> dict:
return {
d = {
"tag": self.tag,
"label": self.label,
"repeatable": self.repeatable,
"url": self.url,
"subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()},
}

def to_avram(self) -> dict:
d = self.to_dict()
if len(d["subfields"]) == 0:
del d["subfields"]
if self.subfields is not None:
d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields}

return d

def get_subfield(self, code: str) -> Subfield:
for sf in self.subfields:
if sf.code == code:
return sf
return None
raise SchemaSubfieldError(f"{code} is not a valid subfield in field {self.tag}")


class MARC:
def __init__(self) -> None:
self.fields = []
self.fields: List[Field] = []

@cache
def get_field(self, tag: str) -> Field:
for field in self.fields:
if field.tag == tag:
return field
return None
raise SchemaFieldError(f"{tag} is not a defined field tag in Avram schema")

@cache
def get_subfield(self, tag: str, code: str) -> Subfield:
field = self.get_field(tag)
if field:
return field.get_subfield(code)
else:
return None
return field.get_subfield(code)

@property
def avram_file(self):
def avram_file(self) -> pathlib.Path:
return pathlib.Path(__file__).parent / "marc.json"

@classmethod
@cache
def from_avram(cls, avram_file: IO = None) -> dict:
def from_avram(cls: Type["MARC"], avram_file: Optional[IO] = None) -> "MARC":
marc = MARC()

if avram_file is None:
Expand All @@ -122,7 +122,7 @@ def from_avram(cls, avram_file: IO = None) -> dict:

return marc

def write_avram(self, avram_file: IO = None) -> None:
def to_avram(self, avram_file: Optional[IO] = None) -> None:
if avram_file is None:
avram_file = self.avram_file.open("w")

Expand All @@ -131,11 +131,19 @@ def write_avram(self, avram_file: IO = None) -> None:
"url": "https://www.loc.gov/marc/bibliographic/",
"family": "marc",
"language": "en",
"fields": {f.tag: f.to_avram() for f in self.fields},
"fields": {f.tag: f.to_dict() for f in self.fields},
}
json.dump(d, avram_file, indent=2)


class SchemaFieldError(Exception):
pass


class SchemaSubfieldError(Exception):
pass


def fields() -> Generator[Field, None, None]:
toc_url = "https://www.loc.gov/marc/bibliographic/"
toc_doc = _soup(toc_url)
Expand All @@ -150,30 +158,34 @@ def fields() -> Generator[Field, None, None]:
yield field


def make_field(url: str) -> Field:
def make_field(url: str) -> Optional[Field]:
soup = _soup(url)
h1 = soup.select_one("h1", first=True).text.strip()
if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1):
h1: Optional[Tag] = soup.select_one("h1")
if h1 is None:
raise Exception("Expecting h1 element in {url}")

h1_text: str = h1.text.strip()
if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text):
tag, label, repeatable = m1.groups()

# most pages put the subfield info in a list
subfields = {}
subfields = []
for el in soup.select("table.subfields li"):
if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text):
subfields[m2.group(1)] = Subfield(
m2.group(1), m2.group(2), m2.group(3) == "R"
)
subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R"))

# some pages use a different layout, of course
if len(subfields) == 0:
for el in soup.select('td[colspan="1"]'):
for text in el.text.split("$"):
text = text.strip()
if m2 := re.match(r"^(.) - (.+) \((.+)\)$", text):
subfields[m2.group(1)] = Subfield(
code=m2.group(1),
label=m2.group(2),
repeatable=m2.group(3) == "R",
subfields.append(
Subfield(
code=m2.group(1),
label=m2.group(2),
repeatable=m2.group(3) == "R",
)
)

return Field(
Expand All @@ -184,6 +196,8 @@ def make_field(url: str) -> Field:
subfields=subfields,
)

return None


# scrape the loc website for the marc fields
def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
Expand All @@ -194,7 +208,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None:
print(f)
if n != 0 and len(marc.fields) >= n:
break
marc.write_avram(outfile)
marc.to_avram(outfile)


def _soup(url: str) -> BeautifulSoup:
Expand Down
Loading

0 comments on commit 4f21fff

Please sign in to comment.