diff --git a/marctable/__init__.py b/marctable/__init__.py index 94144da..fb8273b 100644 --- a/marctable/__init__.py +++ b/marctable/__init__.py @@ -1,4 +1,5 @@ from collections.abc import Callable +from typing import BinaryIO, TextIO import click @@ -38,7 +39,7 @@ def rule_params(f: Callable) -> Callable: @cli.command() @io_params @rule_params -def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def csv(infile: BinaryIO, outfile: TextIO, rules: list, batch: int) -> None: """ Convert MARC to CSV. """ @@ -48,7 +49,7 @@ def csv(infile: click.File, outfile: click.File, rules: list, batch: int) -> Non @cli.command() @io_params @rule_params -def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def parquet(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None: """ Convert MARC to Parquet. """ @@ -58,7 +59,7 @@ def parquet(infile: click.File, outfile: click.File, rules: list, batch: int) -> @cli.command() @io_params @rule_params -def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> None: +def jsonl(infile: BinaryIO, outfile: BinaryIO, rules: list, batch: int) -> None: """ Convert MARC to JSON Lines (JSONL) """ @@ -67,7 +68,7 @@ def jsonl(infile: click.File, outfile: click.File, rules: list, batch: int) -> N @cli.command() @click.argument("outfile", type=click.File("w"), default="-") -def avram(outfile: click.File) -> None: +def avram(outfile: TextIO) -> None: """ Generate Avram (YAML) from scraping the Library of Congress MARC bibliographic website. 
""" diff --git a/marctable/marc.py b/marctable/marc.py index 42fd48b..9d57a5c 100644 --- a/marctable/marc.py +++ b/marctable/marc.py @@ -14,11 +14,11 @@ import re import sys from functools import cache -from typing import IO, Generator +from typing import IO, Generator, List, Optional from urllib.parse import urljoin import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag class Subfield: @@ -29,7 +29,7 @@ def __init__(self, code: str, label: str, repeatable: bool = False) -> None: @classmethod def from_dict(_, d: dict): - return Subfield(d.get("code"), d.get("label"), d.get("repeatable")) + return Subfield(d["code"], d["label"], d["repeatable"]) def to_dict(self) -> dict: return {"code": self.code, "label": self.label, "repeatable": self.repeatable} @@ -37,7 +37,12 @@ def to_dict(self) -> dict: class Field: def __init__( - self, tag: str, label: str, subfields: dict, repeatable: False, url: str = None + self, + tag: str, + label: str, + subfields: list[Subfield], + repeatable: bool = False, + url: Optional[str] = None, ) -> None: self.tag = tag self.label = label @@ -47,7 +52,7 @@ def __init__( def __str__(self) -> str: if len(self.subfields) > 0: - subfields = ": " + (",".join(self.subfields.keys())) + subfields = ": " + (",".join([sf.code for sf in self.subfields])) else: subfields = "" return ( @@ -57,29 +62,27 @@ def __str__(self) -> str: @classmethod def from_dict(klass, d: dict): return Field( - tag=d.get("tag"), - label=d.get("label"), - repeatable=d.get("repeatable"), + tag=d["tag"], + label=d["label"], + repeatable=d["repeatable"], url=d.get("url"), subfields=[Subfield.from_dict(d) for d in d.get("subfields", {}).values()], ) def to_dict(self) -> dict: - return { + d = { "tag": self.tag, "label": self.label, "repeatable": self.repeatable, "url": self.url, - "subfields": {sf.code: sf.to_dict() for sf in self.subfields.values()}, } - def to_avram(self) -> dict: - d = self.to_dict() - if len(d["subfields"]) == 0: - del 
d["subfields"] + if self.subfields is not None: + d["subfields"] = {sf.code: sf.to_dict() for sf in self.subfields} + return d - def get_subfield(self, code: str) -> Subfield: + def get_subfield(self, code: str) -> Optional[Subfield]: for sf in self.subfields: if sf.code == code: return sf @@ -88,17 +91,17 @@ def get_subfield(self, code: str) -> Subfield: class MARC: def __init__(self) -> None: - self.fields = [] + self.fields: List[Field] = [] @cache - def get_field(self, tag: str) -> Field: + def get_field(self, tag: str) -> Optional[Field]: for field in self.fields: if field.tag == tag: return field return None @cache - def get_subfield(self, tag: str, code: str) -> Subfield: + def get_subfield(self, tag: str, code: str) -> Optional[Subfield]: field = self.get_field(tag) if field: return field.get_subfield(code) @@ -111,7 +114,7 @@ def avram_file(self): @classmethod @cache - def from_avram(cls, avram_file: IO = None) -> dict: + def from_avram(cls, avram_file: Optional[IO] = None): marc = MARC() if avram_file is None: @@ -122,7 +125,7 @@ def from_avram(cls, avram_file: IO = None) -> dict: return marc - def write_avram(self, avram_file: IO = None) -> None: + def to_avram(self, avram_file: Optional[IO] = None) -> None: if avram_file is None: avram_file = self.avram_file.open("w") @@ -131,7 +134,7 @@ def write_avram(self, avram_file: IO = None) -> None: "url": "https://www.loc.gov/marc/bibliographic/", "family": "marc", "language": "en", - "fields": {f.tag: f.to_avram() for f in self.fields}, + "fields": {f.tag: f.to_dict() for f in self.fields}, } json.dump(d, avram_file, indent=2) @@ -152,17 +155,19 @@ def fields() -> Generator[Field, None, None]: def make_field(url: str) -> Field: soup = _soup(url) - h1 = soup.select_one("h1", first=True).text.strip() - if m1 := re.match(r"^(\d+) - (.+) \((.+)\)$", h1): + h1: Optional[Tag] = soup.select_one("h1") + if h1 is None: + raise Exception(f"Expecting h1 element in {url}") + + h1_text: str = h1.text.strip() + if m1 := 
re.match(r"^(\d+) - (.+) \((.+)\)$", h1_text): tag, label, repeatable = m1.groups() # most pages put the subfield info in a list - subfields = {} + subfields = [] for el in soup.select("table.subfields li"): if m2 := re.match(r"^\$(.) - (.+) \((.+)\)$", el.text): - subfields[m2.group(1)] = Subfield( - m2.group(1), m2.group(2), m2.group(3) == "R" - ) + subfields.append(Subfield(m2.group(1), m2.group(2), m2.group(3) == "R")) # some pages use a different layout, of course if len(subfields) == 0: @@ -170,10 +175,12 @@ def make_field(url: str) -> Field: for text in el.text.split("$"): text = text.strip() if m2 := re.match(r"^(.) - (.+) \((.+)\)$", text): - subfields[m2.group(1)] = Subfield( - code=m2.group(1), - label=m2.group(2), - repeatable=m2.group(3) == "R", + subfields.append( + Subfield( + code=m2.group(1), + label=m2.group(2), + repeatable=m2.group(3) == "R", + ) ) return Field( @@ -194,7 +201,7 @@ def crawl(n: int = 0, quiet: bool = False, outfile: IO = sys.stdout) -> None: print(f) if n != 0 and len(marc.fields) >= n: break - marc.write_avram(outfile) + marc.to_avram(outfile) def _soup(url: str) -> BeautifulSoup: diff --git a/marctable/utils.py b/marctable/utils.py index 5991fbc..002b686 100644 --- a/marctable/utils.py +++ b/marctable/utils.py @@ -1,6 +1,5 @@ import json -import typing -from typing import Generator +from typing import BinaryIO, Dict, Generator, List, TextIO, Union import pyarrow import pymarc @@ -10,7 +9,7 @@ from .marc import MARC -def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame: +def to_dataframe(marc_input: BinaryIO, rules: list = []) -> DataFrame: """ Return a single DataFrame for the entire dataset. 
""" @@ -18,8 +17,8 @@ def to_dataframe(marc_input: typing.BinaryIO, rules: list = []) -> DataFrame: def to_csv( - marc_input: typing.BinaryIO, - csv_output: typing.TextIO, + marc_input: BinaryIO, + csv_output: TextIO, rules: list = [], batch: int = 1000, ) -> None: @@ -32,8 +31,8 @@ def to_csv( def to_jsonl( - marc_input: typing.BinaryIO, - jsonl_output: typing.BinaryIO, + marc_input: BinaryIO, + jsonl_output: BinaryIO, rules: list = [], batch: int = 1000, ) -> None: @@ -46,8 +45,8 @@ def to_jsonl( def to_parquet( - marc_input: typing.BinaryIO, - parquet_output: typing.BinaryIO, + marc_input: BinaryIO, + parquet_output: BinaryIO, rules: list = [], batch: int = 1000, ) -> None: @@ -55,7 +54,7 @@ def to_parquet( Convert MARC to Parquet. """ schema = _make_parquet_schema(rules) - writer = ParquetWriter(parquet_output, schema, compression="gzip") + writer = ParquetWriter(parquet_output, schema, compression="SNAPPY") for records_batch in records_iter(marc_input, rules=rules, batch=batch): table = pyarrow.Table.from_pylist(records_batch, schema) writer.write_table(table) @@ -64,7 +63,7 @@ def to_parquet( def dataframe_iter( - marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000 + marc_input: BinaryIO, rules: list = [], batch: int = 1000 ) -> Generator[DataFrame, None, None]: columns = _columns(_mapping(rules)) for records_batch in records_iter(marc_input, rules, batch): @@ -72,7 +71,7 @@ def dataframe_iter( def records_iter( - marc_input: typing.BinaryIO, rules: list = [], batch: int = 1000 + marc_input: BinaryIO, rules: list = [], batch: int = 1000 ) -> Generator[DataFrame, None, None]: """ Read MARC input and generate a list of dictionaries, where each list element @@ -87,7 +86,7 @@ def records_iter( if record is None: continue - r = {} + r: Dict[str, Union[str, List[str]]] = {} for field in record.fields: if field.tag not in mapping: continue @@ -209,8 +208,9 @@ def _make_parquet_schema(rules: list) -> pyarrow.Schema: typ = pyarrow.string() 
cols.append((f"F{field_tag}", typ)) else: - for sf in subfields: - if marc.get_subfield(field_tag, sf).repeatable: + for sf_code in subfields: + sf = marc.get_subfield(field_tag, sf_code) + if sf is not None and sf.repeatable: typ = pyarrow.list_(pyarrow.string()) else: typ = pyarrow.string() diff --git a/pyproject.toml b/pyproject.toml index eb7e41b..20554e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,11 @@ click = "^8.1.7" [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" black = "^23.12.0" +types-requests = "^2.31.0.10" +types-beautifulsoup4 = "^4.12.0.7" +mypy = "^1.8.0" +pandas-stubs = "^2.1.4.231227" +pyarrow-stubs = "^10.0.1.7" [tool.poetry.scripts] marctable = "marctable:main"