Skip to content

Commit

Permalink
✨ (cli) generate dataset and catalog from json
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Apr 4, 2024
1 parent fb13ba6 commit 1391d43
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 30 deletions.
94 changes: 64 additions & 30 deletions ftmq/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from ftmq.store import get_store
from ftmq.util import parse_unknown_filters

log = logging.getLogger(__name__)


@click.group(cls=DefaultGroup, default="q", default_if_no_args=True)
def cli() -> None:
Expand Down Expand Up @@ -185,6 +187,36 @@ def dataset_iterate(input_uri: str | None = "-", output_uri: str | None = "-"):
smart_write_proxies(output_uri, dataset.iterate(), serialize=True)


@dataset.command("generate")
@click.option(
    "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
)
@click.option(
    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
)
@click.option(
    "--stats",
    is_flag=True,
    default=False,
    show_default=True,
    help="Calculate stats",
)
def make_dataset(
    input_uri: str | None = "-",
    output_uri: str | None = "-",
    stats: bool | None = False,
):
    """
    Convert dataset YAML specification into json and optionally calculate statistics
    """
    # NOTE: the docstring above doubles as the click help text for this command.
    spec = Dataset._from_uri(input_uri)
    if stats:
        # Collect statistics over everything the dataset iterates and attach
        # them to the dataset before it is serialized.
        spec.apply_stats(Collector().collect_many(spec.iterate()))
    # Dump the model as JSON, encode to bytes and hand it to `smart_write`.
    payload = spec.model_dump_json().encode()
    smart_write(output_uri, payload)


@cli.group()
def catalog() -> None:
    # Click group callback with no work of its own; it only hosts the
    # `catalog ...` subcommands (e.g. `catalog generate`). A comment is used
    # instead of a docstring so the group's --help output stays unchanged.
    pass
Expand All @@ -202,6 +234,38 @@ def catalog_iterate(input_uri: str | None = "-", output_uri: str | None = "-"):
smart_write_proxies(output_uri, catalog.iterate(), serialize=True)


@catalog.command("generate")
@click.option(
    "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
)
@click.option(
    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
)
@click.option(
    "--stats",
    is_flag=True,
    default=False,
    show_default=True,
    help="Calculate stats for each dataset",
)
def make_catalog(
    input_uri: str | None = "-",
    output_uri: str | None = "-",
    stats: bool | None = False,
):
    """
    Convert catalog YAML specification into json and fetch dataset metadata
    """
    # NOTE: the docstring above doubles as the click help text for this command.
    # The local is named `spec` so it does not shadow the module-level
    # `catalog` click group inside this function.
    spec = Catalog._from_uri(input_uri)
    if stats:
        for ds in spec.datasets:
            # Lazy %-style arguments: the message is only formatted when the
            # record is actually emitted by a handler.
            log.info("Generating stats for `%s` ...", ds.name)
            # A fresh Collector per dataset, so each dataset gets statistics
            # collected from its own iteration only.
            collector = Collector()
            ds.apply_stats(collector.collect_many(ds.iterate()))
    # Dump the whole catalog model as JSON bytes and hand it to `smart_write`.
    smart_write(output_uri, spec.model_dump_json().encode())


@cli.group()
def store() -> None:
    # Click group callback with no work of its own. Subcommands are presumably
    # registered on it via `@store.command(...)` further down the module
    # (not visible in this excerpt). A comment is used instead of a docstring
    # so the group's --help output stays unchanged.
    pass
Expand Down Expand Up @@ -274,36 +338,6 @@ def store_iterate(
smart_write_proxies(output_uri, store.iterate(), serialize=True)


@cli.command("make-dataset")
@click.option(
    "-i", "--input-uri", default="-", show_default=True, help="input file or uri"
)
@click.option(
    "-o", "--output-uri", default="-", show_default=True, help="output file or uri"
)
@click.option(
    "--stats",
    is_flag=True,
    default=False,
    show_default=True,
    help="Calculate stats",
)
def make_dataset(
    input_uri: str | None = "-",
    output_uri: str | None = "-",
    stats: bool | None = False,
):
    """
    Convert dataset YAML specification into json and optionally calculate statistics
    """
    # NOTE: the docstring above doubles as the click help text for this command.
    loaded = Dataset._from_uri(input_uri)
    if stats:
        # Statistics are gathered from the dataset's own iteration and then
        # applied back onto the dataset model prior to serialization.
        stats_collector = Collector()
        collected = stats_collector.collect_many(loaded.iterate())
        loaded.apply_stats(collected)
    # JSON-dump the model, encode to bytes and pass it on to `smart_write`.
    encoded = loaded.model_dump_json().encode()
    smart_write(output_uri, encoded)


@cli.command("aggregate")
@click.option(
"-i", "--input-uri", default="-", show_default=True, help="input file or uri"
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/dataset.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
name: test_dataset
15 changes: 15 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from ftmq.cli import cli
from ftmq.io import make_proxy
from ftmq.model import Catalog, Dataset

runner = CliRunner()

Expand Down Expand Up @@ -214,3 +215,17 @@ def test_cli_aggregation(fixtures_path: Path):
}
},
}


def test_cli_generate(fixtures_path: Path):
    """Check that both `generate` commands emit JSON the models can load."""
    # dataset
    uri = str(fixtures_path / "dataset.yml")
    result = runner.invoke(cli, ["dataset", "generate", "-i", uri])
    # Surface the CLI output on failure instead of an opaque JSON decode error.
    assert result.exit_code == 0, result.output
    data = orjson.loads(result.output)
    assert Dataset(**data)

    # catalog
    uri = str(fixtures_path / "catalog.yml")
    result = runner.invoke(cli, ["catalog", "generate", "-i", uri])
    assert result.exit_code == 0, result.output
    data = orjson.loads(result.output)
    assert Catalog(**data)

0 comments on commit 1391d43

Please sign in to comment.