Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically add metadata to Hugging Face Hub repos when uploading projects #793

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,8 +616,15 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi
"--commit-message",
help="""The summary / title / first line of the generated commit.""",
)
@click.option(
"--modelcard/--no-modelcard",
default=True,
help="Update or create a ModelCard with upload.",
juhoinkinen marked this conversation as resolved.
Show resolved Hide resolved
)
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
def run_upload(
project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
"""
Upload selected projects and their vocabularies to a Hugging Face Hub repository.
\f
Expand Down Expand Up @@ -655,6 +662,8 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
finally:
for fobj in fobjs:
fobj.close()
if modelcard:
hfh_util.upsert_modelcard(repo_id, projects, token, revision)


@cli.command("download")
Expand Down Expand Up @@ -690,7 +699,9 @@ def run_download(project_ids_pattern, repo_id, token, revision, force):
`project_ids_pattern` from the specified Hugging Face Hub repository and
unzips the archives to `data/` directory and places the configuration files
to `projects.d/` directory. An authentication token and revision can
be given with options.
be given with options. If the README.md does not exist in the repository it is
created with default contents and metadata of the uploaded projects; if it exists,
juhoinkinen marked this conversation as resolved.
Show resolved Hide resolved
its metadata are updated as necessary.
"""

project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
Expand Down
46 changes: 46 additions & 0 deletions annif/hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,49 @@ def get_vocab_id_from_config(config_path: str) -> str:
config.read(config_path)
section = config.sections()[0]
return config[section]["vocab"]


def upsert_modelcard(repo_id, projects, token, revision):
    """Create or update the Model Card (README.md) of a Hugging Face Hub repository.

    If the repository's file listing contains ``README.md``, the existing card
    is loaded and updated; otherwise a new card is built with
    ``_create_modelcard``. In both cases the vocabulary languages of the given
    projects are merged into the card's ``language`` metadata and the card is
    pushed to the Hub.

    Args:
        repo_id: Identifier of the Hub repository.
        projects: Iterable of project objects; each must expose ``vocab_lang``.
        token: Authentication token forwarded to the file listing and the push.
        revision: Revision forwarded to the file listing and the push.
    """
    from huggingface_hub import ModelCard

    card_exists = "README.md" in _list_files_in_hf_hub(repo_id, token, revision)
    if card_exists:
        # NOTE(review): the card is loaded without token/revision, unlike the
        # listing above -- confirm this is fine for private repositories and
        # non-default revisions.
        card = ModelCard.load(repo_id)
        commit_message = "Update README.md with Annif"
    else:
        card = _create_modelcard(repo_id)
        commit_message = "Create README.md with Annif"

    langs_existing = card.data.language or []
    if isinstance(langs_existing, str):
        # Card metadata may hold a single language as a plain string; wrap it
        # so that it is not split into individual characters below.
        langs_existing = [langs_existing]
    langs_to_add = [proj.vocab_lang for proj in projects]
    # dict.fromkeys() de-duplicates while keeping first-seen order, so the
    # resulting metadata is stable from one upload to the next.
    card.data.language = list(dict.fromkeys([*langs_existing, *langs_to_add]))

    card.push_to_hub(
        repo_id=repo_id, token=token, revision=revision, commit_message=commit_message
    )


def _create_modelcard(repo_id):
    """Build a new Model Card with default contents for the given repository.

    The card body has a heading made from the repository name (the last
    ``/``-separated component of ``repo_id``) and a short usage section showing
    the ``annif download`` command. The card metadata gets the pipeline tag
    ``text-classification`` and the tag ``annif``.

    Args:
        repo_id: Identifier of the Hub repository, e.g. ``user/repo``.

    Returns:
        The newly created ``huggingface_hub.ModelCard``; it is not pushed here.
    """
    from huggingface_hub import ModelCard

    # Take the last component so that an id without a namespace prefix does
    # not raise IndexError; for "user/repo" this is the same as index 1.
    repo_name = repo_id.split("/")[-1]

    content = f"""
---

---

# {repo_name}

## Usage

Use the `annif download` command to download selected projects with Annif;
for example, to download all projects in this repository run

annif download "*" {repo_id}

"""
    card = ModelCard(content)
    card.data.pipeline_tag = "text-classification"
    card.data.tags = ["annif"]
    return card
26 changes: 24 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,10 +1069,13 @@ def test_run_help():
assert "Run Annif in server mode for development." in result.output


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
def test_upload(
create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"])
assert not result.exception
assert create_commit.call_count == 1
Expand Down Expand Up @@ -1108,16 +1111,35 @@ def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
)
in create_commit.call_args_list
)
assert upsert_modelcard.call_count == 1


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload_many(
    create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
    """Upload with a wildcard project pattern and check the mocked Hub calls."""
    cli_args = ["upload", "dummy-*", "dummy-repo"]
    outcome = runner.invoke(annif.cli.cli, cli_args)

    assert not outcome.exception
    # Everything is expected to go into one commit, followed by one card upsert.
    assert create_commit.call_count == 1
    assert CommitOperationAdd.call_count == 11
    assert upsert_modelcard.call_count == 1


@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
@mock.patch("annif.hfh_util.upsert_modelcard")
def test_upload_no_modelcard_upsert(
    upsert_modelcard, create_commit, CommitOperationAdd, preupload_lfs_files
):
    """With --no-modelcard the upload must not touch the Model Card."""
    cli_args = ["upload", "dummy-fi", "dummy-repo", "--no-modelcard"]
    outcome = runner.invoke(annif.cli.cli, cli_args)

    assert not outcome.exception
    upsert_modelcard.assert_not_called()


def test_upload_nonexistent_repo():
Expand Down
64 changes: 64 additions & 0 deletions tests/test_hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,67 @@ def test_copy_project_config_overwrite(copy, exists):
assert copy.call_args == mock.call(
"tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"
)


@mock.patch("annif.hfh_util._list_files_in_hf_hub", return_value=["README.md"])
@mock.patch("huggingface_hub.ModelCard")
def test_upsert_modelcard_existing_card(ModelCard, _list_files_in_hf_hub, project):
    """An existing README.md is loaded, extended with the project language, pushed."""
    repo_id = "annif-user/Annif-HFH-repo"
    token = "mytoken"
    revision = "main"
    project.vocab_lang = "fi"
    loaded_card = ModelCard.load.return_value
    loaded_card.data.language = ["en"]  # language already present in the card

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    # The card must come from load(), never from the constructor.
    ModelCard.assert_not_called()
    ModelCard.load.assert_called_once_with(repo_id)

    expected_push = dict(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Update README.md with Annif",
    )
    loaded_card.push_to_hub.assert_called_once_with(**expected_push)
    assert sorted(loaded_card.data.language) == ["en", "fi"]


@mock.patch("annif.hfh_util._list_files_in_hf_hub", return_value=[])
@mock.patch("huggingface_hub.ModelCard")
def test_upsert_modelcard_new_card(ModelCard, _list_files_in_hf_hub, project):
    """Without a README.md in the repo a new card is constructed and pushed."""
    repo_id = "annif-user/Annif-HFH-repo"
    token = "mytoken"
    revision = "main"
    project.vocab_lang = "fi"

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    ModelCard.assert_called_once()
    new_card = ModelCard.return_value
    expected_push = dict(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Create README.md with Annif",
    )
    new_card.push_to_hub.assert_called_once_with(**expected_push)
    assert new_card.data.language == ["fi"]


@mock.patch("huggingface_hub.ModelCard")
def test_create_modelcard(ModelCard):
    """The default card has the repo name as heading and the Annif metadata."""
    repo_id = "annif-user/Annif-HFH-repo"

    card = annif.hfh_util._create_modelcard(repo_id)

    # First positional argument of the constructor call is the card content.
    positional_args, _ = ModelCard.call_args
    card_content = positional_args[0]
    assert "# Annif-HFH-repo" in card_content  # README heading
    assert card.data.pipeline_tag == "text-classification"
    assert card.data.tags == ["annif"]
Loading