Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: remove dataset push to HF #1465

Merged
merged 1 commit into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions .github/workflows/container-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.net,static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.net,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=8GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=1GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=0" >> $GITHUB_ENV
- name: Set various variable for production deployment
if: matrix.env == 'robotoff-org'
run: |
Expand All @@ -57,7 +56,6 @@ jobs:
echo "CROP_ALLOWED_DOMAINS=static.openfoodfacts.org,openfoodfacts-images.s3.eu-west-3.amazonaws.com,images.openfoodfacts.org" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_SHARED_BUFFERS=16GB" >> $GITHUB_ENV
echo "ROBOTOFF_POSTGRES_WORK_MEM=2GB" >> $GITHUB_ENV
echo "ENABLE_HF_PUSH=1" >> $GITHUB_ENV
- name: Wait for container build workflow
uses: tomchv/[email protected]
id: wait-build
Expand Down Expand Up @@ -174,15 +172,9 @@ jobs:
# Google Cloud credentials
echo "GOOGLE_CREDENTIALS=${{ secrets.GOOGLE_CREDENTIALS }}" >> .env

# Token to push dataset to Hugging Face
echo "HF_TOKEN=${{ secrets.HF_TOKEN }}" >> .env

# Secret key to secure batch job import
echo "BATCH_JOB_KEY=${{ secrets.BATCH_JOB_KEY }}" >> .env

# Whether to enable dataset push to Hugging Face
echo "ENABLE_HF_PUSH=${{ env.ENABLE_HF_PUSH }}" >> .env

- name: Create Docker volumes
uses: appleboy/ssh-action@master
with:
Expand Down
2 changes: 0 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ x-robotoff-base-env:
GOOGLE_APPLICATION_CREDENTIALS: /opt/robotoff/credentials/google/credentials.json
GOOGLE_CREDENTIALS: # JSON credentials pasted as environment variable
BATCH_JOB_KEY: # Secure Batch job import with a token key
HF_TOKEN: # Hugging Face token
ENABLE_HF_PUSH: # Enable Hugging Face dataset push (0 or 1, disabled by default)

x-robotoff-worker-base:
&robotoff-worker
Expand Down
33 changes: 0 additions & 33 deletions robotoff/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1201,38 +1201,5 @@ def launch_normalize_barcode_job(
logger.info("Updated %d images", updated)


@app.command()
def push_jsonl_to_hf(
    repo_id: str = "openfoodfacts/product-database",
    revision: str = "main",
    commit_message: str = "Database updated",
    output_path: Optional[str] = None,
):
    """Convert the JSONL database to Parquet, then push it to Hugging Face.

    If `output_path` is given, the Parquet file is only written locally at
    that location and nothing is pushed.
    """
    import os
    import tempfile

    from robotoff.products import convert_jsonl_to_parquet, push_data_to_hf
    from robotoff.utils.logger import get_logger

    logger = get_logger()
    logger.info("Start command: convert JSONL to Parquet (to HF).")

    if not output_path:
        # No local destination requested: build the file in a throwaway
        # directory and upload it before that directory is removed.
        with tempfile.TemporaryDirectory() as workdir:
            parquet_path = os.path.join(workdir, "converted_data.parquet")
            convert_jsonl_to_parquet(output_file_path=parquet_path)
            push_data_to_hf(
                data_path=parquet_path,
                repo_id=repo_id,
                revision=revision,
                commit_message=commit_message,
            )
    else:
        # Local conversion only.
        convert_jsonl_to_parquet(output_file_path=output_path)

    logger.info("JSONL to Parquet succesfully finished.")


def main() -> None:
app()
50 changes: 0 additions & 50 deletions robotoff/products.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@
from pathlib import Path
from typing import Iterable, Iterator, Optional, Union

import duckdb
import requests
from huggingface_hub import HfApi
from pymongo import MongoClient

from robotoff import settings
Expand Down Expand Up @@ -574,51 +572,3 @@ def get_product(
:return: the product as a dict or None if it was not found
"""
return get_product_store(product_id.server_type).get_product(product_id, projection)


def convert_jsonl_to_parquet(
    output_file_path: str,
    dataset_path: Path = settings.JSONL_DATASET_PATH,
    query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY,
) -> None:
    """Convert the JSONL product dataset into a Parquet file with DuckDB.

    The SQL template stored at `query_path` is read, its `{dataset_path}` and
    `{output_path}` placeholders are substituted verbatim, and the resulting
    query is executed by DuckDB.

    :param output_file_path: value substituted for the `{output_path}`
        placeholder of the SQL template
    :param dataset_path: path of the JSONL dataset, it must exist
    :param query_path: path of the SQL template, it must exist
    :raises FileNotFoundError: if `dataset_path` or `query_path` does not
        exist; the message lists exactly the path(s) that are missing
    :raises duckdb.Error: if the query fails (the error is logged, then
        re-raised)
    """
    logger.info("Start JSONL to Parquet conversion process.")
    # Report precisely which input is missing instead of an ambiguous
    # "A or B was not found".
    missing_paths = [
        str(path) for path in (dataset_path, query_path) if not path.exists()
    ]
    if missing_paths:
        raise FileNotFoundError(f"File(s) not found: {', '.join(missing_paths)}")
    query = (
        # Explicit encoding: do not depend on the platform default locale.
        query_path.read_text(encoding="utf-8")
        .replace("{dataset_path}", str(dataset_path))
        .replace("{output_path}", output_file_path)
    )
    try:
        duckdb.sql(query)
    except duckdb.Error as e:
        logger.error(f"Error executing query: {query}\nError message: {e}")
        raise
    logger.info("JSONL successfully converted into Parquet file.")


def push_data_to_hf(
    data_path: str,
    repo_id: str = "openfoodfacts/product-database",
    revision: str = "main",
    commit_message: str = "Database updated",
) -> None:
    """Upload a local Parquet file to a Hugging Face dataset repository.

    The file is uploaded as `products.parquet` in the target repository.

    :param data_path: path of the local Parquet file to upload
    :param repo_id: identifier of the target Hugging Face repository
    :param revision: revision of the repository to upload to
    :param commit_message: message of the commit created by the upload
    :raises FileNotFoundError: if `data_path` does not exist
    :raises ValueError: if `data_path` does not have a `.parquet` extension
    """
    logger.info(f"Start pushing data to Hugging Face at {repo_id}")

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Data is missing: {data_path}")

    extension = os.path.splitext(data_path)[-1]
    if extension != ".parquet":
        raise ValueError(f"A parquet file is expected. Got {extension} instead.")

    # The HF Hub API is used because it is far more flexible than
    # push_to_hub().
    hf_client = HfApi()
    hf_client.upload_file(
        path_or_fileobj=data_path,
        repo_id=repo_id,
        revision=revision,
        repo_type="dataset",
        path_in_repo="products.parquet",
        commit_message=commit_message,
    )

    logger.info(f"Data succesfully pushed to Hugging Face at {repo_id}")
26 changes: 2 additions & 24 deletions robotoff/scheduler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import datetime
import os
import tempfile
import uuid
from typing import Iterable

Expand All @@ -24,11 +23,9 @@
from robotoff.models import Prediction, ProductInsight, db
from robotoff.products import (
Product,
convert_jsonl_to_parquet,
fetch_dataset,
get_min_product_store,
has_dataset_changed,
push_data_to_hf,
)
from robotoff.types import InsightType, ServerType
from robotoff.utils import get_logger
Expand Down Expand Up @@ -294,34 +291,15 @@

# this job does not use the database
def _update_data() -> None:
"""Download the latest version of the Product Opener product JSONL dump,
convert it to Parquet format and push it to Hugging Face Hub.

Conversion to Parquet is only performed if the envvar ENABLE_HF_PUSH is
set to 1.
"""
"""Download the latest version of the Product Opener product JSONL dump."""
logger.info("Downloading new version of product dataset")
ds_changed = False
try:
if ds_changed := has_dataset_changed():
if has_dataset_changed():

Check warning on line 297 in robotoff/scheduler/__init__.py

View check run for this annotation

Codecov / codecov/patch

robotoff/scheduler/__init__.py#L297

Added line #L297 was not covered by tests
fetch_dataset()
except requests.exceptions.RequestException:
logger.exception("Exception during product dataset refresh")
return

if not settings.ENABLE_HF_PUSH:
logger.info("HF push is disabled, skipping Parquet conversion")
return

if ds_changed:
logger.info("Starting conversion of JSONL to Parquet (to HF)")
with tempfile.TemporaryDirectory() as tmp_dir:
file_path = os.path.join(tmp_dir, "converted_data.parquet")
convert_jsonl_to_parquet(output_file_path=file_path)
push_data_to_hf(data_path=file_path)
else:
logger.info("No changes in product dataset, skipping Parquet conversion")


def transform_insight_iter(insights_iter: Iterable[dict]):
for insight in insights_iter:
Expand Down
4 changes: 0 additions & 4 deletions robotoff/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,3 @@ def get_package_version() -> str:

# Batch jobs
GOOGLE_PROJECT_NAME = "robotoff"

# SQL queries paths
JSONL_TO_PARQUET_SQL_QUERY = PROJECT_DIR / "robotoff/utils/sql/jsonl_to_parquet.sql"
ENABLE_HF_PUSH = bool(int(os.environ.get("ENABLE_HF_PUSH", 0)))
135 changes: 0 additions & 135 deletions robotoff/utils/sql/jsonl_to_parquet.sql

This file was deleted.

14 changes: 1 addition & 13 deletions tests/unit/test_products.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import json
from pathlib import Path
from typing import Optional

import pytest

from robotoff.products import convert_jsonl_to_parquet, is_special_image, is_valid_image
from robotoff.products import is_special_image, is_valid_image
from robotoff.settings import TEST_DATA_DIR
from robotoff.types import JSONType

Expand Down Expand Up @@ -52,14 +51,3 @@ def test_is_valid_image(
output: bool,
):
assert is_valid_image(images, image_path) is output


class TestConvertJSONLToParquet:
    """Unit tests for `convert_jsonl_to_parquet`."""

    def test_convert_jsonl_to_parquet_data_missing(self):
        """Non-existing dataset and query paths raise `FileNotFoundError`."""
        missing = Path("non/existing/dataset/path")
        call_kwargs = {
            "output_file_path": "any_path",
            "dataset_path": missing,
            "query_path": missing,
        }
        with pytest.raises(FileNotFoundError):
            convert_jsonl_to_parquet(**call_kwargs)
Loading