From d2259a8c16b20716a9407dbe10c06096540ccd51 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 30 Oct 2024 15:28:50 +0100 Subject: [PATCH 1/5] fix: :art: Correct timestamp data type --- robotoff/utils/sql/jsonl_to_parquet.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql index dfc8f8f175..deb91d7e9d 100644 --- a/robotoff/utils/sql/jsonl_to_parquet.sql +++ b/robotoff/utils/sql/jsonl_to_parquet.sql @@ -18,7 +18,7 @@ COPY ( completeness, correctors_tags, countries_tags, - created_t, + to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime creator, data_quality_errors_tags, data_quality_info_tags, @@ -45,7 +45,7 @@ COPY ( ingredients_tags, ingredients_text_with_allergens, ingredients_text, - COLUMNS('ingredients_text_\w{2}$'), + COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. ingredients_with_specified_percent_n, ingredients_with_unspecified_percent_n, ciqual_food_name_tags, @@ -61,10 +61,10 @@ COPY ( languages_codes, last_edit_dates_tags, last_editor, - last_image_t, + to_timestamp(last_image_t)::datetime AS last_image_t, last_modified_by, - last_modified_t, - last_updated_t, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, link, main_countries_tags, manufacturing_places, From dbd45d753e15892abdc75e3d36e3b04a41ceba55 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Fri, 8 Nov 2024 19:08:00 +0100 Subject: [PATCH 2/5] fix: :art: WIP --- robotoff/products.py | 11 +- robotoff/utils/export.py | 116 +++++++++++ robotoff/utils/sql/jsonl_to_parquet.sql | 263 ++++++++++++------------ 3 files changed, 255 insertions(+), 135 deletions(-) create mode 100644 robotoff/utils/export.py diff --git a/robotoff/products.py b/robotoff/products.py index 284e4d8445..f90fc1db77 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -18,6 +18,7 @@ from robotoff import settings from robotoff.types import JSONType, ProductIdentifier, ServerType from robotoff.utils import get_logger, gzip_jsonl_iter, http_session, jsonl_iter +from robotoff.utils import export logger = get_logger(__name__) @@ -592,9 +593,15 @@ def convert_jsonl_to_parquet( .replace("{output_path}", output_file_path) ) try: - duckdb.sql(query) + logger.info("Query the JSONL using DuckDB.") + arrow_batches = duckdb.sql(query).fetch_arrow_reader(batch_size=100000) + logger.info("Post-process extracted data using Arrow") + # arrow_batches = export.postprocess_arrow_batches(arrow_batches) + logger.info("Write post-processed data into Parquet.") + export.sink_to_parquet(output_file_path, batches=arrow_batches) + except duckdb.Error as e: - logger.error(f"Error executing query: {query}\nError message: {e}") + logger.error("Error executing query: %s\nError message: %s", query, e) raise logger.info("JSONL successfully converted into Parquet file.") diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py new file mode 100644 index 0000000000..8df44af415 --- /dev/null +++ b/robotoff/utils/export.py @@ -0,0 +1,116 @@ +"Functions to postprocess the database conversion into Parquet." 
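
[Note: the products.py hunk above (patch 2) stops running the DuckDB query as a
single blocking call and instead streams Arrow record batches, so the `images`
column can be post-processed without loading the whole dataset in memory. A
minimal sketch of that pattern, with placeholder file names and a placeholder
column list rather than the real query:

    import duckdb
    import pyarrow.parquet as pq

    rel = duckdb.sql(
        "SELECT code, to_timestamp(created_t)::datetime AS created_t "
        "FROM read_ndjson('products.jsonl.gz', ignore_errors=True)"
    )
    # fetch_arrow_reader() returns a pyarrow.RecordBatchReader that yields
    # batches lazily instead of materializing the full result.
    reader = rel.fetch_arrow_reader(batch_size=100_000)
    with pq.ParquetWriter("products.parquet", schema=reader.schema) as writer:
        for batch in reader:  # iterating the reader consumes it
            writer.write_batch(batch)

Because a RecordBatchReader is consumed as it is iterated, the extra
`batches.read_next_batch()` inside the loop of sink_to_parquet below skips every
other batch; the later patches in this series rewrite that function.]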
+ +import json +from typing import Iterator + +import pyarrow as pa +import pyarrow.parquet as pq + + +################ +# Schemas +################ +## Images field +_size_schema = pa.struct( + [ + pa.field("h", pa.int32(), nullable=True), + pa.field("w", pa.int32(), nullable=True), + ] +) + +_dict_schema = pa.struct( + [ + pa.field("key", pa.string(), nullable=True), + pa.field("imgid", pa.string(), nullable=True), + pa.field( + "sizes", + pa.struct( + [ + pa.field("100", _size_schema, nullable=True), + pa.field("200", _size_schema, nullable=True), + pa.field("400", _size_schema, nullable=True), + pa.field("full", _size_schema, nullable=True), + ] + ), + nullable=True, + ), + pa.field("uploaded_t", pa.string(), nullable=True), + pa.field("uploader", pa.string(), nullable=True), + ] +) + +IMAGES_DATATYPE = pa.list_(_dict_schema) + + +################ +# Functions +################ +def sink_to_parquet(path: str, batches: pa.RecordBatchReader): + schema = batches.schema + schema = schema.remove(schema.get_field_index("images")) + schema = schema.append(pa.field("images", IMAGES_DATATYPE)) + with pq.ParquetWriter(path, schema=schema) as writer: + for batch in batches: + batch = batches.read_next_batch() + batch = _postprocess_arrow_batch(batch) + # batch = _postprocess_arrow_batch(batch) + writer.write_batch(batch) + + +def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: + + return pa.RecordBatchReader.from_batches( + schema=batches.schema, + batches=[_postprocess_arrow_batch(batch) for batch in batches] + ) + + +def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch: + batch = _postprocess_images(batch) + return batch + + +def _postprocess_images( + batch: pa.RecordBatch, + datatype: pa.DataType = IMAGES_DATATYPE + ): + postprocessed_images = [] + images: list[dict | None] = [ + json.loads(image) if image else None for image in batch["images"].to_pylist() + ] + for image in images: + if image: + postprocessed_images.append( + [ + { + "key": key, + "imgid": str(value.get("imgid", "unknown")), + "sizes": { + "100": { + "h": value.get("sizes", {}).get("100", {}).get("h", 0), + "w": value.get("sizes", {}).get("100", {}).get("w", 0), + }, + "200": { + "h": value.get("sizes", {}).get("200", {}).get("h", 0), + "w": value.get("sizes", {}).get("200", {}).get("w", 0), + }, + "400": { + "h": value.get("sizes", {}).get("400", {}).get("h", 0), + "w": value.get("sizes", {}).get("400", {}).get("w", 0), + }, + "full": { + "h": value.get("sizes", {}).get("full", {}).get("h", 0), + "w": value.get("sizes", {}).get("full", {}).get("w", 0), + }, + }, + "uploaded_t": str(value.get("uploaded_t", "unknown")), + "uploader": str(value.get("uploader", "unknown")), + } + for key, value in image.items() + ] + ) + else: + postprocessed_images.append([]) + images_array = pa.array(postprocessed_images, type=datatype) + batch = batch.set_column(1, "images", images_array) + return batch diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql index deb91d7e9d..cdd6ba7445 100644 --- a/robotoff/utils/sql/jsonl_to_parquet.sql +++ b/robotoff/utils/sql/jsonl_to_parquet.sql @@ -1,135 +1,132 @@ SET threads to 4; SET preserve_insertion_order = false; -COPY ( - SELECT - code, - additives_n, - additives_tags, - allergens_from_ingredients, - allergens_from_user, - allergens_tags, - brands_tags, - categories_properties_tags, - categories, - checkers_tags, - cities_tags, - compared_to_category, - complete, - completeness, - correctors_tags, - 
countries_tags, - to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime - creator, - data_quality_errors_tags, - data_quality_info_tags, - data_quality_warnings_tags, - data_sources_tags, - ecoscore_data, - ecoscore_grade, - ecoscore_score, - ecoscore_tags, - editors, - emb_codes, - emb_codes_tags, - entry_dates_tags, - environment_impact_level, - food_groups_tags, - forest_footprint_data, - generic_name, - grades, - images, - informers_tags, - ingredients_analysis_tags, - ingredients_from_palm_oil_n, - ingredients_n, - ingredients_tags, - ingredients_text_with_allergens, - ingredients_text, - COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. - ingredients_with_specified_percent_n, - ingredients_with_unspecified_percent_n, - ciqual_food_name_tags, - ingredients_percent_analysis, - ingredients_original_tags, - ingredients_without_ciqual_codes_n, - ingredients_without_ciqual_codes, - ingredients, - known_ingredients_n, - labels_tags, - lang, - languages_tags, - languages_codes, - last_edit_dates_tags, - last_editor, - to_timestamp(last_image_t)::datetime AS last_image_t, - last_modified_by, - to_timestamp(last_modified_t)::datetime AS last_modified_t, - to_timestamp(last_updated_t)::datetime AS last_updated_t, - link, - main_countries_tags, - manufacturing_places, - manufacturing_places_tags, - max_imgid, - misc_tags, - minerals_tags, - new_additives_n, - no_nutrition_data, - nova_group, - nova_groups, - nova_groups_markers, - nova_groups_tags, - nucleotides_tags, - nutrient_levels_tags, - unknown_nutrients_tags, - nutriments, - nutriscore_data, - nutriscore_grade, - nutriscore_score, - nutriscore_tags, - nutrition_data_prepared_per, - nutrition_data, - nutrition_grades_tags, - nutrition_score_beverage, - nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, - nutrition_score_warning_no_fiber, - nutrition_score_warning_no_fruits_vegetables_nuts, - obsolete_since_date, - obsolete, - origins_tags, - packaging_recycling_tags, - packaging_shapes_tags, - packaging_tags, - packagings_materials, - packagings_n, - packagings_n, - photographers, - pnns_groups_1_tags, - pnns_groups_2_tags, - popularity_key, - popularity_tags, - product_name, - product_quantity_unit, - product_quantity, - purchase_places_tags, - quantity, - rev, - scans_n, - scores, - serving_quantity, - serving_size, - sources, - sources_fields, - specific_ingredients, - states_tags, - stores, - stores_tags, - traces_tags, - unique_scans_n, - unknown_ingredients_n, - vitamins_tags, - weighers_tags, - with_non_nutritive_sweeteners, - with_sweeteners, - FROM read_ndjson('{dataset_path}', ignore_errors=True) -) TO '{output_path}' (FORMAT PARQUET) -; \ No newline at end of file +SELECT + code, + additives_n, + additives_tags, + allergens_from_ingredients, + allergens_from_user, + allergens_tags, + brands_tags, + categories_properties_tags, + categories, + checkers_tags, + cities_tags, + compared_to_category, + complete, + completeness, + correctors_tags, + countries_tags, + to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime + creator, + data_quality_errors_tags, + data_quality_info_tags, + data_quality_warnings_tags, + data_sources_tags, + ecoscore_data, + ecoscore_grade, + ecoscore_score, + ecoscore_tags, + editors, + emb_codes, + emb_codes_tags, + entry_dates_tags, + environment_impact_level, + food_groups_tags, + forest_footprint_data, + generic_name, + grades, + images, + informers_tags, + ingredients_analysis_tags, + 
ingredients_from_palm_oil_n, + ingredients_n, + ingredients_tags, + ingredients_text_with_allergens, + ingredients_text, + COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. + ingredients_with_specified_percent_n, + ingredients_with_unspecified_percent_n, + ciqual_food_name_tags, + ingredients_percent_analysis, + ingredients_original_tags, + ingredients_without_ciqual_codes_n, + ingredients_without_ciqual_codes, + ingredients, + known_ingredients_n, + labels_tags, + lang, + languages_tags, + languages_codes, + last_edit_dates_tags, + last_editor, + to_timestamp(last_image_t)::datetime AS last_image_t, + last_modified_by, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, + link, + main_countries_tags, + manufacturing_places, + manufacturing_places_tags, + max_imgid, + misc_tags, + minerals_tags, + new_additives_n, + no_nutrition_data, + nova_group, + nova_groups, + nova_groups_markers, + nova_groups_tags, + nucleotides_tags, + nutrient_levels_tags, + unknown_nutrients_tags, + nutriments, + nutriscore_data, + nutriscore_grade, + nutriscore_score, + nutriscore_tags, + nutrition_data_prepared_per, + nutrition_data, + nutrition_grades_tags, + nutrition_score_beverage, + nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, + nutrition_score_warning_no_fiber, + nutrition_score_warning_no_fruits_vegetables_nuts, + obsolete_since_date, + obsolete, + origins_tags, + packaging_recycling_tags, + packaging_shapes_tags, + packaging_tags, + packagings_materials, + packagings_n, + packagings_n, + photographers, + pnns_groups_1_tags, + pnns_groups_2_tags, + popularity_key, + popularity_tags, + product_name, + product_quantity_unit, + product_quantity, + purchase_places_tags, + quantity, + rev, + scans_n, + scores, + serving_quantity, + serving_size, + sources, + sources_fields, + specific_ingredients, + states_tags, + stores, + stores_tags, + traces_tags, + unique_scans_n, + unknown_ingredients_n, + vitamins_tags, + weighers_tags, + with_non_nutritive_sweeteners, + with_sweeteners, +FROM read_ndjson('{dataset_path}', ignore_errors=True) \ No newline at end of file From b64a308e5523ed3e88826539be1789fa229ef953 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 11 Nov 2024 21:01:20 +0100 Subject: [PATCH 3/5] fix: :construction: WIP --- poetry.lock | 87 ++++---- pyproject.toml | 2 +- robotoff/products.py | 37 ++-- robotoff/utils/export.py | 54 +++-- robotoff/utils/sql/jsonl_to_parquet.sql | 262 ++++++++++++------------ 5 files changed, 236 insertions(+), 206 deletions(-) diff --git a/poetry.lock b/poetry.lock index 90bd8ec5ba..053d6f814d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2718,52 +2718,55 @@ six = "*" [[package]] name = "pyarrow" -version = "17.0.0" +version = "18.0.0" description = "Python library for Apache Arrow" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, - {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, - {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, - {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, - {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = 
"sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2"}, + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:603cd8ad4976568954598ef0a6d4ed3dfb78aff3d57fa8d6271f470f0ce7d34f"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58a62549a3e0bc9e03df32f350e10e1efb94ec6cf63e3920c3385b26663948ce"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bc97316840a349485fbb137eb8d0f4d7057e1b2c1272b1a20eebbbe1848f5122"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:2e549a748fa8b8715e734919923f69318c953e077e9c02140ada13e59d043310"}, + {file = "pyarrow-18.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:606e9a3dcb0f52307c5040698ea962685fb1c852d72379ee9412be7de9c5f9e2"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d5795e37c0a33baa618c5e054cd61f586cf76850a251e2b21355e4085def6280"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:5f0510608ccd6e7f02ca8596962afb8c6cc84c453e7be0da4d85f5f4f7b0328a"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:616ea2826c03c16e87f517c46296621a7c51e30400f6d0a61be645f203aa2b93"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1824f5b029ddd289919f354bc285992cb4e32da518758c136271cf66046ef22"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd1b52d0d58dd8f685ced9971eb49f697d753aa7912f0a8f50833c7a7426319"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:320ae9bd45ad7ecc12ec858b3e8e462578de060832b98fc4d671dee9f10d9954"}, + {file = "pyarrow-18.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2c992716cffb1088414f2b478f7af0175fd0a76fea80841b1706baa8fb0ebaad"}, + {file = 
"pyarrow-18.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:e7ab04f272f98ebffd2a0661e4e126036f6936391ba2889ed2d44c5006237802"}, + {file = "pyarrow-18.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:03f40b65a43be159d2f97fd64dc998f769d0995a50c00f07aab58b0b3da87e1f"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be08af84808dff63a76860847c48ec0416928a7b3a17c2f49a072cac7c45efbd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c70c1965cde991b711a98448ccda3486f2a336457cf4ec4dca257a926e149c9"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:00178509f379415a3fcf855af020e3340254f990a8534294ec3cf674d6e255fd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a71ab0589a63a3e987beb2bc172e05f000a5c5be2636b4b263c44034e215b5d7"}, + {file = "pyarrow-18.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:fe92efcdbfa0bcf2fa602e466d7f2905500f33f09eb90bf0bcf2e6ca41b574c8"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:907ee0aa8ca576f5e0cdc20b5aeb2ad4d3953a3b4769fc4b499e00ef0266f02f"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:66dcc216ebae2eb4c37b223feaf82f15b69d502821dde2da138ec5a3716e7463"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc1daf7c425f58527900876354390ee41b0ae962a73ad0959b9d829def583bb1"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:871b292d4b696b09120ed5bde894f79ee2a5f109cb84470546471df264cae136"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:082ba62bdcb939824ba1ce10b8acef5ab621da1f4c4805e07bfd153617ac19d4"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:2c664ab88b9766413197733c1720d3dcd4190e8fa3bbdc3710384630a0a7207b"}, + {file = "pyarrow-18.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc892be34dbd058e8d189b47db1e33a227d965ea8805a235c8a7286f7fd17d3a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:28f9c39a56d2c78bf6b87dcc699d520ab850919d4a8c7418cd20eda49874a2ea"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:f1a198a50c409ab2d009fbf20956ace84567d67f2c5701511d4dd561fae6f32e"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5bd7fd32e3ace012d43925ea4fc8bd1b02cc6cc1e9813b518302950e89b5a22"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8"}, + {file = 
"pyarrow-18.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03"}, + {file = "pyarrow-18.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2"}, + {file = "pyarrow-18.0.0.tar.gz", hash = "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5"}, ] -[package.dependencies] -numpy = ">=1.16.6" - [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] @@ -4589,4 +4592,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "2ea79042966beabcc2cf7384fd1c819610e59e1970edbcf1bd9453cd92ccc51f" +content-hash = "ebaacb4dc0e299ab2fdd1fdd57d1e26a2fed1488e0d636c9a7793b447d05d632" diff --git a/pyproject.toml b/pyproject.toml index 07281fe934..1ebf56599d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ google-cloud-batch = "~0.17.26" duckdb = "~1.0.0" google-cloud-storage = "~2.14.0" pandas = "~2.2.2" -pyarrow = "~17.0.0" +pyarrow = "18.0.0" rich = "~13.9.2" # Used for CLI pretty print [tool.poetry.dependencies.sentry-sdk] diff --git a/robotoff/products.py b/robotoff/products.py index f90fc1db77..bedb43bd75 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -582,27 +582,32 @@ def convert_jsonl_to_parquet( dataset_path: Path = settings.JSONL_DATASET_PATH, query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY, ) -> None: + from pyarrow import ArrowException logger.info("Start JSONL to Parquet conversion process.") if not dataset_path.exists() or not query_path.exists(): raise FileNotFoundError( f"{str(dataset_path)} or {str(query_path)} was not found." 
) - query = ( - query_path.read_text() - .replace("{dataset_path}", str(dataset_path)) - .replace("{output_path}", output_file_path) - ) - try: - logger.info("Query the JSONL using DuckDB.") - arrow_batches = duckdb.sql(query).fetch_arrow_reader(batch_size=100000) - logger.info("Post-process extracted data using Arrow") - # arrow_batches = export.postprocess_arrow_batches(arrow_batches) - logger.info("Write post-processed data into Parquet.") - export.sink_to_parquet(output_file_path, batches=arrow_batches) - - except duckdb.Error as e: - logger.error("Error executing query: %s\nError message: %s", query, e) - raise + with tempfile.TemporaryDirectory() as tmp_dir: + try: + tmp_parquet_path = os.path.join(tmp_dir, "temp.parquet") + query = ( + query_path.read_text() + .replace("{dataset_path}", str(dataset_path)) + .replace("{output_path}", tmp_parquet_path) + ) + logger.info("Query the JSONL using DuckDB.") + duckdb.sql(query) + # logger.info("Post-process extracted data using Arrow") + # arrow_batches = export.load_parquet() + logger.info("Write post-processed data into Parquet.") + export.sink_to_parquet(tmp_parquet_path, output_file_path) + except duckdb.Error as e: + logger.error("Error executing query: %s\nError message: %s", query, e) + raise + except ArrowException as e: + logger.error(e) + raise logger.info("JSONL successfully converted into Parquet file.") diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py index 8df44af415..d67b18d7f3 100644 --- a/robotoff/utils/export.py +++ b/robotoff/utils/export.py @@ -1,12 +1,9 @@ "Functions to postprocess the database conversion into Parquet." - import json -from typing import Iterator import pyarrow as pa import pyarrow.parquet as pq - ################ # Schemas ################ @@ -41,28 +38,35 @@ IMAGES_DATATYPE = pa.list_(_dict_schema) +SCHEMAS = { + "images": IMAGES_DATATYPE +} + + ################ # Functions ################ -def sink_to_parquet(path: str, batches: pa.RecordBatchReader): - schema = batches.schema - schema = schema.remove(schema.get_field_index("images")) - schema = schema.append(pa.field("images", IMAGES_DATATYPE)) - with pq.ParquetWriter(path, schema=schema) as writer: - for batch in batches: - batch = batches.read_next_batch() +def sink_to_parquet(parquet_path: str, output_path: str): + parquet_file = pq.ParquetFile(parquet_path) + updated_schema = update_schema(parquet_file.schema.to_arrow_schema()) + with pq.ParquetWriter(output_path, schema=updated_schema) as writer: + for batch in parquet_file.iter_batches(batch_size=1000): batch = _postprocess_arrow_batch(batch) - # batch = _postprocess_arrow_batch(batch) writer.write_batch(batch) -def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: - - return pa.RecordBatchReader.from_batches( - schema=batches.schema, - batches=[_postprocess_arrow_batch(batch) for batch in batches] - ) +# def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: +# schema = _udpate_schema_by_field( +# schema=batches.schema, +# field_name="images", +# field_datatype=IMAGES_DATATYPE, +# ) +# batches = [_postprocess_arrow_batch(batch) for batch in batches] +# return pa.RecordBatchReader.from_batches( +# schema=schema, +# batches=batches, +# ) def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch: @@ -114,3 +118,19 @@ def _postprocess_images( images_array = pa.array(postprocessed_images, type=datatype) batch = batch.set_column(1, "images", images_array) return batch + + +def update_schema(schema: pa.Schema) 
-> pa.Schema:
+    for field_name, field_datatype in SCHEMAS.items():
+        schema = _update_schema_by_field(schema=schema, field_name=field_name, field_datatype=field_datatype)
+    return schema
+
+
+def _update_schema_by_field(schema: pa.Schema, field_name: str, field_datatype: pa.DataType) -> pa.Schema:
+    field_index = schema.get_field_index(field_name)
+    schema = schema.remove(field_index)
+    schema = schema.insert(
+        field_index,
+        pa.field(field_name, field_datatype)
+    )
+    return schema
diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql
index cdd6ba7445..f9d954ae94 100644
--- a/robotoff/utils/sql/jsonl_to_parquet.sql
+++ b/robotoff/utils/sql/jsonl_to_parquet.sql
@@ -1,132 +1,134 @@
 SET threads to 4;
 SET preserve_insertion_order = false;
-SELECT
-    code,
-    additives_n,
-    additives_tags,
-    allergens_from_ingredients,
-    allergens_from_user,
-    allergens_tags,
-    brands_tags,
-    categories_properties_tags,
-    categories,
-    checkers_tags,
-    cities_tags,
-    compared_to_category,
-    complete,
-    completeness,
-    correctors_tags,
-    countries_tags,
-    to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime
-    creator,
-    data_quality_errors_tags,
-    data_quality_info_tags,
-    data_quality_warnings_tags,
-    data_sources_tags,
-    ecoscore_data,
-    ecoscore_grade,
-    ecoscore_score,
-    ecoscore_tags,
-    editors,
-    emb_codes,
-    emb_codes_tags,
-    entry_dates_tags,
-    environment_impact_level,
-    food_groups_tags,
-    forest_footprint_data,
-    generic_name,
-    grades,
-    images,
-    informers_tags,
-    ingredients_analysis_tags,
-    ingredients_from_palm_oil_n,
-    ingredients_n,
-    ingredients_tags,
-    ingredients_text_with_allergens,
-    ingredients_text,
-    COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_..
- ingredients_with_specified_percent_n, - ingredients_with_unspecified_percent_n, - ciqual_food_name_tags, - ingredients_percent_analysis, - ingredients_original_tags, - ingredients_without_ciqual_codes_n, - ingredients_without_ciqual_codes, - ingredients, - known_ingredients_n, - labels_tags, - lang, - languages_tags, - languages_codes, - last_edit_dates_tags, - last_editor, - to_timestamp(last_image_t)::datetime AS last_image_t, - last_modified_by, - to_timestamp(last_modified_t)::datetime AS last_modified_t, - to_timestamp(last_updated_t)::datetime AS last_updated_t, - link, - main_countries_tags, - manufacturing_places, - manufacturing_places_tags, - max_imgid, - misc_tags, - minerals_tags, - new_additives_n, - no_nutrition_data, - nova_group, - nova_groups, - nova_groups_markers, - nova_groups_tags, - nucleotides_tags, - nutrient_levels_tags, - unknown_nutrients_tags, - nutriments, - nutriscore_data, - nutriscore_grade, - nutriscore_score, - nutriscore_tags, - nutrition_data_prepared_per, - nutrition_data, - nutrition_grades_tags, - nutrition_score_beverage, - nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, - nutrition_score_warning_no_fiber, - nutrition_score_warning_no_fruits_vegetables_nuts, - obsolete_since_date, - obsolete, - origins_tags, - packaging_recycling_tags, - packaging_shapes_tags, - packaging_tags, - packagings_materials, - packagings_n, - packagings_n, - photographers, - pnns_groups_1_tags, - pnns_groups_2_tags, - popularity_key, - popularity_tags, - product_name, - product_quantity_unit, - product_quantity, - purchase_places_tags, - quantity, - rev, - scans_n, - scores, - serving_quantity, - serving_size, - sources, - sources_fields, - specific_ingredients, - states_tags, - stores, - stores_tags, - traces_tags, - unique_scans_n, - unknown_ingredients_n, - vitamins_tags, - weighers_tags, - with_non_nutritive_sweeteners, - with_sweeteners, -FROM read_ndjson('{dataset_path}', ignore_errors=True) \ No newline at end of file +COPY( + SELECT code, + additives_n, + additives_tags, + allergens_from_ingredients, + allergens_from_user, + allergens_tags, + brands_tags, + categories_properties_tags, + categories, + checkers_tags, + cities_tags, + compared_to_category, + complete, + completeness, + correctors_tags, + countries_tags, + to_timestamp(created_t)::datetime AS created_t, + creator, + data_quality_errors_tags, + data_quality_info_tags, + data_quality_warnings_tags, + data_sources_tags, + ecoscore_data, + ecoscore_grade, + ecoscore_score, + ecoscore_tags, + editors, + emb_codes, + emb_codes_tags, + entry_dates_tags, + environment_impact_level, + food_groups_tags, + forest_footprint_data, + generic_name, + grades, + images, + informers_tags, + ingredients_analysis_tags, + ingredients_from_palm_oil_n, + ingredients_n, + ingredients_tags, + ingredients_text_with_allergens, + ingredients_text, + COLUMNS('ingredients_text_\w{2}$'), + -- All columns containing ingredients_text_.. 
+ ingredients_with_specified_percent_n, + ingredients_with_unspecified_percent_n, + ciqual_food_name_tags, + ingredients_percent_analysis, + ingredients_original_tags, + ingredients_without_ciqual_codes_n, + ingredients_without_ciqual_codes, + ingredients, + known_ingredients_n, + labels_tags, + lang, + languages_tags, + languages_codes, + last_edit_dates_tags, + last_editor, + to_timestamp(last_image_t)::datetime AS last_image_t, + last_modified_by, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, + link, + main_countries_tags, + manufacturing_places, + manufacturing_places_tags, + max_imgid, + misc_tags, + minerals_tags, + new_additives_n, + no_nutrition_data, + nova_group, + nova_groups, + nova_groups_markers, + nova_groups_tags, + nucleotides_tags, + nutrient_levels_tags, + unknown_nutrients_tags, + nutriments, + nutriscore_data, + nutriscore_grade, + nutriscore_score, + nutriscore_tags, + nutrition_data_prepared_per, + nutrition_data, + nutrition_grades_tags, + nutrition_score_beverage, + nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, + nutrition_score_warning_no_fiber, + nutrition_score_warning_no_fruits_vegetables_nuts, + obsolete_since_date, + obsolete, + origins_tags, + packaging_recycling_tags, + packaging_shapes_tags, + packaging_tags, + packagings_materials, + packagings_n, + packagings_n, + photographers, + pnns_groups_1_tags, + pnns_groups_2_tags, + popularity_key, + popularity_tags, + product_name, + product_quantity_unit, + product_quantity, + purchase_places_tags, + quantity, + rev, + scans_n, + scores, + serving_quantity, + serving_size, + sources, + sources_fields, + specific_ingredients, + states_tags, + stores, + stores_tags, + traces_tags, + unique_scans_n, + unknown_ingredients_n, + vitamins_tags, + weighers_tags, + with_non_nutritive_sweeteners, + with_sweeteners, + FROM read_ndjson('{dataset_path}', ignore_errors = True) +) TO '{output_path}' (FORMAT PARQUET); \ No newline at end of file From 6a523eaf1817f09608531f15f151b6d83764f59c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 12 Nov 2024 11:46:28 +0100 Subject: [PATCH 4/5] fix: :fire: Works! --- robotoff/utils/export.py | 75 +++++++++++++------------ robotoff/utils/sql/jsonl_to_parquet.sql | 1 - 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py index d67b18d7f3..6335da54f6 100644 --- a/robotoff/utils/export.py +++ b/robotoff/utils/export.py @@ -1,4 +1,5 @@ "Functions to postprocess the database conversion into Parquet." 
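
[Note: patch 3 above turns the export into a two-stage flow: DuckDB now COPYs the
query result to a temporary Parquet file, and an Arrow pass rewrites that file
with a typed `images` column in place of the JSON string DuckDB produced. (The
new products.py code also relies on `tempfile` and `os`, which the hunk assumes
are already imported at module level.) Condensed to its essentials, with
illustrative paths, the flow is:

    import duckdb
    import pyarrow.parquet as pq

    duckdb.sql(query)  # query is a COPY (...) TO 'tmp/temp.parquet' (FORMAT PARQUET)

    parquet_file = pq.ParquetFile("tmp/temp.parquet")
    # Swap the raw `images` field for the struct-typed IMAGES_DATATYPE.
    schema = update_schema(parquet_file.schema.to_arrow_schema())
    with pq.ParquetWriter("food.parquet", schema=schema) as writer:
        for batch in parquet_file.iter_batches(batch_size=1000):
            writer.write_batch(_postprocess_arrow_batch(batch))

The export.py diff that continues below (patch 4) tidies the module and also
fixes _postprocess_images, which until now wrote the rebuilt array back to
hard-coded column index 1 instead of looking up the actual position of
`images` in the batch schema.]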
+
 import json
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -15,40 +16,38 @@
     ]
 )
 
-_dict_schema = pa.struct(
-    [
-        pa.field("key", pa.string(), nullable=True),
-        pa.field("imgid", pa.string(), nullable=True),
-        pa.field(
-            "sizes",
-            pa.struct(
-                [
-                    pa.field("100", _size_schema, nullable=True),
-                    pa.field("200", _size_schema, nullable=True),
-                    pa.field("400", _size_schema, nullable=True),
-                    pa.field("full", _size_schema, nullable=True),
-                ]
+
+IMAGES_DATATYPE = pa.list_(
+    pa.struct(
+        [
+            pa.field("key", pa.string(), nullable=True),
+            pa.field("imgid", pa.string(), nullable=True),
+            pa.field(
+                "sizes",
+                pa.struct(
+                    [
+                        pa.field("100", _size_schema, nullable=True),
+                        pa.field("200", _size_schema, nullable=True),
+                        pa.field("400", _size_schema, nullable=True),
+                        pa.field("full", _size_schema, nullable=True),
+                    ]
+                ),
+                nullable=True,
             ),
-            nullable=True,
-        ),
-        pa.field("uploaded_t", pa.string(), nullable=True),
-        pa.field("uploader", pa.string(), nullable=True),
-    ]
+            pa.field("uploaded_t", pa.string(), nullable=True),
+            pa.field("uploader", pa.string(), nullable=True),
+        ]
+    )
 )
 
-IMAGES_DATATYPE = pa.list_(_dict_schema)
-
-SCHEMAS = {
-    "images": IMAGES_DATATYPE
-}
-
+SCHEMAS = {"images": IMAGES_DATATYPE}
 
 ################
 # Functions
 ################
 def sink_to_parquet(parquet_path: str, output_path: str):
-    parquet_file = pq.ParquetFile(parquet_path) 
+    parquet_file = pq.ParquetFile(parquet_path)
     updated_schema = update_schema(parquet_file.schema.to_arrow_schema())
     with pq.ParquetWriter(output_path, schema=updated_schema) as writer:
         for batch in parquet_file.iter_batches(batch_size=1000):
@@ -74,10 +73,7 @@ def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch:
     return batch
 
 
-def _postprocess_images(
-    batch: pa.RecordBatch,
-    datatype: pa.DataType = IMAGES_DATATYPE
-    ):
+def _postprocess_images(batch: pa.RecordBatch, datatype: pa.DataType = IMAGES_DATATYPE):
     postprocessed_images = []
     images: list[dict | None] = [
         json.loads(image) if image else None for image in batch["images"].to_pylist()
@@ -115,22 +111,27 @@ def _postprocess_images(
             )
         else:
             postprocessed_images.append([])
-    images_array = pa.array(postprocessed_images, type=datatype)
-    batch = batch.set_column(1, "images", images_array)
+    images_col_index = batch.schema.get_field_index("images")
+    batch = batch.set_column(
+        images_col_index,
+        pa.field("images", datatype),
+        pa.array(postprocessed_images, type=datatype)
+    )
     return batch
 
 
 def update_schema(schema: pa.Schema) -> pa.Schema:
     for field_name, field_datatype in SCHEMAS.items():
-        schema = _update_schema_by_field(schema=schema, field_name=field_name, field_datatype=field_datatype)
+        schema = _update_schema_by_field(
+            schema=schema, field_name=field_name, field_datatype=field_datatype
+        )
     return schema
 
 
-def _update_schema_by_field(schema: pa.Schema, field_name: str, field_datatype: pa.DataType) -> pa.Schema:
+def _update_schema_by_field(
+    schema: pa.Schema, field_name: str, field_datatype: pa.DataType
+) -> pa.Schema:
     field_index = schema.get_field_index(field_name)
     schema = schema.remove(field_index)
-    schema = schema.insert(
-        field_index,
-        pa.field(field_name, field_datatype)
-    )
+    schema = schema.insert(field_index, pa.field(field_name, field_datatype))
     return schema
diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql
index f9d954ae94..bf469155e5 100644
--- a/robotoff/utils/sql/jsonl_to_parquet.sql
+++ b/robotoff/utils/sql/jsonl_to_parquet.sql
@@ -45,7 +45,6 @@ COPY(
         ingredients_text_with_allergens,
         ingredients_text,
        COLUMNS('ingredients_text_\w{2}$'),
-        -- All columns containing ingredients_text_..
         ingredients_with_specified_percent_n,
         ingredients_with_unspecified_percent_n,
         ciqual_food_name_tags,

From 7009f0dc5b268250a69d50ea4deabc3f0a96c933 Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Tue, 12 Nov 2024 14:29:35 +0100
Subject: [PATCH 5/5] fix:

---
 robotoff/utils/export.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py
index 6335da54f6..1298fbc1eb 100644
--- a/robotoff/utils/export.py
+++ b/robotoff/utils/export.py
@@ -83,24 +83,24 @@ def _postprocess_images(batch: pa.RecordBatch, datatype: pa.DataType = IMAGES_DA
             postprocessed_images.append(
                 [
                     {
-                        "key": key,
+                        "key": str(key),
                         "imgid": str(value.get("imgid", "unknown")),
                         "sizes": {
                             "100": {
-                                "h": value.get("sizes", {}).get("100", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("100", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("100", {}).get("h", 0) or 0),  # `or 0` because "h" or "w" can be None, which would make int() raise
+                                "w": int(value.get("sizes", {}).get("100", {}).get("w", 0) or 0),
                             },
                             "200": {
-                                "h": value.get("sizes", {}).get("200", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("200", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("200", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("200", {}).get("w", 0) or 0),
                             },
                             "400": {
-                                "h": value.get("sizes", {}).get("400", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("400", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("400", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("400", {}).get("w", 0) or 0),
                             },
                             "full": {
-                                "h": value.get("sizes", {}).get("full", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("full", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("full", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("full", {}).get("w", 0) or 0),
                             },
                         },
                         "uploaded_t": str(value.get("uploaded_t", "unknown")),
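
[Note: the fifth patch above, truncated at the end of this excerpt, hardens
_postprocess_images against irregular JSONL values: image keys can be integers
and size entries can be missing or explicitly null, so every leaf is coerced to
the type the Arrow schema expects. A self-contained sketch of that coercion for
a single "sizes" entry; the helper name and sample values are illustrative:

    def _normalize_size(raw: dict | None) -> dict[str, int]:
        """Coerce one {"h": ..., "w": ...} sizes entry to non-null ints."""
        raw = raw or {}
        # The `or 0` matters: "h"/"w" can be present but None, and int(None) raises.
        return {"h": int(raw.get("h", 0) or 0), "w": int(raw.get("w", 0) or 0)}

    assert _normalize_size({"h": None, "w": "120"}) == {"h": 0, "w": 120}
    assert _normalize_size(None) == {"h": 0, "w": 0}
]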