From d2259a8c16b20716a9407dbe10c06096540ccd51 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Wed, 30 Oct 2024 15:28:50 +0100 Subject: [PATCH 1/5] fix: :art: Correct timestamp data type --- robotoff/utils/sql/jsonl_to_parquet.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql index dfc8f8f175..deb91d7e9d 100644 --- a/robotoff/utils/sql/jsonl_to_parquet.sql +++ b/robotoff/utils/sql/jsonl_to_parquet.sql @@ -18,7 +18,7 @@ COPY ( completeness, correctors_tags, countries_tags, - created_t, + to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime creator, data_quality_errors_tags, data_quality_info_tags, @@ -45,7 +45,7 @@ COPY ( ingredients_tags, ingredients_text_with_allergens, ingredients_text, - COLUMNS('ingredients_text_\w{2}$'), + COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. ingredients_with_specified_percent_n, ingredients_with_unspecified_percent_n, ciqual_food_name_tags, @@ -61,10 +61,10 @@ COPY ( languages_codes, last_edit_dates_tags, last_editor, - last_image_t, + to_timestamp(last_image_t)::datetime AS last_image_t, last_modified_by, - last_modified_t, - last_updated_t, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, link, main_countries_tags, manufacturing_places, From dbd45d753e15892abdc75e3d36e3b04a41ceba55 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Fri, 8 Nov 2024 19:08:00 +0100 Subject: [PATCH 2/5] fix: :art: WIP --- robotoff/products.py | 11 +- robotoff/utils/export.py | 116 +++++++++++ robotoff/utils/sql/jsonl_to_parquet.sql | 263 ++++++++++++------------ 3 files changed, 255 insertions(+), 135 deletions(-) create mode 100644 robotoff/utils/export.py diff --git a/robotoff/products.py b/robotoff/products.py index 284e4d8445..f90fc1db77 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -18,6 +18,7 @@ from robotoff import settings from robotoff.types import JSONType, ProductIdentifier, ServerType from robotoff.utils import get_logger, gzip_jsonl_iter, http_session, jsonl_iter +from robotoff.utils import export logger = get_logger(__name__) @@ -592,9 +593,15 @@ def convert_jsonl_to_parquet( .replace("{output_path}", output_file_path) ) try: - duckdb.sql(query) + logger.info("Query the JSONL using DuckDB.") + arrow_batches = duckdb.sql(query).fetch_arrow_reader(batch_size=100000) + logger.info("Post-process extracted data using Arrow") + # arrow_batches = export.postprocess_arrow_batches(arrow_batches) + logger.info("Write post-processed data into Parquet.") + export.sink_to_parquet(output_file_path, batches=arrow_batches) + except duckdb.Error as e: - logger.error(f"Error executing query: {query}\nError message: {e}") + logger.error("Error executing query: %s\nError message: %s", query, e) raise logger.info("JSONL successfully converted into Parquet file.") diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py new file mode 100644 index 0000000000..8df44af415 --- /dev/null +++ b/robotoff/utils/export.py @@ -0,0 +1,116 @@ +"Functions to postprocess the database conversion into Parquet." 
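
[Note: the products.py hunk above (patch 2) stops running the DuckDB query as a
single blocking call and instead streams Arrow record batches, so the `images`
column can be post-processed without loading the whole dataset in memory. A
minimal sketch of that pattern, with placeholder file names and a placeholder
column list rather than the real query:

    import duckdb
    import pyarrow.parquet as pq

    rel = duckdb.sql(
        "SELECT code, to_timestamp(created_t)::datetime AS created_t "
        "FROM read_ndjson('products.jsonl.gz', ignore_errors=True)"
    )
    # fetch_arrow_reader() returns a pyarrow.RecordBatchReader that yields
    # batches lazily instead of materializing the full result.
    reader = rel.fetch_arrow_reader(batch_size=100_000)
    with pq.ParquetWriter("products.parquet", schema=reader.schema) as writer:
        for batch in reader:  # iterating the reader consumes it
            writer.write_batch(batch)

Because a RecordBatchReader is consumed as it is iterated, the extra
`batches.read_next_batch()` inside the loop of sink_to_parquet below skips every
other batch; the later patches in this series rewrite that function.]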
+ +import json +from typing import Iterator + +import pyarrow as pa +import pyarrow.parquet as pq + + +################ +# Schemas +################ +## Images field +_size_schema = pa.struct( + [ + pa.field("h", pa.int32(), nullable=True), + pa.field("w", pa.int32(), nullable=True), + ] +) + +_dict_schema = pa.struct( + [ + pa.field("key", pa.string(), nullable=True), + pa.field("imgid", pa.string(), nullable=True), + pa.field( + "sizes", + pa.struct( + [ + pa.field("100", _size_schema, nullable=True), + pa.field("200", _size_schema, nullable=True), + pa.field("400", _size_schema, nullable=True), + pa.field("full", _size_schema, nullable=True), + ] + ), + nullable=True, + ), + pa.field("uploaded_t", pa.string(), nullable=True), + pa.field("uploader", pa.string(), nullable=True), + ] +) + +IMAGES_DATATYPE = pa.list_(_dict_schema) + + +################ +# Functions +################ +def sink_to_parquet(path: str, batches: pa.RecordBatchReader): + schema = batches.schema + schema = schema.remove(schema.get_field_index("images")) + schema = schema.append(pa.field("images", IMAGES_DATATYPE)) + with pq.ParquetWriter(path, schema=schema) as writer: + for batch in batches: + batch = batches.read_next_batch() + batch = _postprocess_arrow_batch(batch) + # batch = _postprocess_arrow_batch(batch) + writer.write_batch(batch) + + +def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: + + return pa.RecordBatchReader.from_batches( + schema=batches.schema, + batches=[_postprocess_arrow_batch(batch) for batch in batches] + ) + + +def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch: + batch = _postprocess_images(batch) + return batch + + +def _postprocess_images( + batch: pa.RecordBatch, + datatype: pa.DataType = IMAGES_DATATYPE + ): + postprocessed_images = [] + images: list[dict | None] = [ + json.loads(image) if image else None for image in batch["images"].to_pylist() + ] + for image in images: + if image: + postprocessed_images.append( + [ + { + "key": key, + "imgid": str(value.get("imgid", "unknown")), + "sizes": { + "100": { + "h": value.get("sizes", {}).get("100", {}).get("h", 0), + "w": value.get("sizes", {}).get("100", {}).get("w", 0), + }, + "200": { + "h": value.get("sizes", {}).get("200", {}).get("h", 0), + "w": value.get("sizes", {}).get("200", {}).get("w", 0), + }, + "400": { + "h": value.get("sizes", {}).get("400", {}).get("h", 0), + "w": value.get("sizes", {}).get("400", {}).get("w", 0), + }, + "full": { + "h": value.get("sizes", {}).get("full", {}).get("h", 0), + "w": value.get("sizes", {}).get("full", {}).get("w", 0), + }, + }, + "uploaded_t": str(value.get("uploaded_t", "unknown")), + "uploader": str(value.get("uploader", "unknown")), + } + for key, value in image.items() + ] + ) + else: + postprocessed_images.append([]) + images_array = pa.array(postprocessed_images, type=datatype) + batch = batch.set_column(1, "images", images_array) + return batch diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql index deb91d7e9d..cdd6ba7445 100644 --- a/robotoff/utils/sql/jsonl_to_parquet.sql +++ b/robotoff/utils/sql/jsonl_to_parquet.sql @@ -1,135 +1,132 @@ SET threads to 4; SET preserve_insertion_order = false; -COPY ( - SELECT - code, - additives_n, - additives_tags, - allergens_from_ingredients, - allergens_from_user, - allergens_tags, - brands_tags, - categories_properties_tags, - categories, - checkers_tags, - cities_tags, - compared_to_category, - complete, - completeness, - correctors_tags, - 
countries_tags, - to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime - creator, - data_quality_errors_tags, - data_quality_info_tags, - data_quality_warnings_tags, - data_sources_tags, - ecoscore_data, - ecoscore_grade, - ecoscore_score, - ecoscore_tags, - editors, - emb_codes, - emb_codes_tags, - entry_dates_tags, - environment_impact_level, - food_groups_tags, - forest_footprint_data, - generic_name, - grades, - images, - informers_tags, - ingredients_analysis_tags, - ingredients_from_palm_oil_n, - ingredients_n, - ingredients_tags, - ingredients_text_with_allergens, - ingredients_text, - COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. - ingredients_with_specified_percent_n, - ingredients_with_unspecified_percent_n, - ciqual_food_name_tags, - ingredients_percent_analysis, - ingredients_original_tags, - ingredients_without_ciqual_codes_n, - ingredients_without_ciqual_codes, - ingredients, - known_ingredients_n, - labels_tags, - lang, - languages_tags, - languages_codes, - last_edit_dates_tags, - last_editor, - to_timestamp(last_image_t)::datetime AS last_image_t, - last_modified_by, - to_timestamp(last_modified_t)::datetime AS last_modified_t, - to_timestamp(last_updated_t)::datetime AS last_updated_t, - link, - main_countries_tags, - manufacturing_places, - manufacturing_places_tags, - max_imgid, - misc_tags, - minerals_tags, - new_additives_n, - no_nutrition_data, - nova_group, - nova_groups, - nova_groups_markers, - nova_groups_tags, - nucleotides_tags, - nutrient_levels_tags, - unknown_nutrients_tags, - nutriments, - nutriscore_data, - nutriscore_grade, - nutriscore_score, - nutriscore_tags, - nutrition_data_prepared_per, - nutrition_data, - nutrition_grades_tags, - nutrition_score_beverage, - nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, - nutrition_score_warning_no_fiber, - nutrition_score_warning_no_fruits_vegetables_nuts, - obsolete_since_date, - obsolete, - origins_tags, - packaging_recycling_tags, - packaging_shapes_tags, - packaging_tags, - packagings_materials, - packagings_n, - packagings_n, - photographers, - pnns_groups_1_tags, - pnns_groups_2_tags, - popularity_key, - popularity_tags, - product_name, - product_quantity_unit, - product_quantity, - purchase_places_tags, - quantity, - rev, - scans_n, - scores, - serving_quantity, - serving_size, - sources, - sources_fields, - specific_ingredients, - states_tags, - stores, - stores_tags, - traces_tags, - unique_scans_n, - unknown_ingredients_n, - vitamins_tags, - weighers_tags, - with_non_nutritive_sweeteners, - with_sweeteners, - FROM read_ndjson('{dataset_path}', ignore_errors=True) -) TO '{output_path}' (FORMAT PARQUET) -; \ No newline at end of file +SELECT + code, + additives_n, + additives_tags, + allergens_from_ingredients, + allergens_from_user, + allergens_tags, + brands_tags, + categories_properties_tags, + categories, + checkers_tags, + cities_tags, + compared_to_category, + complete, + completeness, + correctors_tags, + countries_tags, + to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime + creator, + data_quality_errors_tags, + data_quality_info_tags, + data_quality_warnings_tags, + data_sources_tags, + ecoscore_data, + ecoscore_grade, + ecoscore_score, + ecoscore_tags, + editors, + emb_codes, + emb_codes_tags, + entry_dates_tags, + environment_impact_level, + food_groups_tags, + forest_footprint_data, + generic_name, + grades, + images, + informers_tags, + ingredients_analysis_tags, + 
ingredients_from_palm_oil_n, + ingredients_n, + ingredients_tags, + ingredients_text_with_allergens, + ingredients_text, + COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_.. + ingredients_with_specified_percent_n, + ingredients_with_unspecified_percent_n, + ciqual_food_name_tags, + ingredients_percent_analysis, + ingredients_original_tags, + ingredients_without_ciqual_codes_n, + ingredients_without_ciqual_codes, + ingredients, + known_ingredients_n, + labels_tags, + lang, + languages_tags, + languages_codes, + last_edit_dates_tags, + last_editor, + to_timestamp(last_image_t)::datetime AS last_image_t, + last_modified_by, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, + link, + main_countries_tags, + manufacturing_places, + manufacturing_places_tags, + max_imgid, + misc_tags, + minerals_tags, + new_additives_n, + no_nutrition_data, + nova_group, + nova_groups, + nova_groups_markers, + nova_groups_tags, + nucleotides_tags, + nutrient_levels_tags, + unknown_nutrients_tags, + nutriments, + nutriscore_data, + nutriscore_grade, + nutriscore_score, + nutriscore_tags, + nutrition_data_prepared_per, + nutrition_data, + nutrition_grades_tags, + nutrition_score_beverage, + nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, + nutrition_score_warning_no_fiber, + nutrition_score_warning_no_fruits_vegetables_nuts, + obsolete_since_date, + obsolete, + origins_tags, + packaging_recycling_tags, + packaging_shapes_tags, + packaging_tags, + packagings_materials, + packagings_n, + packagings_n, + photographers, + pnns_groups_1_tags, + pnns_groups_2_tags, + popularity_key, + popularity_tags, + product_name, + product_quantity_unit, + product_quantity, + purchase_places_tags, + quantity, + rev, + scans_n, + scores, + serving_quantity, + serving_size, + sources, + sources_fields, + specific_ingredients, + states_tags, + stores, + stores_tags, + traces_tags, + unique_scans_n, + unknown_ingredients_n, + vitamins_tags, + weighers_tags, + with_non_nutritive_sweeteners, + with_sweeteners, +FROM read_ndjson('{dataset_path}', ignore_errors=True) \ No newline at end of file From b64a308e5523ed3e88826539be1789fa229ef953 Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Mon, 11 Nov 2024 21:01:20 +0100 Subject: [PATCH 3/5] fix: :construction: WIP --- poetry.lock | 87 ++++---- pyproject.toml | 2 +- robotoff/products.py | 37 ++-- robotoff/utils/export.py | 54 +++-- robotoff/utils/sql/jsonl_to_parquet.sql | 262 ++++++++++++------------ 5 files changed, 236 insertions(+), 206 deletions(-) diff --git a/poetry.lock b/poetry.lock index 90bd8ec5ba..053d6f814d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2718,52 +2718,55 @@ six = "*" [[package]] name = "pyarrow" -version = "17.0.0" +version = "18.0.0" description = "Python library for Apache Arrow" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "pyarrow-17.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:a5c8b238d47e48812ee577ee20c9a2779e6a5904f1708ae240f53ecbee7c9f07"}, - {file = "pyarrow-17.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:db023dc4c6cae1015de9e198d41250688383c3f9af8f565370ab2b4cb5f62655"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1e060b3876faa11cee287839f9cc7cdc00649f475714b8680a05fd9071d545"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:75c06d4624c0ad6674364bb46ef38c3132768139ddec1c56582dbac54f2663e2"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:fa3c246cc58cb5a4a5cb407a18f193354ea47dd0648194e6265bd24177982fe8"}, - {file = "pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f7ae2de664e0b158d1607699a16a488de3d008ba99b3a7aa5de1cbc13574d047"}, - {file = "pyarrow-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5984f416552eea15fd9cee03da53542bf4cddaef5afecefb9aa8d1010c335087"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:1c8856e2ef09eb87ecf937104aacfa0708f22dfeb039c363ec99735190ffb977"}, - {file = "pyarrow-17.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e19f569567efcbbd42084e87f948778eb371d308e137a0f97afe19bb860ccb3"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b244dc8e08a23b3e352899a006a26ae7b4d0da7bb636872fa8f5884e70acf15"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b72e87fe3e1db343995562f7fff8aee354b55ee83d13afba65400c178ab2597"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:dc5c31c37409dfbc5d014047817cb4ccd8c1ea25d19576acf1a001fe07f5b420"}, - {file = "pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e3343cb1e88bc2ea605986d4b94948716edc7a8d14afd4e2c097232f729758b4"}, - {file = "pyarrow-17.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:a27532c38f3de9eb3e90ecab63dfda948a8ca859a66e3a47f5f42d1e403c4d03"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9b8a823cea605221e61f34859dcc03207e52e409ccf6354634143e23af7c8d22"}, - {file = "pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1e70de6cb5790a50b01d2b686d54aaf73da01266850b05e3af2a1bc89e16053"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0071ce35788c6f9077ff9ecba4858108eebe2ea5a3f7cf2cf55ebc1dbc6ee24a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:757074882f844411fcca735e39aae74248a1531367a7c80799b4266390ae51cc"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ba11c4f16976e89146781a83833df7f82077cdab7dc6232c897789343f7891a"}, - {file = "pyarrow-17.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b0c6ac301093b42d34410b187bba560b17c0330f64907bfa4f7f7f2444b0cf9b"}, - {file = "pyarrow-17.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:392bc9feabc647338e6c89267635e111d71edad5fcffba204425a7c8d13610d7"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:af5ff82a04b2171415f1410cff7ebb79861afc5dae50be73ce06d6e870615204"}, - {file = "pyarrow-17.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:edca18eaca89cd6382dfbcff3dd2d87633433043650c07375d095cd3517561d8"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c7916bff914ac5d4a8fe25b7a25e432ff921e72f6f2b7547d1e325c1ad9d155"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f553ca691b9e94b202ff741bdd40f6ccb70cdd5fbf65c187af132f1317de6145"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0cdb0e627c86c373205a2f94a510ac4376fdc523f8bb36beab2e7f204416163c"}, - {file = "pyarrow-17.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = 
"sha256:d7d192305d9d8bc9082d10f361fc70a73590a4c65cf31c3e6926cd72b76bc35c"}, - {file = "pyarrow-17.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:02dae06ce212d8b3244dd3e7d12d9c4d3046945a5933d28026598e9dbbda1fca"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:13d7a460b412f31e4c0efa1148e1d29bdf18ad1411eb6757d38f8fbdcc8645fb"}, - {file = "pyarrow-17.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9b564a51fbccfab5a04a80453e5ac6c9954a9c5ef2890d1bcf63741909c3f8df"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32503827abbc5aadedfa235f5ece8c4f8f8b0a3cf01066bc8d29de7539532687"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a155acc7f154b9ffcc85497509bcd0d43efb80d6f733b0dc3bb14e281f131c8b"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:dec8d129254d0188a49f8a1fc99e0560dc1b85f60af729f47de4046015f9b0a5"}, - {file = "pyarrow-17.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:a48ddf5c3c6a6c505904545c25a4ae13646ae1f8ba703c4df4a1bfe4f4006bda"}, - {file = "pyarrow-17.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:42bf93249a083aca230ba7e2786c5f673507fa97bbd9725a1e2754715151a204"}, - {file = "pyarrow-17.0.0.tar.gz", hash = "sha256:4beca9521ed2c0921c1023e68d097d0299b62c362639ea315572a58f3f50fd28"}, + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2333f93260674e185cfbf208d2da3007132572e56871f451ba1a556b45dae6e2"}, + {file = "pyarrow-18.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:4c381857754da44326f3a49b8b199f7f87a51c2faacd5114352fc78de30d3aba"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:603cd8ad4976568954598ef0a6d4ed3dfb78aff3d57fa8d6271f470f0ce7d34f"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58a62549a3e0bc9e03df32f350e10e1efb94ec6cf63e3920c3385b26663948ce"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bc97316840a349485fbb137eb8d0f4d7057e1b2c1272b1a20eebbbe1848f5122"}, + {file = "pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:2e549a748fa8b8715e734919923f69318c953e077e9c02140ada13e59d043310"}, + {file = "pyarrow-18.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:606e9a3dcb0f52307c5040698ea962685fb1c852d72379ee9412be7de9c5f9e2"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d5795e37c0a33baa618c5e054cd61f586cf76850a251e2b21355e4085def6280"}, + {file = "pyarrow-18.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:5f0510608ccd6e7f02ca8596962afb8c6cc84c453e7be0da4d85f5f4f7b0328a"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:616ea2826c03c16e87f517c46296621a7c51e30400f6d0a61be645f203aa2b93"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1824f5b029ddd289919f354bc285992cb4e32da518758c136271cf66046ef22"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6dd1b52d0d58dd8f685ced9971eb49f697d753aa7912f0a8f50833c7a7426319"}, + {file = "pyarrow-18.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:320ae9bd45ad7ecc12ec858b3e8e462578de060832b98fc4d671dee9f10d9954"}, + {file = "pyarrow-18.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:2c992716cffb1088414f2b478f7af0175fd0a76fea80841b1706baa8fb0ebaad"}, + {file = 
"pyarrow-18.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:e7ab04f272f98ebffd2a0661e4e126036f6936391ba2889ed2d44c5006237802"}, + {file = "pyarrow-18.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:03f40b65a43be159d2f97fd64dc998f769d0995a50c00f07aab58b0b3da87e1f"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be08af84808dff63a76860847c48ec0416928a7b3a17c2f49a072cac7c45efbd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c70c1965cde991b711a98448ccda3486f2a336457cf4ec4dca257a926e149c9"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:00178509f379415a3fcf855af020e3340254f990a8534294ec3cf674d6e255fd"}, + {file = "pyarrow-18.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a71ab0589a63a3e987beb2bc172e05f000a5c5be2636b4b263c44034e215b5d7"}, + {file = "pyarrow-18.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:fe92efcdbfa0bcf2fa602e466d7f2905500f33f09eb90bf0bcf2e6ca41b574c8"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:907ee0aa8ca576f5e0cdc20b5aeb2ad4d3953a3b4769fc4b499e00ef0266f02f"}, + {file = "pyarrow-18.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:66dcc216ebae2eb4c37b223feaf82f15b69d502821dde2da138ec5a3716e7463"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc1daf7c425f58527900876354390ee41b0ae962a73ad0959b9d829def583bb1"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:871b292d4b696b09120ed5bde894f79ee2a5f109cb84470546471df264cae136"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:082ba62bdcb939824ba1ce10b8acef5ab621da1f4c4805e07bfd153617ac19d4"}, + {file = "pyarrow-18.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:2c664ab88b9766413197733c1720d3dcd4190e8fa3bbdc3710384630a0a7207b"}, + {file = "pyarrow-18.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:dc892be34dbd058e8d189b47db1e33a227d965ea8805a235c8a7286f7fd17d3a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:28f9c39a56d2c78bf6b87dcc699d520ab850919d4a8c7418cd20eda49874a2ea"}, + {file = "pyarrow-18.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:f1a198a50c409ab2d009fbf20956ace84567d67f2c5701511d4dd561fae6f32e"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5bd7fd32e3ace012d43925ea4fc8bd1b02cc6cc1e9813b518302950e89b5a22"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:336addb8b6f5208be1b2398442c703a710b6b937b1a046065ee4db65e782ff5a"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:45476490dd4adec5472c92b4d253e245258745d0ccaabe706f8d03288ed60a79"}, + {file = "pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb7e3abcda7e1e6b83c2dc2909c8d045881017270a119cc6ee7fdcfe71d02df8"}, + {file = "pyarrow-18.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:09f30690b99ce34e0da64d20dab372ee54431745e4efb78ac938234a282d15f9"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5ca5d707e158540312e09fd907f9f49bacbe779ab5236d9699ced14d2293b8"}, + {file = 
"pyarrow-18.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6331f280c6e4521c69b201a42dd978f60f7e129511a55da9e0bfe426b4ebb8d"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:3ac24b2be732e78a5a3ac0b3aa870d73766dd00beba6e015ea2ea7394f8b4e55"}, + {file = "pyarrow-18.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b30a927c6dff89ee702686596f27c25160dd6c99be5bcc1513a763ae5b1bfc03"}, + {file = "pyarrow-18.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:8f40ec677e942374e3d7f2fad6a67a4c2811a8b975e8703c6fd26d3b168a90e2"}, + {file = "pyarrow-18.0.0.tar.gz", hash = "sha256:a6aa027b1a9d2970cf328ccd6dbe4a996bc13c39fd427f502782f5bdb9ca20f5"}, ] -[package.dependencies] -numpy = ">=1.16.6" - [package.extras] test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] @@ -4589,4 +4592,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "2ea79042966beabcc2cf7384fd1c819610e59e1970edbcf1bd9453cd92ccc51f" +content-hash = "ebaacb4dc0e299ab2fdd1fdd57d1e26a2fed1488e0d636c9a7793b447d05d632" diff --git a/pyproject.toml b/pyproject.toml index 07281fe934..1ebf56599d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,7 @@ google-cloud-batch = "~0.17.26" duckdb = "~1.0.0" google-cloud-storage = "~2.14.0" pandas = "~2.2.2" -pyarrow = "~17.0.0" +pyarrow = "18.0.0" rich = "~13.9.2" # Used for CLI pretty print [tool.poetry.dependencies.sentry-sdk] diff --git a/robotoff/products.py b/robotoff/products.py index f90fc1db77..bedb43bd75 100644 --- a/robotoff/products.py +++ b/robotoff/products.py @@ -582,27 +582,32 @@ def convert_jsonl_to_parquet( dataset_path: Path = settings.JSONL_DATASET_PATH, query_path: Path = settings.JSONL_TO_PARQUET_SQL_QUERY, ) -> None: + from pyarrow import ArrowException logger.info("Start JSONL to Parquet conversion process.") if not dataset_path.exists() or not query_path.exists(): raise FileNotFoundError( f"{str(dataset_path)} or {str(query_path)} was not found." 
) - query = ( - query_path.read_text() - .replace("{dataset_path}", str(dataset_path)) - .replace("{output_path}", output_file_path) - ) - try: - logger.info("Query the JSONL using DuckDB.") - arrow_batches = duckdb.sql(query).fetch_arrow_reader(batch_size=100000) - logger.info("Post-process extracted data using Arrow") - # arrow_batches = export.postprocess_arrow_batches(arrow_batches) - logger.info("Write post-processed data into Parquet.") - export.sink_to_parquet(output_file_path, batches=arrow_batches) - - except duckdb.Error as e: - logger.error("Error executing query: %s\nError message: %s", query, e) - raise + with tempfile.TemporaryDirectory() as tmp_dir: + try: + tmp_parquet_path = os.path.join(tmp_dir, "temp.parquet") + query = ( + query_path.read_text() + .replace("{dataset_path}", str(dataset_path)) + .replace("{output_path}", tmp_parquet_path) + ) + logger.info("Query the JSONL using DuckDB.") + duckdb.sql(query) + # logger.info("Post-process extracted data using Arrow") + # arrow_batches = export.load_parquet() + logger.info("Write post-processed data into Parquet.") + export.sink_to_parquet(tmp_parquet_path, output_file_path) + except duckdb.Error as e: + logger.error("Error executing query: %s\nError message: %s", query, e) + raise + except ArrowException as e: + logger.error(e) + raise logger.info("JSONL successfully converted into Parquet file.") diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py index 8df44af415..d67b18d7f3 100644 --- a/robotoff/utils/export.py +++ b/robotoff/utils/export.py @@ -1,12 +1,9 @@ "Functions to postprocess the database conversion into Parquet." - import json -from typing import Iterator import pyarrow as pa import pyarrow.parquet as pq - ################ # Schemas ################ @@ -41,28 +38,35 @@ IMAGES_DATATYPE = pa.list_(_dict_schema) +SCHEMAS = { + "images": IMAGES_DATATYPE +} + + ################ # Functions ################ -def sink_to_parquet(path: str, batches: pa.RecordBatchReader): - schema = batches.schema - schema = schema.remove(schema.get_field_index("images")) - schema = schema.append(pa.field("images", IMAGES_DATATYPE)) - with pq.ParquetWriter(path, schema=schema) as writer: - for batch in batches: - batch = batches.read_next_batch() +def sink_to_parquet(parquet_path: str, output_path: str): + parquet_file = pq.ParquetFile(parquet_path) + updated_schema = update_schema(parquet_file.schema.to_arrow_schema()) + with pq.ParquetWriter(output_path, schema=updated_schema) as writer: + for batch in parquet_file.iter_batches(batch_size=1000): batch = _postprocess_arrow_batch(batch) - # batch = _postprocess_arrow_batch(batch) writer.write_batch(batch) -def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: - - return pa.RecordBatchReader.from_batches( - schema=batches.schema, - batches=[_postprocess_arrow_batch(batch) for batch in batches] - ) +# def postprocess_arrow_batches(batches: pa.RecordBatchReader) -> pa.RecordBatchReader: +# schema = _udpate_schema_by_field( +# schema=batches.schema, +# field_name="images", +# field_datatype=IMAGES_DATATYPE, +# ) +# batches = [_postprocess_arrow_batch(batch) for batch in batches] +# return pa.RecordBatchReader.from_batches( +# schema=schema, +# batches=batches, +# ) def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch: @@ -114,3 +118,19 @@ def _postprocess_images( images_array = pa.array(postprocessed_images, type=datatype) batch = batch.set_column(1, "images", images_array) return batch + + +def update_schema(schema: pa.Schema) 
-> pa.Schema:
+    for field_name, field_datatype in SCHEMAS.items():
+        schema = _update_schema_by_field(schema=schema, field_name=field_name, field_datatype=field_datatype)
+    return schema
+
+
+def _update_schema_by_field(schema: pa.Schema, field_name: str, field_datatype: pa.DataType) -> pa.Schema:
+    field_index = schema.get_field_index(field_name)
+    schema = schema.remove(field_index)
+    schema = schema.insert(
+        field_index,
+        pa.field(field_name, field_datatype)
+    )
+    return schema
diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql
index cdd6ba7445..f9d954ae94 100644
--- a/robotoff/utils/sql/jsonl_to_parquet.sql
+++ b/robotoff/utils/sql/jsonl_to_parquet.sql
@@ -1,132 +1,134 @@
 SET threads to 4;
 SET preserve_insertion_order = false;
-SELECT
-    code,
-    additives_n,
-    additives_tags,
-    allergens_from_ingredients,
-    allergens_from_user,
-    allergens_tags,
-    brands_tags,
-    categories_properties_tags,
-    categories,
-    checkers_tags,
-    cities_tags,
-    compared_to_category,
-    complete,
-    completeness,
-    correctors_tags,
-    countries_tags,
-    to_timestamp(created_t)::datetime AS created_t, -- Convert from unixtime to datetime
-    creator,
-    data_quality_errors_tags,
-    data_quality_info_tags,
-    data_quality_warnings_tags,
-    data_sources_tags,
-    ecoscore_data,
-    ecoscore_grade,
-    ecoscore_score,
-    ecoscore_tags,
-    editors,
-    emb_codes,
-    emb_codes_tags,
-    entry_dates_tags,
-    environment_impact_level,
-    food_groups_tags,
-    forest_footprint_data,
-    generic_name,
-    grades,
-    images,
-    informers_tags,
-    ingredients_analysis_tags,
-    ingredients_from_palm_oil_n,
-    ingredients_n,
-    ingredients_tags,
-    ingredients_text_with_allergens,
-    ingredients_text,
-    COLUMNS('ingredients_text_\w{2}$'), -- All columns containing ingredients_text_..
- ingredients_with_specified_percent_n, - ingredients_with_unspecified_percent_n, - ciqual_food_name_tags, - ingredients_percent_analysis, - ingredients_original_tags, - ingredients_without_ciqual_codes_n, - ingredients_without_ciqual_codes, - ingredients, - known_ingredients_n, - labels_tags, - lang, - languages_tags, - languages_codes, - last_edit_dates_tags, - last_editor, - to_timestamp(last_image_t)::datetime AS last_image_t, - last_modified_by, - to_timestamp(last_modified_t)::datetime AS last_modified_t, - to_timestamp(last_updated_t)::datetime AS last_updated_t, - link, - main_countries_tags, - manufacturing_places, - manufacturing_places_tags, - max_imgid, - misc_tags, - minerals_tags, - new_additives_n, - no_nutrition_data, - nova_group, - nova_groups, - nova_groups_markers, - nova_groups_tags, - nucleotides_tags, - nutrient_levels_tags, - unknown_nutrients_tags, - nutriments, - nutriscore_data, - nutriscore_grade, - nutriscore_score, - nutriscore_tags, - nutrition_data_prepared_per, - nutrition_data, - nutrition_grades_tags, - nutrition_score_beverage, - nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, - nutrition_score_warning_no_fiber, - nutrition_score_warning_no_fruits_vegetables_nuts, - obsolete_since_date, - obsolete, - origins_tags, - packaging_recycling_tags, - packaging_shapes_tags, - packaging_tags, - packagings_materials, - packagings_n, - packagings_n, - photographers, - pnns_groups_1_tags, - pnns_groups_2_tags, - popularity_key, - popularity_tags, - product_name, - product_quantity_unit, - product_quantity, - purchase_places_tags, - quantity, - rev, - scans_n, - scores, - serving_quantity, - serving_size, - sources, - sources_fields, - specific_ingredients, - states_tags, - stores, - stores_tags, - traces_tags, - unique_scans_n, - unknown_ingredients_n, - vitamins_tags, - weighers_tags, - with_non_nutritive_sweeteners, - with_sweeteners, -FROM read_ndjson('{dataset_path}', ignore_errors=True) \ No newline at end of file +COPY( + SELECT code, + additives_n, + additives_tags, + allergens_from_ingredients, + allergens_from_user, + allergens_tags, + brands_tags, + categories_properties_tags, + categories, + checkers_tags, + cities_tags, + compared_to_category, + complete, + completeness, + correctors_tags, + countries_tags, + to_timestamp(created_t)::datetime AS created_t, + creator, + data_quality_errors_tags, + data_quality_info_tags, + data_quality_warnings_tags, + data_sources_tags, + ecoscore_data, + ecoscore_grade, + ecoscore_score, + ecoscore_tags, + editors, + emb_codes, + emb_codes_tags, + entry_dates_tags, + environment_impact_level, + food_groups_tags, + forest_footprint_data, + generic_name, + grades, + images, + informers_tags, + ingredients_analysis_tags, + ingredients_from_palm_oil_n, + ingredients_n, + ingredients_tags, + ingredients_text_with_allergens, + ingredients_text, + COLUMNS('ingredients_text_\w{2}$'), + -- All columns containing ingredients_text_.. 
+ ingredients_with_specified_percent_n, + ingredients_with_unspecified_percent_n, + ciqual_food_name_tags, + ingredients_percent_analysis, + ingredients_original_tags, + ingredients_without_ciqual_codes_n, + ingredients_without_ciqual_codes, + ingredients, + known_ingredients_n, + labels_tags, + lang, + languages_tags, + languages_codes, + last_edit_dates_tags, + last_editor, + to_timestamp(last_image_t)::datetime AS last_image_t, + last_modified_by, + to_timestamp(last_modified_t)::datetime AS last_modified_t, + to_timestamp(last_updated_t)::datetime AS last_updated_t, + link, + main_countries_tags, + manufacturing_places, + manufacturing_places_tags, + max_imgid, + misc_tags, + minerals_tags, + new_additives_n, + no_nutrition_data, + nova_group, + nova_groups, + nova_groups_markers, + nova_groups_tags, + nucleotides_tags, + nutrient_levels_tags, + unknown_nutrients_tags, + nutriments, + nutriscore_data, + nutriscore_grade, + nutriscore_score, + nutriscore_tags, + nutrition_data_prepared_per, + nutrition_data, + nutrition_grades_tags, + nutrition_score_beverage, + nutrition_score_warning_fruits_vegetables_nuts_estimate_from_ingredients, + nutrition_score_warning_no_fiber, + nutrition_score_warning_no_fruits_vegetables_nuts, + obsolete_since_date, + obsolete, + origins_tags, + packaging_recycling_tags, + packaging_shapes_tags, + packaging_tags, + packagings_materials, + packagings_n, + packagings_n, + photographers, + pnns_groups_1_tags, + pnns_groups_2_tags, + popularity_key, + popularity_tags, + product_name, + product_quantity_unit, + product_quantity, + purchase_places_tags, + quantity, + rev, + scans_n, + scores, + serving_quantity, + serving_size, + sources, + sources_fields, + specific_ingredients, + states_tags, + stores, + stores_tags, + traces_tags, + unique_scans_n, + unknown_ingredients_n, + vitamins_tags, + weighers_tags, + with_non_nutritive_sweeteners, + with_sweeteners, + FROM read_ndjson('{dataset_path}', ignore_errors = True) +) TO '{output_path}' (FORMAT PARQUET); \ No newline at end of file From 6a523eaf1817f09608531f15f151b6d83764f59c Mon Sep 17 00:00:00 2001 From: jeremyarancio Date: Tue, 12 Nov 2024 11:46:28 +0100 Subject: [PATCH 4/5] fix: :fire: Works! --- robotoff/utils/export.py | 75 +++++++++++++------------ robotoff/utils/sql/jsonl_to_parquet.sql | 1 - 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py index d67b18d7f3..6335da54f6 100644 --- a/robotoff/utils/export.py +++ b/robotoff/utils/export.py @@ -1,4 +1,5 @@ "Functions to postprocess the database conversion into Parquet." 
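
[Note: patch 3 above turns the export into a two-stage flow: DuckDB now COPYs the
query result to a temporary Parquet file, and an Arrow pass rewrites that file
with a typed `images` column in place of the JSON string DuckDB produced. (The
new products.py code also relies on `tempfile` and `os`, which the hunk assumes
are already imported at module level.) Condensed to its essentials, with
illustrative paths, the flow is:

    import duckdb
    import pyarrow.parquet as pq

    duckdb.sql(query)  # query is a COPY (...) TO 'tmp/temp.parquet' (FORMAT PARQUET)

    parquet_file = pq.ParquetFile("tmp/temp.parquet")
    # Swap the raw `images` field for the struct-typed IMAGES_DATATYPE.
    schema = update_schema(parquet_file.schema.to_arrow_schema())
    with pq.ParquetWriter("food.parquet", schema=schema) as writer:
        for batch in parquet_file.iter_batches(batch_size=1000):
            writer.write_batch(_postprocess_arrow_batch(batch))

The export.py diff that continues below (patch 4) tidies the module and also
fixes _postprocess_images, which until now wrote the rebuilt array back to
hard-coded column index 1 instead of looking up the actual position of
`images` in the batch schema.]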
+
 import json
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -15,40 +16,38 @@
     ]
 )
 
-_dict_schema = pa.struct(
-    [
-        pa.field("key", pa.string(), nullable=True),
-        pa.field("imgid", pa.string(), nullable=True),
-        pa.field(
-            "sizes",
-            pa.struct(
-                [
-                    pa.field("100", _size_schema, nullable=True),
-                    pa.field("200", _size_schema, nullable=True),
-                    pa.field("400", _size_schema, nullable=True),
-                    pa.field("full", _size_schema, nullable=True),
-                ]
+
+IMAGES_DATATYPE = pa.list_(
+    pa.struct(
+        [
+            pa.field("key", pa.string(), nullable=True),
+            pa.field("imgid", pa.string(), nullable=True),
+            pa.field(
+                "sizes",
+                pa.struct(
+                    [
+                        pa.field("100", _size_schema, nullable=True),
+                        pa.field("200", _size_schema, nullable=True),
+                        pa.field("400", _size_schema, nullable=True),
+                        pa.field("full", _size_schema, nullable=True),
+                    ]
+                ),
+                nullable=True,
             ),
-            nullable=True,
-        ),
-        pa.field("uploaded_t", pa.string(), nullable=True),
-        pa.field("uploader", pa.string(), nullable=True),
-    ]
+            pa.field("uploaded_t", pa.string(), nullable=True),
+            pa.field("uploader", pa.string(), nullable=True),
+        ]
+    )
 )
 
-IMAGES_DATATYPE = pa.list_(_dict_schema)
-
-SCHEMAS = {
-    "images": IMAGES_DATATYPE
-}
-
+SCHEMAS = {"images": IMAGES_DATATYPE}
 
 ################
 # Functions
 ################
 def sink_to_parquet(parquet_path: str, output_path: str):
-    parquet_file = pq.ParquetFile(parquet_path) 
+    parquet_file = pq.ParquetFile(parquet_path)
     updated_schema = update_schema(parquet_file.schema.to_arrow_schema())
     with pq.ParquetWriter(output_path, schema=updated_schema) as writer:
         for batch in parquet_file.iter_batches(batch_size=1000):
@@ -74,10 +73,7 @@ def _postprocess_arrow_batch(batch: pa.RecordBatch) -> pa.RecordBatch:
     return batch
 
 
-def _postprocess_images(
-    batch: pa.RecordBatch,
-    datatype: pa.DataType = IMAGES_DATATYPE
-    ):
+def _postprocess_images(batch: pa.RecordBatch, datatype: pa.DataType = IMAGES_DATATYPE):
     postprocessed_images = []
     images: list[dict | None] = [
         json.loads(image) if image else None for image in batch["images"].to_pylist()
@@ -115,22 +111,27 @@ def _postprocess_images(
             )
         else:
             postprocessed_images.append([])
-    images_array = pa.array(postprocessed_images, type=datatype)
-    batch = batch.set_column(1, "images", images_array)
+    images_col_index = batch.schema.get_field_index("images")
+    batch = batch.set_column(
+        images_col_index,
+        pa.field("images", datatype),
+        pa.array(postprocessed_images, type=datatype)
+    )
     return batch
 
 
 def update_schema(schema: pa.Schema) -> pa.Schema:
     for field_name, field_datatype in SCHEMAS.items():
-        schema = _update_schema_by_field(schema=schema, field_name=field_name, field_datatype=field_datatype)
+        schema = _update_schema_by_field(
+            schema=schema, field_name=field_name, field_datatype=field_datatype
+        )
     return schema
 
 
-def _update_schema_by_field(schema: pa.Schema, field_name: str, field_datatype: pa.DataType) -> pa.Schema:
+def _update_schema_by_field(
+    schema: pa.Schema, field_name: str, field_datatype: pa.DataType
+) -> pa.Schema:
     field_index = schema.get_field_index(field_name)
     schema = schema.remove(field_index)
-    schema = schema.insert(
-        field_index,
-        pa.field(field_name, field_datatype)
-    )
+    schema = schema.insert(field_index, pa.field(field_name, field_datatype))
     return schema
diff --git a/robotoff/utils/sql/jsonl_to_parquet.sql b/robotoff/utils/sql/jsonl_to_parquet.sql
index f9d954ae94..bf469155e5 100644
--- a/robotoff/utils/sql/jsonl_to_parquet.sql
+++ b/robotoff/utils/sql/jsonl_to_parquet.sql
@@ -45,7 +45,6 @@ COPY(
         ingredients_text_with_allergens,
         ingredients_text,
        COLUMNS('ingredients_text_\w{2}$'),
-        -- All columns containing ingredients_text_..
         ingredients_with_specified_percent_n,
         ingredients_with_unspecified_percent_n,
         ciqual_food_name_tags,

From 7009f0dc5b268250a69d50ea4deabc3f0a96c933 Mon Sep 17 00:00:00 2001
From: jeremyarancio
Date: Tue, 12 Nov 2024 14:29:35 +0100
Subject: [PATCH 5/5] fix:

---
 robotoff/utils/export.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/robotoff/utils/export.py b/robotoff/utils/export.py
index 6335da54f6..1298fbc1eb 100644
--- a/robotoff/utils/export.py
+++ b/robotoff/utils/export.py
@@ -83,24 +83,24 @@ def _postprocess_images(batch: pa.RecordBatch, datatype: pa.DataType = IMAGES_DA
             postprocessed_images.append(
                 [
                     {
-                        "key": key,
+                        "key": str(key),
                         "imgid": str(value.get("imgid", "unknown")),
                         "sizes": {
                             "100": {
-                                "h": value.get("sizes", {}).get("100", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("100", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("100", {}).get("h", 0) or 0),  # `or 0` because "h" or "w" can be None, which would make int() raise
+                                "w": int(value.get("sizes", {}).get("100", {}).get("w", 0) or 0),
                             },
                             "200": {
-                                "h": value.get("sizes", {}).get("200", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("200", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("200", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("200", {}).get("w", 0) or 0),
                             },
                             "400": {
-                                "h": value.get("sizes", {}).get("400", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("400", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("400", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("400", {}).get("w", 0) or 0),
                             },
                             "full": {
-                                "h": value.get("sizes", {}).get("full", {}).get("h", 0),
-                                "w": value.get("sizes", {}).get("full", {}).get("w", 0),
+                                "h": int(value.get("sizes", {}).get("full", {}).get("h", 0) or 0),
+                                "w": int(value.get("sizes", {}).get("full", {}).get("w", 0) or 0),
                             },
                         },
                         "uploaded_t": str(value.get("uploaded_t", "unknown")),
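
[Note: the fifth patch above, truncated at the end of this excerpt, hardens
_postprocess_images against irregular JSONL values: image keys can be integers
and size entries can be missing or explicitly null, so every leaf is coerced to
the type the Arrow schema expects. A self-contained sketch of that coercion for
a single "sizes" entry; the helper name and sample values are illustrative:

    def _normalize_size(raw: dict | None) -> dict[str, int]:
        """Coerce one {"h": ..., "w": ...} sizes entry to non-null ints."""
        raw = raw or {}
        # The `or 0` matters: "h"/"w" can be present but None, and int(None) raises.
        return {"h": int(raw.get("h", 0) or 0), "w": int(raw.get("w", 0) or 0)}

    assert _normalize_size({"h": None, "w": "120"}) == {"h": 0, "w": 120}
    assert _normalize_size(None) == {"h": 0, "w": 0}
]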