diff --git a/data-processing-lib/python/requirements.txt b/data-processing-lib/python/requirements.txt index 7b363f2b5..318d715d5 100644 --- a/data-processing-lib/python/requirements.txt +++ b/data-processing-lib/python/requirements.txt @@ -4,3 +4,4 @@ argparse mmh3 psutil + polars>=1.9.0 diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py index e2d37581c..ccb7f3fe8 100644 --- a/data-processing-lib/python/src/data_processing/utils/transform_utils.py +++ b/data-processing-lib/python/src/data_processing/utils/transform_utils.py @@ -11,6 +11,7 @@ ################################################################################ import hashlib +import io import os import string import sys @@ -144,8 +145,21 @@ def convert_binary_to_arrow(data: bytes, schema: pa.schema = None) -> pa.Table: table = pq.read_table(reader, schema=schema) return table except Exception as e: - logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it") - return None + logger.warning(f"Could not convert bytes to pyarrow: {e}") + + # We have seen this exception before when using pyarrow, but polars does not throw it. + # "Nested data conversions not implemented for chunked array outputs" + # See issue 816 https://github.com/IBM/data-prep-kit/issues/816. + logger.info(f"Attempting read of pyarrow Table using polars") + try: + import polars + + df = polars.read_parquet(io.BytesIO(data)) + table = df.to_arrow() + except Exception as e: + logger.error(f"Could not convert bytes to pyarrow using polars: {e}. Skipping.") + table = None + return table @staticmethod def convert_arrow_to_binary(table: pa.Table) -> bytes: