From 37cc39fa7ffc29cf740c1ce85b1e752703074de7 Mon Sep 17 00:00:00 2001 From: Stefan Siegel Date: Thu, 7 Nov 2024 15:46:20 +0100 Subject: [PATCH 1/2] fix: Support Pandas future.infer_string=True in report generation Previously, report generation encountered issues when `future.infer_string=True` was set. This resulted in multiple warnings ("FutureWarning: Dtype inference on a pandas object is deprecated") and failures when string columns contained only empty strings ("AttributeError: 'StringDtype' object has no attribute 'pyarrow_dtype'"). This change resolves the issue by explicitly setting the dtype to "object" for the relevant operations. --- .../pandas/describe_categorical_pandas.py | 6 +++--- .../model/pandas/summary_pandas.py | 3 ++- tests/unit/test_pd_future_infer_string.py | 21 +++++++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 tests/unit/test_pd_future_infer_string.py diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 31ae57417..8c6226d8d 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -19,7 +19,7 @@ def get_character_counts_vc(vc: pd.Series) -> pd.Series: - series = pd.Series(vc.index, index=vc) + series = pd.Series(vc.index, index=vc, dtype=object) characters = series[series != ""].apply(list) characters = characters.explode() @@ -169,7 +169,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict: # TODO: configurable lowercase/punctuation etc. # TODO: remove punctuation in words - series = pd.Series(vc.index, index=vc) + series = pd.Series(vc.index, index=vc, dtype=object) word_lists = series.str.lower().str.split() words = word_lists.explode().str.strip(string.punctuation + string.whitespace) word_counts = pd.Series(words.index, index=words) @@ -187,7 +187,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict: def length_summary_vc(vc: pd.Series) -> dict: - series = pd.Series(vc.index, index=vc) + series = pd.Series(vc.index, index=vc, dtype=object) length = series.str.len() length_counts = pd.Series(length.index, index=length) length_counts = length_counts.groupby(level=0, sort=False).sum() diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index 68e019451..f392752df 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -44,7 +44,8 @@ def pandas_describe_1d( """ # Make sure pd.NA is not in the series - series = series.fillna(np.nan) + with pd.option_context("future.no_silent_downcasting", True): + series = series.fillna(np.nan) has_cast_type = _is_cast_type_defined(typeset, series.name) cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None diff --git a/tests/unit/test_pd_future_infer_string.py b/tests/unit/test_pd_future_infer_string.py new file mode 100644 index 000000000..ca9838029 --- /dev/null +++ b/tests/unit/test_pd_future_infer_string.py @@ -0,0 +1,21 @@ +import pandas as pd +import pytest + +from ydata_profiling import ProfileReport + + +@pytest.fixture() +def df(): + df = pd.DataFrame( + { + "foo": [1, 2, 3], + "bar": ["", "", ""], + } + ) + return df + + +def test_pd_future_infer_string(df: pd.DataFrame): + with pd.option_context("future.infer_string", True): + profile_report = ProfileReport(df, title="Test Report", progress_bar=False) + assert len(profile_report.to_html()) > 0 From 8a2a8279c8b2caef8638a4afe25803b3cc44371d Mon Sep 17 00:00:00 2001 From: Stefan Siegel Date: Sat, 1 Feb 2025 23:27:46 +0100 Subject: [PATCH 2/2] refactor: Refactor pandas option usage for backward compatibility - Introduce the `optional_option_context` helper to replace the direct use of `pd.option_context("future.no_silent_downcasting", True)`, ensuring compatibility with older pandas versions that lack this option. - Update the `future.infer_string` test to run only on pandas >= 2.1, where it is applicable. --- .../model/pandas/summary_pandas.py | 3 ++- src/ydata_profiling/utils/compat.py | 15 +++++++++++++++ tests/unit/test_pd_future_infer_string.py | 4 ++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index f392752df..927ffacaf 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -13,6 +13,7 @@ from ydata_profiling.model.summarizer import BaseSummarizer from ydata_profiling.model.summary import describe_1d, get_series_descriptions from ydata_profiling.model.typeset import ProfilingTypeSet +from ydata_profiling.utils.compat import optional_option_context from ydata_profiling.utils.dataframe import sort_column_names @@ -44,7 +45,7 @@ def pandas_describe_1d( """ # Make sure pd.NA is not in the series - with pd.option_context("future.no_silent_downcasting", True): + with optional_option_context("future.no_silent_downcasting", True): series = series.fillna(np.nan) has_cast_type = _is_cast_type_defined(typeset, series.name) diff --git a/src/ydata_profiling/utils/compat.py b/src/ydata_profiling/utils/compat.py index 879316e99..b01c7db55 100644 --- a/src/ydata_profiling/utils/compat.py +++ b/src/ydata_profiling/utils/compat.py @@ -1,4 +1,6 @@ """Utility functions for (version) compatibility""" + +from contextlib import contextmanager from functools import lru_cache from typing import Tuple @@ -12,3 +14,16 @@ def pandas_version_info() -> Tuple[int, ...]: akin to `sys.version_info` for the Python version. """ return tuple(int(s) for s in pd.__version__.split(".")) + + +@contextmanager +def optional_option_context(option_key, value): + """ + A context manager that sets an option only if it is available in the + current pandas version; otherwise, it is a no-op. + """ + try: + with pd.option_context(option_key, value): + yield + except pd.errors.OptionError: + yield diff --git a/tests/unit/test_pd_future_infer_string.py b/tests/unit/test_pd_future_infer_string.py index ca9838029..aa37d121b 100644 --- a/tests/unit/test_pd_future_infer_string.py +++ b/tests/unit/test_pd_future_infer_string.py @@ -2,6 +2,7 @@ import pytest from ydata_profiling import ProfileReport +from ydata_profiling.utils.compat import pandas_version_info @pytest.fixture() @@ -15,6 +16,9 @@ def df(): return df +@pytest.mark.skipif( + pandas_version_info() < (2, 1, 0), reason="requires pandas 2.1 or higher" +) def test_pd_future_infer_string(df: pd.DataFrame): with pd.option_context("future.infer_string", True): profile_report = ProfileReport(df, title="Test Report", progress_bar=False)