From 37cc39fa7ffc29cf740c1ce85b1e752703074de7 Mon Sep 17 00:00:00 2001
From: Stefan Siegel <ssiegel@sdas.net>
Date: Thu, 7 Nov 2024 15:46:20 +0100
Subject: [PATCH 1/2] fix: Support Pandas future.infer_string=True in report
 generation

Previously, report generation encountered issues when
`future.infer_string=True` was set. This resulted in multiple warnings
("FutureWarning: Dtype inference on a pandas object is deprecated") and
failures when string columns contained only empty strings
("AttributeError: 'StringDtype' object has no attribute
'pyarrow_dtype'").
This change resolves the issue by explicitly setting the dtype to
"object" for the relevant operations.
---
 .../pandas/describe_categorical_pandas.py     |  6 +++---
 .../model/pandas/summary_pandas.py            |  3 ++-
 tests/unit/test_pd_future_infer_string.py     | 21 +++++++++++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)
 create mode 100644 tests/unit/test_pd_future_infer_string.py

diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
index 31ae57417..8c6226d8d 100644
--- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -19,7 +19,7 @@
 
 
 def get_character_counts_vc(vc: pd.Series) -> pd.Series:
-    series = pd.Series(vc.index, index=vc)
+    series = pd.Series(vc.index, index=vc, dtype=object)
     characters = series[series != ""].apply(list)
     characters = characters.explode()
 
@@ -169,7 +169,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
     # TODO: configurable lowercase/punctuation etc.
     # TODO: remove punctuation in words
 
-    series = pd.Series(vc.index, index=vc)
+    series = pd.Series(vc.index, index=vc, dtype=object)
     word_lists = series.str.lower().str.split()
     words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
     word_counts = pd.Series(words.index, index=words)
@@ -187,7 +187,7 @@ def word_summary_vc(vc: pd.Series, stop_words: List[str] = []) -> dict:
 
 
 def length_summary_vc(vc: pd.Series) -> dict:
-    series = pd.Series(vc.index, index=vc)
+    series = pd.Series(vc.index, index=vc, dtype=object)
     length = series.str.len()
     length_counts = pd.Series(length.index, index=length)
     length_counts = length_counts.groupby(level=0, sort=False).sum()
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 68e019451..f392752df 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -44,7 +44,8 @@ def pandas_describe_1d(
     """
 
     # Make sure pd.NA is not in the series
-    series = series.fillna(np.nan)
+    with pd.option_context("future.no_silent_downcasting", True):
+        series = series.fillna(np.nan)
 
     has_cast_type = _is_cast_type_defined(typeset, series.name)
     cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None
diff --git a/tests/unit/test_pd_future_infer_string.py b/tests/unit/test_pd_future_infer_string.py
new file mode 100644
index 000000000..ca9838029
--- /dev/null
+++ b/tests/unit/test_pd_future_infer_string.py
@@ -0,0 +1,21 @@
+import pandas as pd
+import pytest
+
+from ydata_profiling import ProfileReport
+
+
+@pytest.fixture()
+def df():
+    df = pd.DataFrame(
+        {
+            "foo": [1, 2, 3],
+            "bar": ["", "", ""],
+        }
+    )
+    return df
+
+
+def test_pd_future_infer_string(df: pd.DataFrame):
+    with pd.option_context("future.infer_string", True):
+        profile_report = ProfileReport(df, title="Test Report", progress_bar=False)
+        assert len(profile_report.to_html()) > 0

From 8a2a8279c8b2caef8638a4afe25803b3cc44371d Mon Sep 17 00:00:00 2001
From: Stefan Siegel <ssiegel@sdas.net>
Date: Sat, 1 Feb 2025 23:27:46 +0100
Subject: [PATCH 2/2] refactor: Refactor pandas option usage for backward
 compatibility

- Introduce the `optional_option_context` helper to replace the direct
  use of `pd.option_context("future.no_silent_downcasting", True)`,
  ensuring compatibility with older pandas versions that lack this
  option.
- Update the `future.infer_string` test to run only on pandas >= 2.1,
  where it is applicable.
---
 .../model/pandas/summary_pandas.py                |  3 ++-
 src/ydata_profiling/utils/compat.py               | 15 +++++++++++++++
 tests/unit/test_pd_future_infer_string.py         |  4 ++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index f392752df..927ffacaf 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -13,6 +13,7 @@
 from ydata_profiling.model.summarizer import BaseSummarizer
 from ydata_profiling.model.summary import describe_1d, get_series_descriptions
 from ydata_profiling.model.typeset import ProfilingTypeSet
+from ydata_profiling.utils.compat import optional_option_context
 from ydata_profiling.utils.dataframe import sort_column_names
 
 
@@ -44,7 +45,7 @@ def pandas_describe_1d(
     """
 
     # Make sure pd.NA is not in the series
-    with pd.option_context("future.no_silent_downcasting", True):
+    with optional_option_context("future.no_silent_downcasting", True):
         series = series.fillna(np.nan)
 
     has_cast_type = _is_cast_type_defined(typeset, series.name)
diff --git a/src/ydata_profiling/utils/compat.py b/src/ydata_profiling/utils/compat.py
index 879316e99..b01c7db55 100644
--- a/src/ydata_profiling/utils/compat.py
+++ b/src/ydata_profiling/utils/compat.py
@@ -1,4 +1,6 @@
 """Utility functions for (version) compatibility"""
+
+from contextlib import contextmanager
 from functools import lru_cache
 from typing import Tuple
 
@@ -12,3 +14,16 @@ def pandas_version_info() -> Tuple[int, ...]:
     akin to `sys.version_info` for the Python version.
     """
     return tuple(int(s) for s in pd.__version__.split("."))
+
+
+@contextmanager
+def optional_option_context(option_key, value):
+    """Set a pandas option inside the block; no-op if pandas lacks it."""
+    from contextlib import ExitStack
+
+    with ExitStack() as stack:
+        try:  # guard only the *enter*, so errors raised by the body propagate
+            stack.enter_context(pd.option_context(option_key, value))
+        except KeyError:  # OptionError subclasses KeyError
+            pass  # option unknown to this pandas: run the block unchanged
+        yield
diff --git a/tests/unit/test_pd_future_infer_string.py b/tests/unit/test_pd_future_infer_string.py
index ca9838029..aa37d121b 100644
--- a/tests/unit/test_pd_future_infer_string.py
+++ b/tests/unit/test_pd_future_infer_string.py
@@ -2,6 +2,7 @@
 import pytest
 
 from ydata_profiling import ProfileReport
+from ydata_profiling.utils.compat import pandas_version_info
 
 
 @pytest.fixture()
@@ -15,6 +16,9 @@ def df():
     return df
 
 
+@pytest.mark.skipif(
+    pandas_version_info() < (2, 1, 0), reason="requires pandas 2.1 or higher"
+)
 def test_pd_future_infer_string(df: pd.DataFrame):
     with pd.option_context("future.infer_string", True):
         profile_report = ProfileReport(df, title="Test Report", progress_bar=False)