From 05709efc7a19a727ed9d1508df9a16a2c4f4db33 Mon Sep 17 00:00:00 2001 From: E33605 Date: Fri, 13 Dec 2024 12:19:37 +0530 Subject: [PATCH 01/11] ENH: removed median percentile to be always included in describe - fixes #60550 - median percentile is default when a blank list of percentiles is passed --- pandas/core/methods/describe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 17d4d38c97f33..f8dbd669058d3 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -351,13 +351,13 @@ def _refine_percentiles( # explicit conversion of `percentiles` to list percentiles = list(percentiles) + # median should be included only if blank list is passed + if len(percentiles) == 0: + percentiles.append(0.5) + # get them all to be in [0, 1] validate_percentile(percentiles) - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) # sort and check for duplicates From 5c745f32577fd025eac8a6e1be6e7fbec86f55af Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sat, 14 Dec 2024 15:35:04 +0530 Subject: [PATCH 02/11] skip validation for blank list, return median percentile --- pandas/core/methods/describe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index f8dbd669058d3..0f03b00ddac8c 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -351,9 +351,9 @@ def _refine_percentiles( # explicit conversion of `percentiles` to list percentiles = list(percentiles) - # median should be included only if blank list is passed + # median should be included only if blank iterable is passed if len(percentiles) == 0: - percentiles.append(0.5) + return np.array([0.5]) # get them all to be in [0, 1] validate_percentile(percentiles) From 7264ee6edcf7d1388c063b594c5ee81ddcbb433e Mon Sep 
17 00:00:00 2001 From: ZenithClown Date: Sat, 14 Dec 2024 15:42:20 +0530 Subject: [PATCH 03/11] add enhancement changelog for gh#60550 --- doc/source/whatsnew/v2.3.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index b107a5d3ba100..e37368ce7e79b 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -36,7 +36,8 @@ Other enhancements when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- +- Median percentile is only included in :meth:`~Series.describe` when a blank + list is passed (:issue:`60550`). .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: From 3e0868438ce60b93cdeb878a3cc66bdd91106ee1 Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sat, 14 Dec 2024 18:47:40 +0530 Subject: [PATCH 04/11] DOC: add the default median behavior in function docstring --- pandas/core/generic.py | 3 ++- pandas/core/methods/describe.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d1aa20501b060..bb003a1d11f1b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10795,7 +10795,8 @@ def describe( The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. + 75th percentiles. If a blank list is passed, then returns + only the 50th percentile value. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. 
Here are the options: diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 0f03b00ddac8c..1587fc0a4add0 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -74,7 +74,8 @@ def describe_ndframe( percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. + 75th percentiles.If a blank list is passed, then returns only the + 50th percentile value. Returns ------- From c1c187911e1fe0d0b69fbed8485a584aa2a60b65 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Dec 2024 11:02:30 +0530 Subject: [PATCH 05/11] add test cases for percentile refine for describe function --- .../tests/reductions/test_describe_ndframe.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 pandas/tests/reductions/test_describe_ndframe.py diff --git a/pandas/tests/reductions/test_describe_ndframe.py b/pandas/tests/reductions/test_describe_ndframe.py new file mode 100644 index 0000000000000..9bb45f3ae68ca --- /dev/null +++ b/pandas/tests/reductions/test_describe_ndframe.py @@ -0,0 +1,28 @@ +# -*- encoding: utf-8 -*- + +""" +We test the describe_ndframe function. +""" + +import pytest +import numpy as np + +from pandas.core.methods.describe import _refine_percentiles + +def test_refine_percentiles(): + """ + Check the performance of the _refine_percentiles when multiple + values are passed. 
+ """ + + # by default 0.25, 0.50, 0.75 is returned + # or, when None is passed return behavior is the same + assert _refine_percentiles() == np.array([0.25, 0.5, 0.75]) + assert _refine_percentiles(percentiles = None) == np.array([0.25, 0.5, 0.75]) + + # when any value is passed, then the function should return + percentiles_ = [0.3, 0.6] + assert _refine_percentiles(percentiles_) == np.array(percentiles_) + + # when a blank list is passed, then should return only 0.5 + assert _refine_percentiles(percentiles = []) == np.array([0.5]) From dd875dadd55bee63946e3c611ae1fbc11eac2021 Mon Sep 17 00:00:00 2001 From: E33605 Date: Wed, 18 Dec 2024 12:03:55 +0530 Subject: [PATCH 06/11] use pytest mark parametrize to fix input and expected values --- .../tests/reductions/test_describe_ndframe.py | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/pandas/tests/reductions/test_describe_ndframe.py b/pandas/tests/reductions/test_describe_ndframe.py index 9bb45f3ae68ca..ec252fd4316f5 100644 --- a/pandas/tests/reductions/test_describe_ndframe.py +++ b/pandas/tests/reductions/test_describe_ndframe.py @@ -4,25 +4,22 @@ We test the describe_ndframe function. """ -import pytest import numpy as np +import pytest from pandas.core.methods.describe import _refine_percentiles -def test_refine_percentiles(): +@pytest.mark.parametrize( + "percentiles_, expected", [ + (None, np.array([0.25, 0.5, 0.75])), + ([], np.array([0.5])), + ([0.3, 0.6], np.array([0.3, 0.6])), + ] +) +def test_refine_percentiles(percentiles_, expected): """ Check the performance of the _refine_percentiles when multiple values are passed. 
""" - # by default 0.25, 0.50, 0.75 is returned - # or, when None is passed return behavior is the same - assert _refine_percentiles() == np.array([0.25, 0.5, 0.75]) - assert _refine_percentiles(percentiles = None) == np.array([0.25, 0.5, 0.75]) - - # when any value is passed, then the function should return - percentiles_ = [0.3, 0.6] - assert _refine_percentiles(percentiles_) == np.array(percentiles_) - - # when a blank list is passed, then should return only 0.5 - assert _refine_percentiles(percentiles = []) == np.array([0.5]) + assert np.array_equal(_refine_percentiles(percentiles_), expected) From 98a014331b7097d8c76fc2b88f64636d73ea1410 Mon Sep 17 00:00:00 2001 From: Debmalya Pramanik Date: Tue, 24 Dec 2024 11:30:58 +0530 Subject: [PATCH 07/11] Fix typo in docstring, add space Co-authored-by: Asish Mahapatra --- pandas/core/methods/describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index 1587fc0a4add0..fa53af4c2bac9 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -74,7 +74,7 @@ def describe_ndframe( percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles.If a blank list is passed, then returns only the + 75th percentiles. If a blank list is passed, then returns only the 50th percentile value. 
Returns From aee5795bdf52681a99a41c9ef988072eb6b35bed Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sat, 11 Jan 2025 01:07:17 +0530 Subject: [PATCH 08/11] move test under tests/frame/methods/test_describe.py --- pandas/tests/frame/methods/test_describe.py | 31 +++++++++++++++++++ .../tests/reductions/test_describe_ndframe.py | 25 --------------- 2 files changed, 31 insertions(+), 25 deletions(-) delete mode 100644 pandas/tests/reductions/test_describe_ndframe.py diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index e9206e86b7b08..3f1a405a6e04b 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -413,3 +413,34 @@ def test_describe_exclude_pa_dtype(self): dtype=pd.ArrowDtype(pa.float64()), ) tm.assert_frame_equal(result, expected) + + def test_refine_percentiles(self): + # GH#60550 + df = DataFrame({"a" : np.arange(0, 10, 1)}) + + # the default behavior is to return [0.25, 0.5, 0.75] + result = df.describe() + expected = DataFrame( + {"a" : [10, df.a.mean(), df.a.std(), 0, 2.25, 4.5, 6.75, 9]}, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"] + ) + + tm.assert_frame_equal(result, expected) + + # if an empty list is passed, it should return [0.5] + result = df.describe(percentiles=[]) + expected = DataFrame( + {"a" : [10, df.a.mean(), df.a.std(), 0, 4.5, 9]}, + index=["count", "mean", "std", "min", "50%", "max"] + ) + + tm.assert_frame_equal(result, expected) + + # if a list is passed, it should return with the same values + result = df.describe(percentiles=[0.2]) + expected = DataFrame( + {"a" : [10, df.a.mean(), df.a.std(), 0, 1.8, 9]}, + index=["count", "mean", "std", "min", "20%", "max"] + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reductions/test_describe_ndframe.py b/pandas/tests/reductions/test_describe_ndframe.py deleted file mode 100644 index ec252fd4316f5..0000000000000 --- 
a/pandas/tests/reductions/test_describe_ndframe.py +++ /dev/null @@ -1,25 +0,0 @@ -# -*- encoding: utf-8 -*- - -""" -We test the describe_ndframe function. -""" - -import numpy as np -import pytest - -from pandas.core.methods.describe import _refine_percentiles - -@pytest.mark.parametrize( - "percentiles_, expected", [ - (None, np.array([0.25, 0.5, 0.75])), - ([], np.array([0.5])), - ([0.3, 0.6], np.array([0.3, 0.6])), - ] -) -def test_refine_percentiles(percentiles_, expected): - """ - Check the performance of the _refine_percentiles when multiple - values are passed. - """ - - assert np.array_equal(_refine_percentiles(percentiles_), expected) From 5cf9a70d70d4710b25b6e620f1cf95e713ef2727 Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sun, 19 Jan 2025 14:57:59 +0530 Subject: [PATCH 09/11] move refine percentiles changes to v3.0.0 --- doc/source/whatsnew/v2.3.0.rst | 2 -- doc/source/whatsnew/v3.0.0.rst | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index e37368ce7e79b..86f376042f967 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -36,8 +36,6 @@ Other enhancements when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been updated to work correctly with NumPy >= 2 (:issue:`57739`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- Median percentile is only included in :meth:`~Series.describe` when a blank - list is passed (:issue:`60550`). .. --------------------------------------------------------------------------- .. 
_whatsnew_230.notable_bug_fixes: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 005818b0779e6..302a4f3c0b417 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -65,6 +65,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- Fixed bug where the median percentile was included in :meth:`~Series.describe` even when a blank list is passed (:issue:`60550`). .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From 258cfc6361346a9f44c0c91d777264fb98c0d9e0 Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sun, 19 Jan 2025 15:07:30 +0530 Subject: [PATCH 10/11] remove median percentile if user explicitly passes a blank list - fixed docstring for percentiles behavior --- pandas/core/generic.py | 3 +-- pandas/core/methods/describe.py | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb003a1d11f1b..d1aa20501b060 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10795,8 +10795,7 @@ def describe( The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. If a blank list is passed, then returns - only the 50th percentile value. + 75th percentiles. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``.
Here are the options: diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index fa53af4c2bac9..7291e748dfec7 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -74,8 +74,7 @@ def describe_ndframe( percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. If a blank list is passed, then returns only the - 50th percentile value. + 75th percentiles. Returns ------- @@ -352,9 +351,9 @@ def _refine_percentiles( # explicit conversion of `percentiles` to list percentiles = list(percentiles) - # median should be included only if blank iterable is passed + # no percentiles are returned if the user explicitly passes a blank list if len(percentiles) == 0: - return np.array([0.5]) + return np.array([]) # get them all to be in [0, 1] validate_percentile(percentiles) From bfbffd07891e3e8f1fbf9d8f1687b66b4955cfb9 Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Sun, 19 Jan 2025 15:09:20 +0530 Subject: [PATCH 11/11] fix test case check when a blank list is passed --- pandas/tests/frame/methods/test_describe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 3f1a405a6e04b..8418e9db95d42 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -427,11 +427,11 @@ def test_refine_percentiles(self): tm.assert_frame_equal(result, expected) - # if an empty list is passed, it should return [0.5] + # no percentiles if the user explicitly passes a blank list result = df.describe(percentiles=[]) expected = DataFrame( - {"a" : [10, df.a.mean(), df.a.std(), 0, 4.5, 9]}, - index=["count", "mean", "std", "min", "50%", "max"] + {"a" : [10, df.a.mean(), df.a.std(), 0, 9]}, + index=["count", "mean", "std", "min", "max"] + )
tm.assert_frame_equal(result, expected)