Skip to content

Commit eb489f2

Browse files
authored
BUG: groupby.idxmin/idxmax with all NA values should raise (#62026)
1 parent 618de88 commit eb489f2

File tree

7 files changed

+80
-23
lines changed

7 files changed

+80
-23
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ Renamed the following offset aliases (:issue:`57986`):
507507

508508
Other Removals
509509
^^^^^^^^^^^^^^
510-
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
510+
- :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, and :meth:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when a group has all NA values, or when used with ``skipna=False`` and any NA value is encountered (:issue:`10694`, :issue:`57745`)
511511
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
512512
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
513513
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)

pandas/_libs/groupby.pyx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2048,9 +2048,8 @@ def group_idxmin_idxmax(
20482048
group_min_or_max = np.empty_like(out, dtype=values.dtype)
20492049
seen = np.zeros_like(out, dtype=np.uint8)
20502050

2051-
# When using transform, we need a valid value for take in the case
2052-
# a category is not observed; these values will be dropped
2053-
out[:] = 0
2051+
# Sentinel for no valid values.
2052+
out[:] = -1
20542053

20552054
with nogil(numeric_object_t is not object):
20562055
for i in range(N):

pandas/core/groupby/generic.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,7 +1404,15 @@ def idxmin(self, skipna: bool = True) -> Series:
14041404
Raises
14051405
------
14061406
ValueError
1407-
If the Series is empty or skipna=False and any value is NA.
1407+
When there are no valid values for a group. This can happen if:
1408+
1409+
* There is an unobserved group and ``observed=False``.
1410+
* All values for a group are NA.
1411+
* Some values for a group are NA and ``skipna=False``.
1412+
1413+
.. versionchanged:: 3.0.0
1414+
Previously, if all values for a group were NA or some values for a group were
1415+
NA and ``skipna=False``, this method would return NA. Now it raises instead.
14081416
14091417
See Also
14101418
--------
@@ -1457,7 +1465,15 @@ def idxmax(self, skipna: bool = True) -> Series:
14571465
Raises
14581466
------
14591467
ValueError
1460-
If the Series is empty or skipna=False and any value is NA.
1468+
When there are no valid values for a group. This can happen if:
1469+
1470+
* There is an unobserved group and ``observed=False``.
1471+
* All values for a group are NA.
1472+
* Some values for a group are NA and ``skipna=False``.
1473+
1474+
.. versionchanged:: 3.0.0
1475+
Previously, if all values for a group were NA or some values for a group were
1476+
NA and ``skipna=False``, this method would return NA. Now it raises instead.
14611477
14621478
See Also
14631479
--------
@@ -2597,7 +2613,15 @@ def idxmax(
25972613
Raises
25982614
------
25992615
ValueError
2600-
* If a column is empty or skipna=False and any value is NA.
2616+
When there are no valid values for a group. This can happen if:
2617+
2618+
* There is an unobserved group and ``observed=False``.
2619+
* All values for a group are NA.
2620+
* Some values for a group are NA and ``skipna=False``.
2621+
2622+
.. versionchanged:: 3.0.0
2623+
Previously, if all values for a group were NA or some values for a group were
2624+
NA and ``skipna=False``, this method would return NA. Now it raises instead.
26012625
26022626
See Also
26032627
--------
@@ -2663,7 +2687,15 @@ def idxmin(
26632687
Raises
26642688
------
26652689
ValueError
2666-
* If a column is empty or skipna=False and any value is NA.
2690+
When there are no valid values for a group. This can happen if:
2691+
2692+
* There is an unobserved group and ``observed=False``.
2693+
* All values for a group are NA.
2694+
* Some values for a group are NA and ``skipna=False``.
2695+
2696+
.. versionchanged:: 3.0.0
2697+
Previously, if all values for a group were NA or some values for a group were
2698+
NA and ``skipna=False``, this method would return NA. Now it raises instead.
26672699
26682700
See Also
26692701
--------

pandas/core/groupby/groupby.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1784,7 +1784,8 @@ def array_func(values: ArrayLike) -> ArrayLike:
17841784
new_mgr = data.grouped_reduce(array_func)
17851785
res = self._wrap_agged_manager(new_mgr)
17861786
if how in ["idxmin", "idxmax"]:
1787-
res = self._wrap_idxmax_idxmin(res)
1787+
# mypy expects how to be Literal["idxmin", "idxmax"].
1788+
res = self._wrap_idxmax_idxmin(res, how=how, skipna=kwargs["skipna"]) # type: ignore[arg-type]
17881789
out = self._wrap_aggregated_output(res)
17891790
return out
17901791

@@ -5715,10 +5716,17 @@ def _idxmax_idxmin(
57155716
)
57165717
return result
57175718

5718-
def _wrap_idxmax_idxmin(self, res: NDFrameT) -> NDFrameT:
5719+
def _wrap_idxmax_idxmin(
5720+
self, res: NDFrameT, how: Literal["idxmax", "idxmin"], skipna: bool
5721+
) -> NDFrameT:
57195722
index = self.obj.index
57205723
if res.size == 0:
57215724
result = res.astype(index.dtype)
5725+
elif skipna and res.lt(0).any(axis=None):
5726+
raise ValueError(
5727+
f"{type(self).__name__}.{how} with skipna=True encountered all NA "
5728+
f"values in a group."
5729+
)
57225730
else:
57235731
if isinstance(index, MultiIndex):
57245732
index = index.to_flat_index()

pandas/core/groupby/grouper.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -286,18 +286,22 @@ def __init__(
286286
self._indexer: npt.NDArray[np.intp] | None = None
287287

288288
def _get_grouper(
289-
self, obj: NDFrameT, validate: bool = True
289+
self, obj: NDFrameT, validate: bool = True, observed: bool = True
290290
) -> tuple[ops.BaseGrouper, NDFrameT]:
291291
"""
292292
Parameters
293293
----------
294294
obj : Series or DataFrame
295+
Object being grouped.
295296
validate : bool, default True
296-
if True, validate the grouper
297+
If True, validate the grouper.
298+
observed : bool, default True
299+
Whether only observed groups should be in the result. Only
300+
has an impact when grouping on categorical data.
297301
298302
Returns
299303
-------
300-
a tuple of grouper, obj (possibly sorted)
304+
A tuple of grouper, obj (possibly sorted)
301305
"""
302306
obj, _, _ = self._set_grouper(obj)
303307
grouper, _, obj = get_grouper(
@@ -307,6 +311,7 @@ def _get_grouper(
307311
sort=self.sort,
308312
validate=validate,
309313
dropna=self.dropna,
314+
observed=observed,
310315
)
311316

312317
return grouper, obj
@@ -787,7 +792,7 @@ def get_grouper(
787792

788793
# a passed-in Grouper, directly convert
789794
if isinstance(key, Grouper):
790-
grouper, obj = key._get_grouper(obj, validate=False)
795+
grouper, obj = key._get_grouper(obj, validate=False, observed=observed)
791796
if key.key is None:
792797
return grouper, frozenset(), obj
793798
else:

pandas/core/resample.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2305,8 +2305,22 @@ def _get_resampler(self, obj: NDFrame) -> Resampler:
23052305
)
23062306

23072307
def _get_grouper(
2308-
self, obj: NDFrameT, validate: bool = True
2308+
self, obj: NDFrameT, validate: bool = True, observed: bool = True
23092309
) -> tuple[BinGrouper, NDFrameT]:
2310+
"""
2311+
Parameters
2312+
----------
2313+
obj : Series or DataFrame
2314+
Object being grouped.
2315+
validate : bool, default True
2316+
Unused. Only for compatibility with ``Grouper._get_grouper``.
2317+
observed : bool, default True
2318+
Unused. Only for compatibility with ``Grouper._get_grouper``.
2319+
2320+
Returns
2321+
-------
2322+
A tuple of grouper, obj (possibly sorted)
2323+
"""
23102324
# create the resampler and return our binner
23112325
r = self._get_resampler(obj)
23122326
return r._grouper, cast(NDFrameT, r.obj)

pandas/tests/groupby/test_reductions.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
272272
max_value = np.finfo(float_numpy_dtype).max
273273
df = DataFrame(
274274
{
275-
"a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"),
275+
"a": Series(np.repeat(range(1, 5), repeats=2), dtype="intp"),
276276
"b": Series(
277277
[
278278
np.nan,
@@ -283,8 +283,6 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
283283
np.nan,
284284
max_value,
285285
np.nan,
286-
np.nan,
287-
np.nan,
288286
],
289287
dtype=float_numpy_dtype,
290288
),
@@ -299,7 +297,7 @@ def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
299297
return
300298
result = getattr(gb, how)(skipna=skipna)
301299
expected = DataFrame(
302-
{"b": [1, 3, 4, 6, np.nan]}, index=pd.Index(range(1, 6), name="a", dtype="intp")
300+
{"b": [1, 3, 4, 6]}, index=pd.Index(range(1, 5), name="a", dtype="intp")
303301
)
304302
tm.assert_frame_equal(result, expected)
305303

@@ -1003,8 +1001,6 @@ def test_string_dtype_all_na(
10031001
else:
10041002
expected_dtype = "int64"
10051003
expected_value = 1 if reduction_func == "size" else 0
1006-
elif reduction_func in ["idxmin", "idxmax"]:
1007-
expected_dtype, expected_value = "float64", np.nan
10081004
elif not skipna or min_count > 0:
10091005
expected_value = pd.NA
10101006
elif reduction_func == "sum":
@@ -1032,8 +1028,11 @@ def test_string_dtype_all_na(
10321028
with pytest.raises(TypeError, match=msg):
10331029
method(*args, **kwargs)
10341030
return
1035-
elif reduction_func in ["idxmin", "idxmax"] and not skipna:
1036-
msg = f"{reduction_func} with skipna=False encountered an NA value."
1031+
elif reduction_func in ["idxmin", "idxmax"]:
1032+
if skipna:
1033+
msg = f"{reduction_func} with skipna=True encountered all NA values"
1034+
else:
1035+
msg = f"{reduction_func} with skipna=False encountered an NA value."
10371036
with pytest.raises(ValueError, match=msg):
10381037
method(*args, **kwargs)
10391038
return

0 commit comments

Comments
 (0)