Skip to content

Commit 8475758

Browse files
authored
API: rank with nullable dtypes preserve NA (#62043)
1 parent 8ec4cc6 commit 8475758

File tree

3 files changed

+60
-3
lines changed

3 files changed

+60
-3
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ Other enhancements
8181
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
8282
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
8383
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
84+
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes now preserve ``NA`` values instead of casting them to ``NaN`` with ``float64`` dtype; the result has ``Float64`` dtype when ``method="average"`` or ``pct=True`` and ``UInt64`` dtype otherwise (:issue:`62043`)
8485
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
8586
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
8687
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)

pandas/core/arrays/masked.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import numpy as np
1313

1414
from pandas._libs import (
15+
algos as libalgos,
1516
lib,
1617
missing as libmissing,
1718
)
@@ -992,6 +993,49 @@ def copy(self) -> Self:
992993
mask = self._mask.copy()
993994
return self._simple_new(data, mask)
994995

996+
def _rank(
    self,
    *,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    Rank the values of a 1-D masked array, keeping a masked result.

    Parameters
    ----------
    axis : int, default 0
        Only ``0`` is supported.
    method : str, default "average"
        Tie-breaking method, forwarded to ``libalgos.rank_1d`` as
        ``ties_method``.
    na_option : str, default "keep"
        Forwarded to ``libalgos.rank_1d``. With ``"top"`` or ``"bottom"``
        the returned array has no masked entries; with any other value the
        input mask is carried over to the result.
    ascending : bool, default True
        Forwarded to ``libalgos.rank_1d``.
    pct : bool, default False
        Forwarded to ``libalgos.rank_1d``.

    Returns
    -------
    FloatingArray or IntegerArray
        ``FloatingArray`` when ``method == "average"`` or ``pct`` is truthy;
        otherwise an ``IntegerArray`` backed by ``uint64`` data.

    Raises
    ------
    NotImplementedError
        If ``axis`` is not 0 or the array is not 1-dimensional.
    """
    # GH#62043 Rank the underlying buffer directly rather than going through
    # the copy-making ensure_data in algorithms.rank.
    if not (axis == 0 and self.ndim == 1):
        raise NotImplementedError

    from pandas.core.arrays import FloatingArray

    values = self._data
    if values.dtype.kind == "b":
        # Boolean data is handed to rank_1d as a uint8 view.
        values = values.view("uint8")

    ranks = libalgos.rank_1d(
        values,
        is_datetimelike=False,
        ties_method=method,
        ascending=ascending,
        na_option=na_option,
        pct=pct,
        mask=self.isna(),
    )

    # With "top"/"bottom" nothing stays masked in the output.
    nas_are_ranked = na_option in ("top", "bottom")
    if nas_are_ranked:
        out_mask = np.zeros(self.shape, dtype=bool)
    else:
        out_mask = self._mask.copy()

    if method == "average" or pct:
        return FloatingArray(ranks, mask=out_mask)

    if not nas_are_ranked:
        ranks[self._mask] = 0  # avoid warning on casting
    ranks = ranks.astype("uint64", copy=False)

    from pandas.core.arrays import IntegerArray

    return IntegerArray(ranks, mask=out_mask)
1038+
9951039
@doc(ExtensionArray.duplicated)
9961040
def duplicated(
9971041
self, keep: Literal["first", "last", False] = "first"

pandas/tests/series/methods/test_rank.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ def expected_dtype(dtype, method, pct=False):
6868
exp_dtype = "double[pyarrow]"
6969
else:
7070
exp_dtype = "uint64[pyarrow]"
71+
elif dtype in ["Float64", "Int64"]:
72+
if method == "average" or pct:
73+
exp_dtype = "Float64"
74+
else:
75+
exp_dtype = "UInt64"
7176

7277
return exp_dtype
7378

@@ -257,7 +262,7 @@ def test_rank_nullable_integer(self):
257262
exp = Series([None, 2, None, 3, 3, 2, 3, 1], dtype="Int64")
258263
result = exp.rank(na_option="keep")
259264

260-
expected = Series([np.nan, 2.5, np.nan, 5.0, 5.0, 2.5, 5.0, 1.0])
265+
expected = Series([None, 2.5, None, 5.0, 5.0, 2.5, 5.0, 1.0], dtype="Float64")
261266

262267
tm.assert_series_equal(result, expected)
263268

@@ -302,6 +307,12 @@ def test_rank_tie_methods_on_infs_nans(
302307
exp_dtype = "float64[pyarrow]"
303308
else:
304309
exp_dtype = "uint64[pyarrow]"
310+
elif dtype == "Float64":
311+
# GH#62043
312+
if rank_method == "average":
313+
exp_dtype = "Float64"
314+
else:
315+
exp_dtype = "UInt64"
305316
else:
306317
exp_dtype = "float64"
307318

@@ -327,7 +338,8 @@ def test_rank_tie_methods_on_infs_nans(
327338
result = iseries.rank(
328339
method=rank_method, na_option=na_option, ascending=ascending
329340
)
330-
tm.assert_series_equal(result, Series(expected, dtype=exp_dtype))
341+
exp_ser = Series(expected, dtype=exp_dtype)
342+
tm.assert_series_equal(result, exp_ser)
331343

332344
def test_rank_desc_mix_nans_infs(self):
333345
# GH 19538
@@ -439,7 +451,7 @@ def test_rank_ea_small_values(self):
439451
dtype="Float64",
440452
)
441453
result = ser.rank(method="min")
442-
expected = Series([4, 1, 3, np.nan, 2])
454+
expected = Series([4, 1, 3, NA, 2], dtype="UInt64")
443455
tm.assert_series_equal(result, expected)
444456

445457

0 commit comments

Comments
 (0)