Skip to content

API: mode.nan_is_na to consistently distinguish NaN-vs-NA #62040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 39 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
98bedc4
BUG: read_csv with engine=pyarrow and numpy-nullable dtype
jbrockmendel Aug 6, 2025
7aa640d
mypy fixup, error message compat for 32bit builds
jbrockmendel Aug 6, 2025
e5b752e
minimum version compat
jbrockmendel Aug 6, 2025
323414c
not-infer-string compat
jbrockmendel Aug 6, 2025
96bed9d
mypy fixup
jbrockmendel Aug 6, 2025
1fa7e06
API: rank with nullable dtypes preserve NA
jbrockmendel Aug 4, 2025
b7a303a
API: improve dtype in df.where with EA other
jbrockmendel Aug 3, 2025
c3790ca
GH refs
jbrockmendel Aug 3, 2025
eb01ef7
doc fixup
jbrockmendel Aug 3, 2025
1bcfbeb
BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with tim…
jbrockmendel Jul 3, 2025
11df1f9
GH ref
jbrockmendel Jul 3, 2025
6060386
BUG: ArrowEA constructor with timestamp type
jbrockmendel Jul 4, 2025
5e9eba7
POC: consistent NaN treatment for pyarrow dtypes
jbrockmendel Jun 28, 2025
42c1190
comment
jbrockmendel Jun 28, 2025
ca686b4
Down to 40 failing tests
jbrockmendel Jul 5, 2025
6cf66ef
Fix rank, json tests
jbrockmendel Jul 6, 2025
7687f84
CLN: remove outdated
jbrockmendel Jul 6, 2025
f79950d
Fix where kludge
jbrockmendel Jul 6, 2025
57cbdaa
update tests
jbrockmendel Jul 6, 2025
e3fc389
Fix remaining tests
jbrockmendel Jul 6, 2025
4108cc0
mypy fixup
jbrockmendel Jul 7, 2025
b220433
old-numpy compat
jbrockmendel Jul 7, 2025
6ed24a0
simplify
jbrockmendel Jul 7, 2025
05d4a94
Better option name, fixture
jbrockmendel Jul 31, 2025
cbc14d5
default True
jbrockmendel Jul 31, 2025
a238601
Patch ops
jbrockmendel Jul 31, 2025
3f15ca8
mypy fixup
jbrockmendel Jul 31, 2025
a5d3848
Test for setitem/construction
jbrockmendel Jul 31, 2025
670a940
update ufunc test
jbrockmendel Jul 31, 2025
3a032a4
Improve rank test skips
jbrockmendel Jul 31, 2025
c59b9de
ENH: mode.nan_is_na for numpy-nullable dtypes
jbrockmendel Aug 4, 2025
95f9ad9
update style test
jbrockmendel Aug 4, 2025
1e3d105
update asvs, mypy ignores
jbrockmendel Aug 4, 2025
cefeb6b
pre-commit fixup
jbrockmendel Aug 4, 2025
bc9c889
doc fixup
jbrockmendel Aug 4, 2025
74ab221
Remove special-casing
jbrockmendel Aug 4, 2025
0773a86
comment
jbrockmendel Aug 4, 2025
35a3c14
ruff format
jbrockmendel Aug 5, 2025
71d1c03
Set default to True
jbrockmendel Aug 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ class SortIntegerArray:
params = [10**3, 10**5]

def setup(self, N):
data = np.arange(N, dtype=float)
data[40] = np.nan
data = np.arange(N, dtype=float).astype(object)
data[40] = pd.NA
self.array = pd.array(data, dtype="Int64")

def time_argsort(self, N):
Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np

from pandas import (
NA,
DataFrame,
Index,
MultiIndex,
Expand Down Expand Up @@ -445,6 +446,8 @@ def setup(self, inplace, dtype):
values[::2] = np.nan
if dtype == "Int64":
values = values.round()
values = values.astype(object)
values[::2] = NA
self.df = DataFrame(values, dtype=dtype)
self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict()

Expand Down
4 changes: 4 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans):
null_vals = vals.astype(float, copy=True)
null_vals[::2, :] = np.nan
null_vals[::3, :] = np.nan
if dtype in ["Int64", "Float64"]:
null_vals = null_vals.astype(object)
null_vals[::2, :] = NA
null_vals[::3, :] = NA
df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
df["key"] = keys
self.df = df
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ or convert from existing pandas data:

.. ipython:: python

s1 = pd.Series([1, 2, np.nan], dtype="Int64")
s1 = pd.Series([1, 2, pd.NA], dtype="Int64")
s1
s2 = s1.astype("string")
s2
Expand Down
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series``

.. ipython:: python

s = pd.Series([1, 2, np.nan], dtype='Int64')
s = pd.Series([1, 2, pd.NA], dtype='Int64')
s


Expand Down Expand Up @@ -166,7 +166,7 @@ See the :ref:`dtypes docs <basics.dtypes>` for more on extension arrays.

.. ipython:: python

pd.array([1, 2, np.nan], dtype='Int64')
pd.array([1, 2, pd.NA], dtype='Int64')
pd.array(['a', 'b', 'c'], dtype='category')

Passing data for which there isn't dedicated extension type (e.g. float, integer, etc.)
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
5 changes: 5 additions & 0 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@
def using_string_dtype() -> bool:
    """Return the ``infer_string`` entry of the ``future`` config section."""
    return _global_config["future"]["infer_string"]


def is_nan_na() -> bool:
    """Return the ``nan_is_na`` entry of the ``mode`` config section."""
    return _global_config["mode"]["nan_is_na"]
1 change: 1 addition & 0 deletions pandas/_libs/missing.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ...
def checknull(val: object) -> bool: ...
def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
18 changes: 18 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_pdna_or_none(values: ndarray) -> ndarray:
    """
    Build a boolean mask marking entries that are ``None`` or ``C_NA``.

    Matching is by identity only, so other missing-value markers such as
    float NaN are left unflagged.

    Parameters
    ----------
    values : ndarray
        Array whose elements are inspected one at a time via ``values[i]``
        for ``i`` in ``range(len(values))``.

    Returns
    -------
    ndarray
        Boolean view of a uint8 array of length ``len(values)``; True where
        the element is ``None`` or ``C_NA``.
    """
    cdef:
        Py_ssize_t idx
        Py_ssize_t n = len(values)
        ndarray[uint8_t] mask = np.zeros(n, dtype=np.uint8)
        object item

    for idx in range(n):
        item = values[idx]
        # Identity comparison: only the None / C_NA singletons count.
        if item is C_NA or item is None:
            mask[idx] = 1

    # Reinterpret the uint8 buffer as bool without copying.
    return mask.view(bool)


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ from csv import (
)
import warnings

from pandas._config import is_nan_na

from pandas.util._exceptions import find_stack_level

from pandas import StringDtype
Expand Down Expand Up @@ -43,7 +45,6 @@ from libc.string cimport (
strncpy,
)


import numpy as np

cimport numpy as cnp
Expand Down Expand Up @@ -1461,7 +1462,7 @@ def _maybe_upcast(
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na()))

return arr

Expand Down
7 changes: 7 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,3 +2116,10 @@ def temp_file(tmp_path):
def monkeysession():
with pytest.MonkeyPatch.context() as mp:
yield mp


@pytest.fixture(params=[True, False])
def using_nan_is_na(request):
    """
    Parametrized fixture that runs the test under both settings of the
    ``mode.nan_is_na`` option and yields the active setting.
    """
    nan_is_na = request.param
    # Scope the option to the test body; the context manager is exited
    # once the fixture is torn down.
    with pd.option_context("mode.nan_is_na", nan_is_na):
        yield nan_is_na
14 changes: 12 additions & 2 deletions pandas/core/arrays/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

import numpy as np

from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import NA
from pandas.errors import LossySetitemError

from pandas.core.dtypes.cast import np_can_hold_element
Expand All @@ -21,7 +24,10 @@


def to_numpy_dtype_inference(
arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool
arr: ArrayLike,
dtype: npt.DTypeLike | None,
na_value,
hasna: bool,
) -> tuple[npt.DTypeLike, Any]:
if dtype is None and is_numeric_dtype(arr.dtype):
dtype_given = False
Expand All @@ -34,7 +40,11 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
na_value = np.nan
if not is_nan_na():
na_value = NA
dtype = np.dtype(object)
else:
na_value = np.nan
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
elif dtype is not None:
Expand Down
Loading
Loading