Skip to content

API: mode.nan_is_na to consistently distinguish NaN-vs-NA #62040

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 48 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
5e88fde
BUG: read_csv with engine=pyarrow and numpy-nullable dtype
jbrockmendel Aug 6, 2025
eae6f64
mypy fixup, error message compat for 32bit builds
jbrockmendel Aug 6, 2025
2861b16
minimum version compat
jbrockmendel Aug 6, 2025
5369afa
not-infer-string compat
jbrockmendel Aug 6, 2025
db35a9c
mypy fixup
jbrockmendel Aug 6, 2025
505bfb6
update usage
jbrockmendel Aug 11, 2025
febe83c
CLN: remove redundant check
jbrockmendel Aug 11, 2025
c81cbec
Use Matts idea
jbrockmendel Aug 11, 2025
26a3049
re-xfail
jbrockmendel Aug 12, 2025
a70b429
API: rank with nullable dtypes preserve NA
jbrockmendel Aug 4, 2025
99a71b7
API: improve dtype in df.where with EA other
jbrockmendel Aug 3, 2025
c86747d
GH refs
jbrockmendel Aug 3, 2025
9d222d8
doc fixup
jbrockmendel Aug 3, 2025
6f800b3
BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with tim…
jbrockmendel Jul 3, 2025
514a56f
GH ref
jbrockmendel Jul 3, 2025
fca3c7c
BUG: ArrowEA constructor with timestamp type
jbrockmendel Jul 4, 2025
f20758a
POC: consistent NaN treatment for pyarrow dtypes
jbrockmendel Jun 28, 2025
cc416fa
comment
jbrockmendel Jun 28, 2025
7094d85
Down to 40 failing tests
jbrockmendel Jul 5, 2025
eeb0d32
Fix rank, json tests
jbrockmendel Jul 6, 2025
814d001
CLN: remove outdated
jbrockmendel Jul 6, 2025
5db5e4b
Fix where kludge
jbrockmendel Jul 6, 2025
87536a7
update tests
jbrockmendel Jul 6, 2025
64f4271
Fix remaining tests
jbrockmendel Jul 6, 2025
26d1177
mypy fixup
jbrockmendel Jul 7, 2025
bcb2506
old-numpy compat
jbrockmendel Jul 7, 2025
8f99d05
simplify
jbrockmendel Jul 7, 2025
5abd585
Better option name, fixture
jbrockmendel Jul 31, 2025
70830f7
default True
jbrockmendel Jul 31, 2025
58b3c4f
Patch ops
jbrockmendel Jul 31, 2025
cd7ec33
mypy fixup
jbrockmendel Jul 31, 2025
cf7b229
Test for setitem/construction
jbrockmendel Jul 31, 2025
eb12ea1
update ufunc test
jbrockmendel Jul 31, 2025
f0262ef
Improve rank test skips
jbrockmendel Jul 31, 2025
544faf1
ENH: mode.nan_is_na for numpy-nullable dtypes
jbrockmendel Aug 4, 2025
6c4b68f
update style test
jbrockmendel Aug 4, 2025
90d3a28
update asvs, mypy ignores
jbrockmendel Aug 4, 2025
408aa06
pre-commit fixup
jbrockmendel Aug 4, 2025
9e5ebec
doc fixup
jbrockmendel Aug 4, 2025
0fd2e2d
Remove special-casing
jbrockmendel Aug 4, 2025
7de9f40
comment
jbrockmendel Aug 4, 2025
2f61a58
ruff format
jbrockmendel Aug 5, 2025
36143ad
Set default to True
jbrockmendel Aug 6, 2025
b7ea9ae
whatsnew
jbrockmendel Aug 12, 2025
a625190
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 20, 2025
d471aa8
update _cast_pointwise_result
jbrockmendel Aug 20, 2025
27cd097
update cast_pointwise_result
jbrockmendel Aug 20, 2025
1bb0a4e
Merge branch 'main' into api-nan-vs-na
jbrockmendel Aug 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Better option name, fixture
  • Loading branch information
jbrockmendel committed Aug 12, 2025
commit 5abd585579e9933c31bde6fd07038e1e066fafe6
4 changes: 2 additions & 2 deletions pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ def using_string_dtype() -> bool:
return _mode_options["infer_string"]


def using_pyarrow_strict_nans() -> bool:
def is_nan_na() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["pyarrow_strict_nans"]
return _mode_options["nan_is_na"]
7 changes: 7 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,3 +2116,10 @@ def temp_file(tmp_path):
def monkeysession():
with pytest.MonkeyPatch.context() as mp:
yield mp


@pytest.fixture(params=[True, False])
def using_nan_is_na(request):
    """
    Fixture that runs a test once with ``mode.nan_is_na`` set to True and
    once with it set to False.

    Yields the option value that is active while the test body executes.
    """
    nan_is_na = request.param
    # Keep the option set for the whole test by yielding inside the context.
    with pd.option_context("mode.nan_is_na", nan_is_na):
        yield nan_is_na
4 changes: 2 additions & 2 deletions pandas/core/arrays/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans
from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import NA
Expand Down Expand Up @@ -41,7 +41,7 @@ def to_numpy_dtype_inference(
else:
dtype = arr.dtype.numpy_dtype # type: ignore[union-attr]
if na_value is lib.no_default:
if is_pyarrow and using_pyarrow_strict_nans():
if is_pyarrow and not is_nan_na():
na_value = NA
dtype = np.dtype(object)
else:
Expand Down
26 changes: 22 additions & 4 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import numpy as np

from pandas._config import using_pyarrow_strict_nans
from pandas._config import is_nan_na

from pandas._libs import lib
from pandas._libs.missing import is_pdna_or_none
Expand All @@ -36,6 +36,7 @@

from pandas.core.dtypes.cast import (
can_hold_element,
construct_1d_object_array_from_listlike,
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -555,7 +556,22 @@ def _box_pa_array(
return pa_array

mask = None
if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf":
if is_nan_na():
try:
arr_value = np.asarray(value)
if arr_value.ndim > 1:
# e.g. in test_fixed_size_list the data is a list of lists; ndim > 1
# means every entry was list-like, so there are no scalar (NA) entries.
mask = np.zeros(len(value), dtype=np.bool_)
else:
mask = isna(arr_value)
except ValueError:
# Ragged data that numpy raises on
arr_value = construct_1d_object_array_from_listlike(value)
mask = isna(arr_value)
elif (
getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf"
):
arr_value = np.asarray(value, dtype=object)
# similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
mask = is_pdna_or_none(arr_value)
Expand Down Expand Up @@ -1490,7 +1506,9 @@ def to_numpy(
na_value: object = lib.no_default,
) -> np.ndarray:
original_na_value = na_value
dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
dtype, na_value = to_numpy_dtype_inference(
self, dtype, na_value, self._hasna, is_pyarrow=True
)
pa_type = self._pa_array.type
if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
data = self
Expand Down Expand Up @@ -1522,7 +1540,7 @@ def to_numpy(
or (
original_na_value is lib.no_default
and is_float_dtype(dtype)
and not using_pyarrow_strict_nans()
and is_nan_na()
)
)
):
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,10 +429,11 @@ def is_terminal() -> bool:

with cf.config_prefix("mode"):
cf.register_option(
"pyarrow_strict_nans",
True,
"nan_is_na",
False,
# TODO: Change this to True before merging
"Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA",
"Whether to make ArrowDtype arrays consistently treat NaN as "
"interchangeable with pd.NA",
validator=is_one_of_factory([True, False]),
)

Expand Down
27 changes: 10 additions & 17 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_strict_nans

from pandas._libs import lib
from pandas._libs.tslibs import timezones
from pandas.compat import (
Expand Down Expand Up @@ -277,17 +275,14 @@ def test_compare_scalar(self, data, comparison_op):
self._compare_other(ser, data, comparison_op, data[0])

@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
def test_map(self, data_missing, na_action, using_nan_is_na):
if data_missing.dtype.kind in "mM":
result = data_missing.map(lambda x: x, na_action=na_action)
expected = data_missing.to_numpy(dtype=object)
tm.assert_numpy_array_equal(result, expected)
else:
result = data_missing.map(lambda x: x, na_action=na_action)
if (
data_missing.dtype == "float32[pyarrow]"
and not using_pyarrow_strict_nans()
):
if data_missing.dtype == "float32[pyarrow]" and using_nan_is_na:
# map roundtrips through objects, which converts to float64
expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
else:
Expand Down Expand Up @@ -698,7 +693,7 @@ def test_setitem_preserves_views(self, data):

@pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default])
@pytest.mark.parametrize("engine", ["c", "python"])
def test_EA_types(self, engine, data, dtype_backend, request):
def test_EA_types(self, engine, data, dtype_backend, request, using_nan_is_na):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_decimal(pa_dtype):
request.applymarker(
Expand All @@ -719,7 +714,7 @@ def test_EA_types(self, engine, data, dtype_backend, request):
pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
)
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
if using_pyarrow_strict_nans():
if not using_nan_is_na:
csv_output = df.to_csv(index=False, na_rep="NA")
else:
csv_output = df.to_csv(index=False, na_rep=np.nan)
Expand Down Expand Up @@ -1536,7 +1531,7 @@ def test_astype_errors_ignore():
tm.assert_frame_equal(result, expected)


def test_to_numpy_with_defaults(data):
def test_to_numpy_with_defaults(data, using_nan_is_na):
# GH49973
result = data.to_numpy()

Expand All @@ -1548,21 +1543,19 @@ def test_to_numpy_with_defaults(data):
else:
expected = np.array(data._pa_array)

if data._hasna and (
not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans()
):
if data._hasna and (not is_numeric_dtype(data.dtype) or not using_nan_is_na):
expected = expected.astype(object)
expected[pd.isna(data)] = pd.NA

tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_int_with_na():
def test_to_numpy_int_with_na(using_nan_is_na):
# GH51227: ensure to_numpy does not convert int to float
data = [1, None]
arr = pd.array(data, dtype="int64[pyarrow]")
result = arr.to_numpy()
if using_pyarrow_strict_nans():
if not using_nan_is_na:
expected = np.array([1, pd.NA], dtype=object)
else:
expected = np.array([1, np.nan])
Expand Down Expand Up @@ -3528,10 +3521,10 @@ def test_cast_dictionary_different_value_dtype(arrow_type):
assert result.dtypes.iloc[0] == data_type


def test_map_numeric_na_action():
def test_map_numeric_na_action(using_nan_is_na):
ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
result = ser.map(lambda x: 42, na_action="ignore")
if using_pyarrow_strict_nans():
if not using_nan_is_na:
expected = pd.Series([42.0, 42.0, pd.NA], dtype="object")
else:
expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_strict_nans

import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -61,7 +59,7 @@ def test_convert_dtypes_retain_column_names(self):
tm.assert_index_equal(result.columns, df.columns)
assert result.columns.name == "cols"

def test_pyarrow_dtype_backend(self):
def test_pyarrow_dtype_backend(self, using_nan_is_na):
pa = pytest.importorskip("pyarrow")
df = pd.DataFrame(
{
Expand All @@ -76,7 +74,7 @@ def test_pyarrow_dtype_backend(self):
)
result = df.convert_dtypes(dtype_backend="pyarrow")

item = None if not using_pyarrow_strict_nans() else np.nan
item = None if using_nan_is_na else np.nan
expected = pd.DataFrame(
{
"a": pd.arrays.ArrowExtensionArray(
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/series/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
dtype == "int64"
or dtype == "int64[pyarrow]"
or dtype == "uint64[pyarrow]"
or dtype == "float64[pyarrow]"
or (not using_infer_string and dtype == "str")
):
pytest.skip("int64/str does not support NaN")
Expand Down