From 98bedc4635718eefe50dfcf109f7c45d5f92f49a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 17:49:00 -0700 Subject: [PATCH 01/39] BUG: read_csv with engine=pyarrow and numpy-nullable dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 74 +++++++++++++++---- .../io/parser/dtypes/test_dtypes_basic.py | 4 - pandas/tests/io/parser/test_na_values.py | 17 ++++- 4 files changed, 74 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec5027840dfd5..0f8e026761db0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -814,6 +814,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8cadde1ad6537..e446f7f4fb897 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,6 +3,8 @@ from typing import TYPE_CHECKING import warnings +import numpy as np + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -12,8 +14,13 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, +) from pandas.core.dtypes.inference import is_integer +from pandas.core.arrays.string_ import StringDtype + from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase @@ -140,20 +147,7 @@ def handle_warning(invalid_row) -> str: "encoding": self.encoding, } - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: - """ - Processes data read in based on kwargs. - - Parameters - ---------- - frame: DataFrame - The DataFrame to process. - - Returns - ------- - DataFrame - The processed DataFrame. - """ + def _finalize_column_names(self, frame: DataFrame) -> DataFrame: num_cols = len(frame.columns) multi_index_named = True if self.header is None: @@ -196,6 +190,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: if self.header is None and not multi_index_named: frame.index.names = [None] * len(frame.index.names) + return frame + + def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + """ + Processes data read in based on kwargs. + + Parameters + ---------- + frame: DataFrame + The DataFrame to process. + + Returns + ------- + DataFrame + The processed DataFrame. + """ + if self.dtype is not None: # Ignore non-existent columns from dtype mapping # like other parsers do @@ -282,6 +293,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) + workaround = False + pass_backend = dtype_backend + if self.dtype is not None and dtype_backend != "pyarrow": + # We pass dtype_backend="pyarrow" and subsequently cast + # to avoid lossy conversion e.g. GH#56136 + workaround = True + pass_backend = "numpy_nullable" + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -289,7 +308,32 @@ def read(self) -> DataFrame: DeprecationWarning, ) frame = arrow_table_to_pandas( - table, dtype_backend=dtype_backend, null_to_int64=True + table, dtype_backend=pass_backend, null_to_int64=True ) + frame = self._finalize_column_names(frame) + + if workaround and dtype_backend != "numpy_nullable": + old_dtype = self.dtype + if not isinstance(old_dtype, dict): + # e.g. test_categorical_dtype_utf16 + old_dtype = dict.fromkeys(frame.columns, old_dtype) + + # _finalize_pandas_output will call astype, but we need to make + # sure all keys are populated appropriately. + new_dtype = {} + for key in frame.columns: + ser = frame[key] + if isinstance(ser.dtype, BaseMaskedDtype): + new_dtype[key] = ser.dtype.numpy_dtype + elif isinstance(ser.dtype, StringDtype): + # We cast here in case the user passed "category" in + # order to get the correct dtype.categories.dtype + # e.g. test_categorical_dtype_utf16 + new_dtype[key] = StringDtype(na_value=np.nan) + frame[key] = frame[key].astype(new_dtype[key]) + + new_dtype.update(old_dtype) + self.dtype = new_dtype + return self._finalize_pandas_output(frame) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 75b7cf0d42cb8..e4563afc631c5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) -# pyarrow engine failing: -# https://github.com/pandas-dev/pandas/issues/56136 -@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 213fa2c01cef4..d60074243a526 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) -def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request): # see gh-20377 parser = all_parsers + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched shape") + request.applymarker(mark) + data = "a,b,c\n1,,3\n4,5,6" # na_filter=True --> missing value becomes NaN. @@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers): True False """ - with pytest.raises(ValueError, match="convert|NoneType"): + msg = ( + "cannot safely convert passed user dtype of int64 for " + " dtyped data in column 0 due to NA values" + ) + if parser.engine == "python": + msg = "Unable to convert column 0 to type int64" + elif parser.engine == "pyarrow": + msg = r"cannot convert NA to integer" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype="int") From 7aa640d2c30c4a99170110c4b97bd816649147c3 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 18:14:49 -0700 Subject: [PATCH 02/39] mypy fixup, error message compat for 32bit builds --- pandas/io/parsers/arrow_parser_wrapper.py | 3 ++- pandas/tests/io/parser/test_na_values.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index e446f7f4fb897..75cb16a93c493 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -330,7 +330,8 @@ def read(self) -> DataFrame: # We cast here in case the user passed "category" in # order to get the correct dtype.categories.dtype # e.g. test_categorical_dtype_utf16 - new_dtype[key] = StringDtype(na_value=np.nan) + sdt = StringDtype(na_value=np.nan) + new_dtype[key] = sdt # type: ignore[assignment] frame[key] = frame[key].astype(new_dtype[key]) new_dtype.update(old_dtype) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d60074243a526..d0cc92c5a73af 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -802,11 +802,11 @@ def test_bool_and_nan_to_int(all_parsers): False """ msg = ( - "cannot safely convert passed user dtype of int64 for " + "cannot safely convert passed user dtype of int(64|32) for " " dtyped data in column 0 due to NA values" ) if parser.engine == "python": - msg = "Unable to convert column 0 to type int64" + msg = "Unable to convert column 0 to type int(64|32)" elif parser.engine == "pyarrow": msg = r"cannot convert NA to integer" with pytest.raises(ValueError, match=msg): From e5b752ef6580486b8273f921d9a246bc32180bf8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 19:06:31 -0700 Subject: [PATCH 03/39] minimum version compat --- pandas/tests/io/parser/test_na_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d0cc92c5a73af..5f08f5ef466cf 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -803,7 +803,7 @@ def test_bool_and_nan_to_int(all_parsers): """ msg = ( "cannot safely convert passed user dtype of int(64|32) for " - " dtyped data in column 0 due to NA values" + " dtyped data in column 0 due to NA values" ) if parser.engine == "python": msg = "Unable to convert column 0 to type int(64|32)" From 323414c504446a30b3aa9a4f6fbdc286273a1a8d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 20:57:03 -0700 Subject: [PATCH 04/39] not-infer-string compat --- pandas/io/parsers/arrow_parser_wrapper.py | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 75cb16a93c493..039841747c9a8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -5,6 +5,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -13,7 +15,10 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ( BaseMaskedDtype, ) @@ -326,13 +331,29 @@ def read(self) -> DataFrame: ser = frame[key] if isinstance(ser.dtype, BaseMaskedDtype): new_dtype[key] = ser.dtype.numpy_dtype + if ( + key in old_dtype + and not using_string_dtype() + and is_string_dtype(old_dtype[key]) + and not isinstance(old_dtype[key], StringDtype) + and ser.array._hasna + ): + # Cast to make sure we get "NaN" string instead of "NA" + frame[key] = ser.astype(old_dtype[key]) + frame.loc[ser.isna(), key] = np.nan + old_dtype[key] = object # Avoid re-casting elif isinstance(ser.dtype, StringDtype): # We cast here in case the user passed "category" in # order to get the correct dtype.categories.dtype # e.g. test_categorical_dtype_utf16 - sdt = StringDtype(na_value=np.nan) + if not using_string_dtype(): + sdt = np.dtype(object) + frame[key] = ser.astype(sdt) + frame.loc[ser.isna(), key] = np.nan + else: + sdt = StringDtype(na_value=np.nan) + frame[key] = frame[key].astype(sdt) new_dtype[key] = sdt # type: ignore[assignment] - frame[key] = frame[key].astype(new_dtype[key]) new_dtype.update(old_dtype) self.dtype = new_dtype From 96bed9d5258a2b45858c4c16d5301dbcfe666882 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 6 Aug 2025 07:22:05 -0700 Subject: [PATCH 05/39] mypy fixup --- pandas/io/parsers/arrow_parser_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 039841747c9a8..09759d4127ac8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -351,9 +351,9 @@ def read(self) -> DataFrame: frame[key] = ser.astype(sdt) frame.loc[ser.isna(), key] = np.nan else: - sdt = StringDtype(na_value=np.nan) + sdt = StringDtype(na_value=np.nan) # type: ignore[assignment] frame[key] = frame[key].astype(sdt) - new_dtype[key] = sdt # type: ignore[assignment] + new_dtype[key] = sdt new_dtype.update(old_dtype) self.dtype = new_dtype From 1fa7e06a3d24cd3373d7fae9277ed22d26446a36 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 11:08:46 -0700 Subject: [PATCH 06/39] API: rank with nullable dtypes preserve NA --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0f8e026761db0..8be62f04f1c6e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -97,7 +97,6 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From b7a303a8022b3169c93f2926870bfa2a97e89f20 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Aug 2025 14:08:00 -0700 Subject: [PATCH 07/39] API: improve dtype in df.where with EA other --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8be62f04f1c6e..8c775193c3ead 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -97,7 +97,7 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) - +- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`??`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From c3790ca13be6de1a502e54d56ec98a7a6da3edd2 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Aug 2025 14:09:58 -0700 Subject: [PATCH 08/39] GH refs --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8c775193c3ead..4de2c73a493e8 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -97,7 +97,6 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`??`) .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From eb01ef743fc09dd2bd93bc7a55780124a12291da Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Aug 2025 14:23:55 -0700 Subject: [PATCH 09/39] doc fixup --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4de2c73a493e8..0f8e026761db0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -97,6 +97,8 @@ Other enhancements - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) - Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) +- + .. --------------------------------------------------------------------------- .. _whatsnew_300.notable_bug_fixes: From 1bcfbeb78a7f7ceb1b9bb778555ab724c7976ae1 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:47:02 -0700 Subject: [PATCH 10/39] BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0f8e026761db0..e0eb8194f3235 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -721,6 +721,8 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) +- Timedelta ^^^^^^^^^ From 11df1f95fbdc321a11e3e47cc05721d2699cef7e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:49:04 -0700 Subject: [PATCH 11/39] GH ref --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e0eb8194f3235..38ba80c0ace5d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -721,7 +721,6 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) - Timedelta From 606038602f57cd496294c5223ef5d5ba103989c8 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Jul 2025 08:21:07 -0700 Subject: [PATCH 12/39] BUG: ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 38ba80c0ace5d..0f8e026761db0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -721,7 +721,6 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Timedelta ^^^^^^^^^ From 5e9eba70acd5dc2d0063df0fa9fffbfcd5885531 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:07:44 -0700 Subject: [PATCH 13/39] POC: consistent NaN treatment for pyarrow dtypes --- pandas/_libs/parsers.pyx | 2 +- pandas/core/arrays/arrow/array.py | 54 ++++++++++++++++++------ pandas/core/arrays/string_.py | 8 +++- pandas/core/generic.py | 19 ++++++++- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/groupby/test_reductions.py | 6 ++- pandas/tests/series/methods/test_rank.py | 9 ++++ 7 files changed, 81 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 5b94f45490da4..1f5813940c058 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1461,7 +1461,7 @@ def _maybe_upcast( if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy(na_value=None) - arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) + arr = ArrowExtensionArray(pa.array(arr)) return arr diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ad1d576bfec32..1fc97b41b8d4f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -17,6 +17,7 @@ import numpy as np from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -353,7 +354,7 @@ def _from_sequence_of_strings( # duration to string casting behavior mask = isna(scalars) if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pa.array(strings, type=pa.string()) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) @@ -374,7 +375,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string(), from_pandas=True) + scalars = pa.array(strings, type=pa.string()) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -386,6 +387,13 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") + if not pa.types.is_decimal(pa_type): + # TODO: figure out why doing this cast breaks with decimal dtype + # in test_from_sequence_of_strings_pa_array + mask = strings.is_null() + scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) + # TODO: could we just do strings.cast(pa_type)? + else: raise NotImplementedError( f"Converting strings to {pa_type} is not implemented." @@ -428,7 +436,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """ if isinstance(value, pa.Scalar): pa_scalar = value - elif isna(value): + elif isna(value) and not lib.is_float(value): pa_scalar = pa.scalar(None, type=pa_type) else: # Workaround https://github.com/apache/arrow/issues/37291 @@ -445,7 +453,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: value = value.as_unit(pa_type.unit) value = value._value - pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + pa_scalar = pa.scalar(value, type=pa_type) if pa_type is not None and pa_scalar.type != pa_type: pa_scalar = pa_scalar.cast(pa_type) @@ -477,6 +485,13 @@ def _box_pa_array( if copy: value = value.copy() pa_array = value.__arrow_array__() + + elif hasattr(value, "__arrow_array__"): + # e.g. StringArray + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: if ( isinstance(value, np.ndarray) @@ -530,11 +545,24 @@ def _box_pa_array( pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask) return pa_array + mask = None + if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": + # similar to isna(value) but exclude NaN + # TODO: cythonize! + mask = np.array([x is NA or x is None for x in value], dtype=bool) + + from_pandas = False + if pa.types.is_integer(pa_type): + # If user specifically asks to cast a numpy float array with NaNs + # to pyarrow integer, we'll treat those NaNs as NA + from_pandas = True try: - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array( + value, type=pa_type, mask=mask, from_pandas=from_pandas + ) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, from_pandas=True) + pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -542,7 +570,7 @@ def _box_pa_array( value = to_timedelta(value) value = value.to_numpy() - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array(value, type=pa_type) if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -1208,7 +1236,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -2015,7 +2043,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") chunks = [ *self._pa_array[:key].chunks, - pa.array([value], type=self._pa_array.type, from_pandas=True), + pa.array([value], type=self._pa_array.type), *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() @@ -2069,7 +2097,7 @@ def _rank_calc( pa_type = pa.float64() else: pa_type = pa.uint64() - result = pa.array(ranked, type=pa_type, from_pandas=True) + result = pa.array(ranked, type=pa_type) return result data = self._pa_array.combine_chunks() @@ -2321,7 +2349,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: right, right_type = _to_numpy_and_type(right) pa_type = left_type or right_type result = np.where(cond, left, right) - return pa.array(result, type=pa_type, from_pandas=True) + return pa.array(result, type=pa_type) @classmethod def _replace_with_mask( @@ -2364,7 +2392,7 @@ def _replace_with_mask( replacements = replacements.as_py() result = np.array(values, dtype=object) result[mask] = replacements - return pa.array(result, type=values.type, from_pandas=True) + return pa.array(result, type=values.type) # ------------------------------------------------------------------ # GroupBy Methods @@ -2443,7 +2471,7 @@ def _groupby_op( return type(self)(pa_result) else: # DatetimeArray, TimedeltaArray - pa_result = pa.array(result, from_pandas=True) + pa_result = pa.array(result) return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 198dc4c483277..719686ab71a29 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -502,6 +502,12 @@ def _str_map_str_or_object( if self.dtype.storage == "pyarrow": import pyarrow as pa + # TODO: shouldn't this already be caught my passed mask? + # it isn't in test_extract_expand_capture_groups_index + # mask = mask | np.array( + # [x is libmissing.NA for x in result], dtype=bool + # ) + result = pa.array( result, mask=mask, type=pa.large_string(), from_pandas=True ) @@ -754,7 +760,7 @@ def __arrow_array__(self, type=None): values = self._ndarray.copy() values[self.isna()] = None - return pa.array(values, type=type, from_pandas=True) + return pa.array(values, type=type) def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c407b03965df..4707cb28ca060 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9919,7 +9919,7 @@ def where( def where( self, cond, - other=np.nan, + other=lib.no_default, *, inplace: bool = False, axis: Axis | None = None, @@ -10077,6 +10077,23 @@ def where( stacklevel=2, ) + if other is lib.no_default: + if self.ndim == 1: + if isinstance(self.dtype, ExtensionDtype): + other = self.dtype.na_value + else: + other = np.nan + else: + if self._mgr.nblocks == 1 and isinstance( + self._mgr.blocks[0].values.dtype, ExtensionDtype + ): + # FIXME: checking this is kludgy! + other = self._mgr.blocks[0].values.dtype.na_value + else: + # FIXME: the same problem we had with Series will now + # show up column-by-column! + other = np.nan + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c3e1d33ec93df..4b322466a8b62 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -721,7 +721,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) + csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA? if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index e9527ed3a9c0e..e60e7d6bc05d4 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -379,8 +379,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): df = DataFrame( { "a": [2, 1, 1, 2, 3, 3], - "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], - "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + # TODO: test that has mixed na_value and NaN either working for + # float or raising for int? + "b": [na_value, 3.0, na_value, 4.0, na_value, na_value], + "c": [na_value, 3.0, na_value, 4.0, na_value, na_value], }, dtype=any_real_nullable_dtype, ) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index ecd52b2c8498a..49142a859e434 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -280,6 +280,13 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) + if dtype == "float64[pyarrow]": + # the NaNs are not treated as NA + exp = exp.copy() + if method == "average": + exp[np.isnan(ser)] = 9.5 + elif method == "dense": + exp[np.isnan(ser)] = 6 tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @@ -331,6 +338,8 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] + elif dtype == "float64[pyarrow]": + order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] From 42c1190f4b273af2014b3a45ef1b212af2ef656d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:23:00 -0700 Subject: [PATCH 14/39] comment --- pandas/tests/extension/base/setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 185d6d750cace..99ab5d2f7e86f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -422,6 +422,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) + # FIXME: Breaks for pyarrow float dtype bc df.values changes NAs to NaN df.iloc[:] = df.values tm.assert_frame_equal(df, orig) From ca686b4ae4f2ed6006251978ee9265dd8eb9652e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Jul 2025 09:41:02 -0700 Subject: [PATCH 15/39] Down to 40 failing tests --- pandas/_config/__init__.py | 5 +++ pandas/_libs/missing.pyi | 1 + pandas/_libs/missing.pyx | 18 ++++++++ pandas/core/arrays/_utils.py | 15 ++++++- pandas/core/arrays/arrow/array.py | 66 +++++++++++++++++++--------- pandas/core/arrays/base.py | 3 ++ pandas/core/arrays/masked.py | 4 +- pandas/core/config_init.py | 9 ++++ pandas/tests/extension/test_arrow.py | 14 ++++-- 9 files changed, 109 insertions(+), 26 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 463e8af7cc561..fbf388224254f 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -33,3 +33,8 @@ def using_string_dtype() -> bool: _mode_options = _global_config["future"] return _mode_options["infer_string"] + + +def using_pyarrow_strict_nans() -> bool: + _mode_options = _global_config["mode"] + return _mode_options["pyarrow_strict_nans"] diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6bf30a03cef32..6c76fe49330b6 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,3 +14,4 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... +def is_pdna_or_none(values: npt.ndarray) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index c7f905c4d0be0..164a47cb5adb7 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -249,6 +249,24 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_pdna_or_none(values: ndarray) -> ndarray: + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if val is None or val is C_NA: + result[i] = True + return result.view(bool) + + @cython.wraparound(False) @cython.boundscheck(False) def is_numeric_na(values: ndarray) -> ndarray: diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 6b46396d5efdf..9adde3846ca03 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -7,7 +7,10 @@ import numpy as np +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib +from pandas._libs.missing import NA from pandas.errors import LossySetitemError from pandas.core.dtypes.cast import np_can_hold_element @@ -21,7 +24,11 @@ def to_numpy_dtype_inference( - arr: ArrayLike, dtype: npt.DTypeLike | None, na_value, hasna: bool + arr: ArrayLike, + dtype: npt.DTypeLike | None, + na_value, + hasna: bool, + is_pyarrow: bool = True, ) -> tuple[npt.DTypeLike, Any]: if dtype is None and is_numeric_dtype(arr.dtype): dtype_given = False @@ -34,7 +41,11 @@ def to_numpy_dtype_inference( else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] if na_value is lib.no_default: - na_value = np.nan + if is_pyarrow and using_pyarrow_strict_nans(): + na_value = NA + dtype = np.dtype(object) + else: + na_value = np.nan else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1fc97b41b8d4f..9a969ba352122 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,8 +16,10 @@ import numpy as np +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib -from pandas._libs.missing import NA +from pandas._libs.missing import is_pdna_or_none from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -326,6 +328,11 @@ def _from_sequence_of_strings( """ Construct a new ExtensionArray from a sequence of strings. """ + mask = isna(strings) + + if isinstance(strings, cls): + strings = strings._pa_array + pa_type = to_pyarrow_type(dtype) if ( pa_type is None @@ -344,22 +351,35 @@ def _from_sequence_of_strings( from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise").date + + if isinstance(strings, cls): + # Avoid an object path + # TODO: this assumes that pyarrows str->date casting is the + # same as to_datetime. Is that a fair assumption? + scalars = strings._pa_array.cast(pa_type) + else: + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta scalars = to_timedelta(strings, errors="raise") + if pa_type.unit != "ns": # GH51175: test_from_sequence_of_strings_pa_array # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior mask = isna(scalars) - if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string()) + if isinstance(strings, cls): + strings = strings._pa_array + elif not isinstance(strings, (pa.Array, pa.ChunkedArray)): + strings = pa.array(strings, type=pa.string(), mask=mask) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) except pa.ArrowInvalid: pass + elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time @@ -375,7 +395,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string()) + scalars = pa.array(strings, type=pa.string(), mask=mask) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -387,12 +407,16 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") - if not pa.types.is_decimal(pa_type): + if not pa.types.is_decimal(pa_type) and isinstance( + strings, (pa.Array, pa.ChunkedArray) + ): # TODO: figure out why doing this cast breaks with decimal dtype # in test_from_sequence_of_strings_pa_array mask = strings.is_null() scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) # TODO: could we just do strings.cast(pa_type)? + elif mask is not None: + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) else: raise NotImplementedError( @@ -546,23 +570,20 @@ def _box_pa_array( return pa_array mask = None - if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": - # similar to isna(value) but exclude NaN - # TODO: cythonize! - mask = np.array([x is NA or x is None for x in value], dtype=bool) - - from_pandas = False - if pa.types.is_integer(pa_type): - # If user specifically asks to cast a numpy float array with NaNs - # to pyarrow integer, we'll treat those NaNs as NA - from_pandas = True + if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf": + try: + arr_value = np.asarray(value) + except ValueError: + # e.g. list dtype with mixed-length lists + arr_value = np.asarray(value, dtype=object) + # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like + mask = is_pdna_or_none(arr_value) + try: - pa_array = pa.array( - value, type=pa_type, mask=mask, from_pandas=from_pandas - ) + pa_array = pa.array(value, type=pa_type, mask=mask) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) + pa_array = pa.array(value, mask=mask) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -1517,7 +1538,11 @@ def to_numpy( pa.types.is_floating(pa_type) and ( na_value is np.nan - or (original_na_value is lib.no_default and is_float_dtype(dtype)) + or ( + original_na_value is lib.no_default + and is_float_dtype(dtype) + and not using_pyarrow_strict_nans() + ) ) ): result = data._pa_array.to_numpy() @@ -2390,6 +2415,7 @@ def _replace_with_mask( replacements = np.array(replacements, dtype=object) elif isinstance(replacements, pa.Scalar): replacements = replacements.as_py() + result = np.array(values, dtype=object) result[mask] = replacements return pa.array(result, type=values.type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bfa2309bb023a..f71d2480e45e9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -778,6 +778,9 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) + # if dtype.kind == "U": + # dtype = np.dtype(object) + # return self.to_numpy(dtype=dtype, copy=copy) if not copy: return np.asarray(self, dtype=dtype) else: diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 9c5965951da68..7f924db0dcc3b 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -497,7 +497,9 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + dtype, na_value = to_numpy_dtype_inference( + self, dtype, na_value, hasna, is_pyarrow=False + ) if dtype is None: dtype = object diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index bf7e8fb02b58e..02b600eb5fee4 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -427,6 +427,15 @@ def is_terminal() -> bool: validator=is_one_of_factory([True, False, "warn"]), ) +with cf.config_prefix("mode"): + cf.register_option( + "pyarrow_strict_nans", + True, + # TODO: Change this to False before merging + "Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA", + validator=is_one_of_factory([True, False]), + ) + # user warnings chained_assignment = """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4b322466a8b62..3be812f9c1562 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,6 +32,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_strict_nans + from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -721,7 +723,10 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA? + if using_pyarrow_strict_nans(): + csv_output = df.to_csv(index=False, na_rep="NA") + else: + csv_output = df.to_csv(index=False, na_rep=np.nan) if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: @@ -1512,7 +1517,8 @@ def test_pickle_roundtrip(data): def test_astype_from_non_pyarrow(data): # GH49795 - pd_array = data._pa_array.to_pandas().array + np_arr = data.to_numpy() + pd_array = pd.array(np_arr, dtype=np_arr.dtype) result = pd_array.astype(data.dtype) assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) @@ -1546,7 +1552,9 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna and not is_numeric_dtype(data.dtype): + if data._hasna and ( + not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans() + ): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA From 6cf66ef03ee7073f336b2194dc8d0f603ba3df2c Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 10:17:51 -0700 Subject: [PATCH 16/39] Fix rank, json tests --- pandas/io/json/_json.py | 14 +++++++++++++ pandas/tests/extension/test_arrow.py | 5 ++++- pandas/tests/series/methods/test_rank.py | 25 ++++++++++++++++++++---- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 59911a57acc02..53a10c7a680f6 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -994,6 +994,13 @@ def _read_ujson(self) -> DataFrame | Series: else: obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: + if self.dtype_backend == "pyarrow": + # The construction above takes "null" to NaN, which we want to + # convert to NA. But .convert_dtypes to pyarrow doesn't allow + # that, so we do a 2-step conversion through numpy-nullable. + obj = obj.convert_dtypes( + infer_objects=False, dtype_backend="numpy_nullable" + ) return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) @@ -1071,6 +1078,13 @@ def __next__(self) -> DataFrame | Series: raise ex if self.dtype_backend is not lib.no_default: + if self.dtype_backend == "pyarrow": + # The construction above takes "null" to NaN, which we want to + # convert to NA. But .convert_dtypes to pyarrow doesn't allow + # that, so we do a 2-step conversion through numpy-nullable. + obj = obj.convert_dtypes( + infer_objects=False, dtype_backend="numpy_nullable" + ) return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3be812f9c1562..60a5a8d9081bc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -285,7 +285,10 @@ def test_map(self, data_missing, na_action): tm.assert_numpy_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == "float32[pyarrow]": + if ( + data_missing.dtype == "float32[pyarrow]" + and not using_pyarrow_strict_nans() + ): # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 49142a859e434..e8c49dcce31e0 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -275,7 +275,12 @@ def test_rank_signature(self): def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): + if ( + dtype == "int64" + or dtype == "int64[pyarrow]" + or dtype == "uint64[pyarrow]" + or (not using_infer_string and dtype == "str") + ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) @@ -287,7 +292,15 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): exp[np.isnan(ser)] = 9.5 elif method == "dense": exp[np.isnan(ser)] = 6 - tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) + elif method == "max": + exp[np.isnan(ser)] = 10 + elif method == "min": + exp[np.isnan(ser)] = 9 + elif method == "first": + exp[np.isnan(ser)] = [9, 10] + + expected = Series(exp, dtype=expected_dtype(dtype, method)) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -406,8 +419,12 @@ def test_rank_dense_method(self, dtype, ser, exp): def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if dtype == "int64" or (not using_infer_string and dtype == "str"): - s = ser.dropna() + if ( + dtype == "int64" + or dtype == "int64[pyarrow]" + or (not using_infer_string and dtype == "str") + ): + s = ser.dropna().astype(dtype) else: s = ser.astype(dtype) From 7687f84aa1d1757638ad468c2278a0a81db0fb05 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 10:33:51 -0700 Subject: [PATCH 17/39] CLN: remove outdated --- pandas/core/arrays/arrow/array.py | 12 ++---------- pandas/core/arrays/base.py | 3 --- pandas/tests/extension/base/setitem.py | 1 - 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9a969ba352122..25046c17a9555 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -352,13 +352,7 @@ def _from_sequence_of_strings( scalars = to_datetime(strings, errors="raise").date - if isinstance(strings, cls): - # Avoid an object path - # TODO: this assumes that pyarrows str->date casting is the - # same as to_datetime. Is that a fair assumption? - scalars = strings._pa_array.cast(pa_type) - else: - scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta @@ -370,9 +364,7 @@ def _from_sequence_of_strings( # attempt to parse as int64 reflecting pyarrow's # duration to string casting behavior mask = isna(scalars) - if isinstance(strings, cls): - strings = strings._pa_array - elif not isinstance(strings, (pa.Array, pa.ChunkedArray)): + if not isinstance(strings, (pa.Array, pa.ChunkedArray)): strings = pa.array(strings, type=pa.string(), mask=mask) strings = pc.if_else(mask, None, strings) try: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f71d2480e45e9..bfa2309bb023a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -778,9 +778,6 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy) - # if dtype.kind == "U": - # dtype = np.dtype(object) - # return self.to_numpy(dtype=dtype, copy=copy) if not copy: return np.asarray(self, dtype=dtype) else: diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 99ab5d2f7e86f..185d6d750cace 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -422,7 +422,6 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) - # FIXME: Breaks for pyarrow float dtype bc df.values changes NAs to NaN df.iloc[:] = df.values tm.assert_frame_equal(df, orig) From f79950d9ffa8fb80e0056c300c35c2c196dd6680 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:12:03 -0700 Subject: [PATCH 18/39] Fix where kludge --- pandas/core/arrays/arrow/array.py | 2 ++ pandas/core/generic.py | 17 ----------------- pandas/tests/extension/test_arrow.py | 7 +++++-- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 25046c17a9555..59ffce66602e5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -407,6 +407,8 @@ def _from_sequence_of_strings( mask = strings.is_null() scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) # TODO: could we just do strings.cast(pa_type)? + elif isinstance(strings, (pa.Array, pa.ChunkedArray)): + scalars = strings.cast(pa_type) elif mask is not None: scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4707cb28ca060..a2f652d77246f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10077,23 +10077,6 @@ def where( stacklevel=2, ) - if other is lib.no_default: - if self.ndim == 1: - if isinstance(self.dtype, ExtensionDtype): - other = self.dtype.na_value - else: - other = np.nan - else: - if self._mgr.nblocks == 1 and isinstance( - self._mgr.blocks[0].values.dtype, ExtensionDtype - ): - # FIXME: checking this is kludgy! - other = self._mgr.blocks[0].values.dtype.na_value - else: - # FIXME: the same problem we had with Series will now - # show up column-by-column! - other = np.nan - other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 60a5a8d9081bc..d5bf338ceef98 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1569,8 +1569,11 @@ def test_to_numpy_int_with_na(): data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - expected = np.array([1, np.nan]) - assert isinstance(result[0], float) + if using_pyarrow_strict_nans(): + expected = np.array([1, pd.NA], dtype=object) + else: + expected = np.array([1, np.nan]) + assert isinstance(result[0], float) tm.assert_numpy_array_equal(result, expected) From 57cbdaa274bf1144b0650bb6ed62f16a9db1032d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:21:09 -0700 Subject: [PATCH 19/39] update tests --- pandas/tests/extension/test_arrow.py | 5 ++++- pandas/tests/frame/methods/test_convert_dtypes.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d5bf338ceef98..a762f5eccfafa 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3537,7 +3537,10 @@ def test_cast_dictionary_different_value_dtype(arrow_type): def test_map_numeric_na_action(): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + if using_pyarrow_strict_nans(): + expected = pd.Series([42.0, 42.0, pd.NA], dtype="object") + else: + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ab847e2f8e81e..21f7811100d43 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_strict_nans + import pandas.util._test_decorators as td import pandas as pd @@ -73,6 +75,8 @@ def test_pyarrow_dtype_backend(self): } ) result = df.convert_dtypes(dtype_backend="pyarrow") + + item = None if not using_pyarrow_strict_nans() else np.nan expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( @@ -80,7 +84,7 @@ def test_pyarrow_dtype_backend(self): ), "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), - "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "d": pd.arrays.ArrowExtensionArray(pa.array([item, 100.5, 200.0])), "e": pd.arrays.ArrowExtensionArray( pa.array( [ From e3fc3892ba1faf910049e801735847242d4728a9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Jul 2025 14:32:52 -0700 Subject: [PATCH 20/39] Fix remaining tests --- pandas/core/arrays/base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bfa2309bb023a..013a10784cd5a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2539,6 +2539,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if result is not NotImplemented: return result + # TODO: putting this here is hacky as heck + if self.dtype == "float64[pyarrow]": + # e.g. test_log_arrow_backed_missing_value + new_inputs = [ + x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs + ] + return getattr(ufunc, method)(*new_inputs, **kwargs) + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) def map(self, mapper, na_action: Literal["ignore"] | None = None): From 4108cc07ce026377fd9f1ab50548e6b5d9ad5cea Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 07:49:33 -0700 Subject: [PATCH 21/39] mypy fixup --- pandas/_libs/missing.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi index 6c76fe49330b6..64256ae4b36ad 100644 --- a/pandas/_libs/missing.pyi +++ b/pandas/_libs/missing.pyi @@ -14,4 +14,4 @@ def isneginf_scalar(val: object) -> bool: ... def checknull(val: object) -> bool: ... def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ... def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ... -def is_pdna_or_none(values: npt.ndarray) -> npt.NDArray[np.bool_]: ... +def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ... From b220433325145b9145dfdf7e2d29389f06e397ec Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 12:59:54 -0700 Subject: [PATCH 22/39] old-numpy compat --- pandas/core/arrays/arrow/array.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 59ffce66602e5..b8bcafa24f003 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -564,12 +564,8 @@ def _box_pa_array( return pa_array mask = None - if getattr(value, "dtype", None) is None or value.dtype.kind not in "mMf": - try: - arr_value = np.asarray(value) - except ValueError: - # e.g. list dtype with mixed-length lists - arr_value = np.asarray(value, dtype=object) + if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf": + arr_value = np.asarray(value, dtype=object) # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like mask = is_pdna_or_none(arr_value) From 6ed24a0898081d947f8769effb920a5d64110b15 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 15:45:04 -0700 Subject: [PATCH 23/39] simplify --- pandas/core/arrays/arrow/array.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b8bcafa24f003..7d91ee316cbc5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -371,7 +371,6 @@ def _from_sequence_of_strings( scalars = strings.cast(pa.int64()) except pa.ArrowInvalid: pass - elif pa.types.is_time(pa_type): from pandas.core.tools.times import to_time @@ -399,18 +398,10 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") - if not pa.types.is_decimal(pa_type) and isinstance( - strings, (pa.Array, pa.ChunkedArray) - ): - # TODO: figure out why doing this cast breaks with decimal dtype - # in test_from_sequence_of_strings_pa_array - mask = strings.is_null() - scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) - # TODO: could we just do strings.cast(pa_type)? - elif isinstance(strings, (pa.Array, pa.ChunkedArray)): + if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings.cast(pa_type) elif mask is not None: - scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) + scalars = pa.array(scalars, mask=mask, type=pa_type) else: raise NotImplementedError( From 05d4a94f9866dd3f98a30bfe90bf35736c2daad2 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 09:36:32 -0700 Subject: [PATCH 24/39] Better option name, fixture --- pandas/_config/__init__.py | 4 +-- pandas/conftest.py | 7 +++++ pandas/core/arrays/_utils.py | 4 +-- pandas/core/arrays/arrow/array.py | 26 +++++++++++++++--- pandas/core/config_init.py | 7 ++--- pandas/tests/extension/test_arrow.py | 27 +++++++------------ .../frame/methods/test_convert_dtypes.py | 6 ++--- pandas/tests/series/methods/test_rank.py | 1 + 8 files changed, 50 insertions(+), 32 deletions(-) diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index fbf388224254f..ee709eff2eeae 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -35,6 +35,6 @@ def using_string_dtype() -> bool: return _mode_options["infer_string"] -def using_pyarrow_strict_nans() -> bool: +def is_nan_na() -> bool: _mode_options = _global_config["mode"] - return _mode_options["pyarrow_strict_nans"] + return _mode_options["nan_is_na"] diff --git a/pandas/conftest.py b/pandas/conftest.py index 774936be33631..d69c7e0113310 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -2116,3 +2116,10 @@ def temp_file(tmp_path): def monkeysession(): with pytest.MonkeyPatch.context() as mp: yield mp + + +@pytest.fixture(params=[True, False]) +def using_nan_is_na(request): + opt = request.param + with pd.option_context("mode.nan_is_na", opt): + yield opt diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 9adde3846ca03..e511b481887a9 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._config import using_pyarrow_strict_nans +from pandas._config import is_nan_na from pandas._libs import lib from pandas._libs.missing import NA @@ -41,7 +41,7 @@ def to_numpy_dtype_inference( else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] if na_value is lib.no_default: - if is_pyarrow and using_pyarrow_strict_nans(): + if is_pyarrow and not is_nan_na(): na_value = NA dtype = np.dtype(object) else: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7d91ee316cbc5..03be3a87b0e2e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,7 +16,7 @@ import numpy as np -from pandas._config import using_pyarrow_strict_nans +from pandas._config import is_nan_na from pandas._libs import lib from pandas._libs.missing import is_pdna_or_none @@ -35,6 +35,7 @@ from pandas.core.dtypes.cast import ( can_hold_element, + construct_1d_object_array_from_listlike, infer_dtype_from_scalar, ) from pandas.core.dtypes.common import ( @@ -555,7 +556,22 @@ def _box_pa_array( return pa_array mask = None - if getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf": + if is_nan_na(): + try: + arr_value = np.asarray(value) + if arr_value.ndim > 1: + # e.g. test_fixed_size_list we have list data. ndim > 1 + # means there were no scalar (NA) entries. + mask = np.zeros(len(value), dtype=np.bool_) + else: + mask = isna(arr_value) + except ValueError: + # Ragged data that numpy raises on + arr_value = construct_1d_object_array_from_listlike(value) + mask = isna(arr_value) + elif ( + getattr(value, "dtype", None) is None or value.dtype.kind not in "iumMf" + ): arr_value = np.asarray(value, dtype=object) # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like mask = is_pdna_or_none(arr_value) @@ -1490,7 +1506,9 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: original_na_value = na_value - dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) + dtype, na_value = to_numpy_dtype_inference( + self, dtype, na_value, self._hasna, is_pyarrow=True + ) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self @@ -1522,7 +1540,7 @@ def to_numpy( or ( original_na_value is lib.no_default and is_float_dtype(dtype) - and not using_pyarrow_strict_nans() + and is_nan_na() ) ) ): diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 02b600eb5fee4..26c4f7c080799 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -429,10 +429,11 @@ def is_terminal() -> bool: with cf.config_prefix("mode"): cf.register_option( - "pyarrow_strict_nans", - True, + "nan_is_na", + False, # TODO: Change this to False before merging - "Whether to make ArrowDtype arrays consistently treat NaN as distinct from NA", + "Whether to make ArrowDtype arrays consistently treat NaN as " + "interchangeable with pd.NA", validator=is_one_of_factory([True, False]), ) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a762f5eccfafa..6de8ebf8d03ad 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,8 +32,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_strict_nans - from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -278,17 +276,14 @@ def test_compare_scalar(self, data, comparison_op): self._compare_other(ser, data, comparison_op, data[0]) @pytest.mark.parametrize("na_action", [None, "ignore"]) - def test_map(self, data_missing, na_action): + def test_map(self, data_missing, na_action, using_nan_is_na): if data_missing.dtype.kind in "mM": result = data_missing.map(lambda x: x, na_action=na_action) expected = data_missing.to_numpy(dtype=object) tm.assert_numpy_array_equal(result, expected) else: result = data_missing.map(lambda x: x, na_action=na_action) - if ( - data_missing.dtype == "float32[pyarrow]" - and not using_pyarrow_strict_nans() - ): + if data_missing.dtype == "float32[pyarrow]" and using_nan_is_na: # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: @@ -705,7 +700,7 @@ def test_setitem_preserves_views(self, data): @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default]) @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, dtype_backend, request): + def test_EA_types(self, engine, data, dtype_backend, request, using_nan_is_na): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_decimal(pa_dtype): request.applymarker( @@ -726,7 +721,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - if using_pyarrow_strict_nans(): + if not using_nan_is_na: csv_output = df.to_csv(index=False, na_rep="NA") else: csv_output = df.to_csv(index=False, na_rep=np.nan) @@ -1543,7 +1538,7 @@ def test_astype_errors_ignore(): tm.assert_frame_equal(result, expected) -def test_to_numpy_with_defaults(data): +def test_to_numpy_with_defaults(data, using_nan_is_na): # GH49973 result = data.to_numpy() @@ -1555,21 +1550,19 @@ def test_to_numpy_with_defaults(data): else: expected = np.array(data._pa_array) - if data._hasna and ( - not is_numeric_dtype(data.dtype) or using_pyarrow_strict_nans() - ): + if data._hasna and (not is_numeric_dtype(data.dtype) or not using_nan_is_na): expected = expected.astype(object) expected[pd.isna(data)] = pd.NA tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_int_with_na(): +def test_to_numpy_int_with_na(using_nan_is_na): # GH51227: ensure to_numpy does not convert int to float data = [1, None] arr = pd.array(data, dtype="int64[pyarrow]") result = arr.to_numpy() - if using_pyarrow_strict_nans(): + if not using_nan_is_na: expected = np.array([1, pd.NA], dtype=object) else: expected = np.array([1, np.nan]) @@ -3534,10 +3527,10 @@ def test_cast_dictionary_different_value_dtype(arrow_type): assert result.dtypes.iloc[0] == data_type -def test_map_numeric_na_action(): +def test_map_numeric_na_action(using_nan_is_na): ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") result = ser.map(lambda x: 42, na_action="ignore") - if using_pyarrow_strict_nans(): + if not using_nan_is_na: expected = pd.Series([42.0, 42.0, pd.NA], dtype="object") else: expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 21f7811100d43..cd850f8019ea1 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_strict_nans - import pandas.util._test_decorators as td import pandas as pd @@ -61,7 +59,7 @@ def test_convert_dtypes_retain_column_names(self): tm.assert_index_equal(result.columns, df.columns) assert result.columns.name == "cols" - def test_pyarrow_dtype_backend(self): + def test_pyarrow_dtype_backend(self, using_nan_is_na): pa = pytest.importorskip("pyarrow") df = pd.DataFrame( { @@ -76,7 +74,7 @@ def test_pyarrow_dtype_backend(self): ) result = df.convert_dtypes(dtype_backend="pyarrow") - item = None if not using_pyarrow_strict_nans() else np.nan + item = None if using_nan_is_na else np.nan expected = pd.DataFrame( { "a": pd.arrays.ArrowExtensionArray( diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index e8c49dcce31e0..7d3aa8f171534 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -279,6 +279,7 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): dtype == "int64" or dtype == "int64[pyarrow]" or dtype == "uint64[pyarrow]" + or dtype == "float64[pyarrow]" or (not using_infer_string and dtype == "str") ): pytest.skip("int64/str does not support NaN") From cbc14d5988389211cd9949d10194fe096da37567 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 09:38:35 -0700 Subject: [PATCH 25/39] default True --- pandas/core/config_init.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 26c4f7c080799..a8014afb225bb 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -430,8 +430,7 @@ def is_terminal() -> bool: with cf.config_prefix("mode"): cf.register_option( "nan_is_na", - False, - # TODO: Change this to False before merging + True, "Whether to make ArrowDtype arrays consistently treat NaN as " "interchangeable with pd.NA", validator=is_one_of_factory([True, False]), From a238601f7c16e9448a3db744ea6f556e1f687854 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 11:15:58 -0700 Subject: [PATCH 26/39] Patch ops --- pandas/core/arrays/arrow/array.py | 47 +++++++++++++++++++++++++++- pandas/core/arrays/base.py | 8 ----- pandas/tests/extension/test_arrow.py | 23 ++++++++++++++ pandas/tests/series/test_npfuncs.py | 2 +- 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 03be3a87b0e2e..829931d04c7af 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -50,10 +50,16 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) from pandas.core.dtypes.missing import isna from pandas.core import ( algorithms as algos, + arraylike, missing, ops, roperator, @@ -752,6 +758,39 @@ def __array__( return self.to_numpy(dtype=dtype, copy=copy) + def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): + if any( + isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs + ): + return NotImplemented + + result = arraylike.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if "out" in kwargs: + return arraylike.dispatch_ufunc_with_out( + self, ufunc, method, *inputs, **kwargs + ) + + if method == "reduce": + result = arraylike.dispatch_reduction_ufunc( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + if self.dtype.kind == "f": + # e.g. test_log_arrow_backed_missing_value + new_inputs = [ + x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs + ] + return getattr(ufunc, method)(*new_inputs, **kwargs) + + return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + def __invert__(self) -> Self: # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): @@ -923,7 +962,13 @@ def _logical_method(self, other, op) -> Self: return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) def _arith_method(self, other, op) -> Self: - return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) + result = self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) + if is_nan_na() and result.dtype.kind == "f": + parr = result._pa_array + mask = pc.is_nan(parr).to_numpy() + arr = pc.replace_with_mask(parr, mask, pa.scalar(None, type=parr.type)) + result = type(self)(arr) + return result def equals(self, other) -> bool: if not isinstance(other, ArrowExtensionArray): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 013a10784cd5a..bfa2309bb023a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2539,14 +2539,6 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): if result is not NotImplemented: return result - # TODO: putting this here is hacky as heck - if self.dtype == "float64[pyarrow]": - # e.g. test_log_arrow_backed_missing_value - new_inputs = [ - x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs - ] - return getattr(ufunc, method)(*new_inputs, **kwargs) - return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) def map(self, mapper, na_action: Literal["ignore"] | None = None): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6de8ebf8d03ad..40d60bdc3418a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3589,3 +3589,26 @@ def test_timestamp_dtype_matches_to_datetime(): expected = pd.Series([ts], dtype=dtype1).convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) + + +def test_ops_with_nan_is_na(using_nan_is_na): + # GH#61732 + ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]") + + result = ser - np.nan + if using_nan_is_na: + assert result.isna().all() + else: + assert not result.isna().any() + + result = ser * np.nan + if using_nan_is_na: + assert result.isna().all() + else: + assert not result.isna().any() + + result = ser / 0 + if using_nan_is_na: + assert result.isna()[1] + else: + assert not result.isna()[1] diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py index 11a51c4700d5c..a681420ea6b38 100644 --- a/pandas/tests/series/test_npfuncs.py +++ b/pandas/tests/series/test_npfuncs.py @@ -38,7 +38,7 @@ def test_numpy_argwhere(index): @td.skip_if_no("pyarrow") -def test_log_arrow_backed_missing_value(): +def test_log_arrow_backed_missing_value(using_nan_is_na): # GH#56285 ser = Series([1, 2, None], dtype="float64[pyarrow]") result = np.log(ser) From 3f15ca86d98f8268a37b70b5003d013469b78c4f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 11:18:18 -0700 Subject: [PATCH 27/39] mypy fixup --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 829931d04c7af..68076fefd9a65 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -580,7 +580,7 @@ def _box_pa_array( ): arr_value = np.asarray(value, dtype=object) # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like - mask = is_pdna_or_none(arr_value) + mask = is_pdna_or_none(arr_value) # type: ignore[assignment] try: pa_array = pa.array(value, type=pa_type, mask=mask) From a5d3848f6a221997a1ae3a1ae9384f985dfc2ffc Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 11:34:24 -0700 Subject: [PATCH 28/39] Test for setitem/construction --- pandas/core/arrays/arrow/array.py | 2 +- pandas/io/json/_json.py | 20 +++++++---------- pandas/tests/extension/test_arrow.py | 32 ++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 68076fefd9a65..d8cf2f23f0c7d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -452,7 +452,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """ if isinstance(value, pa.Scalar): pa_scalar = value - elif isna(value) and not lib.is_float(value): + elif isna(value) and not (lib.is_float(value) and not is_nan_na()): pa_scalar = pa.scalar(None, type=pa_type) else: # Workaround https://github.com/apache/arrow/issues/37291 diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 53a10c7a680f6..1c79f24a9fd96 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -18,6 +18,8 @@ import numpy as np +from pandas._config import option_context + from pandas._libs import lib from pandas._libs.json import ( ujson_dumps, @@ -994,16 +996,13 @@ def _read_ujson(self) -> DataFrame | Series: else: obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: - if self.dtype_backend == "pyarrow": + with option_context("mode.nan_is_na", True): # The construction above takes "null" to NaN, which we want to # convert to NA. But .convert_dtypes to pyarrow doesn't allow # that, so we do a 2-step conversion through numpy-nullable. - obj = obj.convert_dtypes( - infer_objects=False, dtype_backend="numpy_nullable" + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend ) - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) else: return obj @@ -1078,16 +1077,13 @@ def __next__(self) -> DataFrame | Series: raise ex if self.dtype_backend is not lib.no_default: - if self.dtype_backend == "pyarrow": + with option_context("mode.nan_is_na", True): # The construction above takes "null" to NaN, which we want to # convert to NA. But .convert_dtypes to pyarrow doesn't allow # that, so we do a 2-step conversion through numpy-nullable. - obj = obj.convert_dtypes( - infer_objects=False, dtype_backend="numpy_nullable" + return obj.convert_dtypes( + infer_objects=False, dtype_backend=self.dtype_backend ) - return obj.convert_dtypes( - infer_objects=False, dtype_backend=self.dtype_backend - ) else: return obj diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 40d60bdc3418a..4d56edfa9ffae 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3612,3 +3612,35 @@ def test_ops_with_nan_is_na(using_nan_is_na): assert result.isna()[1] else: assert not result.isna()[1] + + +def test_setitem_float_nan_is_na(using_nan_is_na): + # GH#61732 + import pyarrow as pa + + ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]") + + if using_nan_is_na: + ser[1] = np.nan + assert ser.isna()[1] + else: + msg = "Could not convert nan with type float: tried to convert to int64" + with pytest.raises(pa.lib.ArrowInvalid, match=msg): + ser[1] = np.nan + + ser = pd.Series([-1, np.nan, 1], dtype="float64[pyarrow]") + if using_nan_is_na: + assert ser.isna()[1] + assert ser[1] is pd.NA + + ser[1] = np.nan + assert ser[1] is pd.NA + + else: + assert not ser.isna()[1] + assert isinstance(ser[1], float) + assert np.isnan(ser[1]) + + ser[2] = np.nan + assert isinstance(ser[2], float) + assert np.isnan(ser[2]) From 670a940f6619f6bac33307550bab5be16dc4220b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 13:28:57 -0700 Subject: [PATCH 29/39] update ufunc test --- pandas/core/arrays/arrow/array.py | 39 ----------------------------- pandas/tests/series/test_npfuncs.py | 12 ++++++--- 2 files changed, 9 insertions(+), 42 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d8cf2f23f0c7d..7aeeefbe2913a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -50,16 +50,10 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndex, - ABCSeries, -) from pandas.core.dtypes.missing import isna from pandas.core import ( algorithms as algos, - arraylike, missing, ops, roperator, @@ -758,39 +752,6 @@ def __array__( return self.to_numpy(dtype=dtype, copy=copy) - def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): - if any( - isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs - ): - return NotImplemented - - result = arraylike.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - if "out" in kwargs: - return arraylike.dispatch_ufunc_with_out( - self, ufunc, method, *inputs, **kwargs - ) - - if method == "reduce": - result = arraylike.dispatch_reduction_ufunc( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - if self.dtype.kind == "f": - # e.g. test_log_arrow_backed_missing_value - new_inputs = [ - x if x is not self else x.to_numpy(na_value=np.nan) for x in inputs - ] - return getattr(ufunc, method)(*new_inputs, **kwargs) - - return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) - def __invert__(self) -> Self: # This is a bit wise op for integer types if pa.types.is_integer(self._pa_array.type): diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py index a681420ea6b38..b72ac8efbaa6d 100644 --- a/pandas/tests/series/test_npfuncs.py +++ b/pandas/tests/series/test_npfuncs.py @@ -41,6 +41,12 @@ def test_numpy_argwhere(index): def test_log_arrow_backed_missing_value(using_nan_is_na): # GH#56285 ser = Series([1, 2, None], dtype="float64[pyarrow]") - result = np.log(ser) - expected = np.log(Series([1, 2, None], dtype="float64")) - tm.assert_series_equal(result, expected) + if using_nan_is_na: + result = np.log(ser) + expected = np.log(Series([1, 2, None], dtype="float64")) + tm.assert_series_equal(result, expected) + else: + # we get cast to object which raises + msg = "loop of ufunc does not support argument" + with pytest.raises(TypeError, match=msg): + np.log(ser) From 3a032a485286e87ef612e51127ffe91289740d3b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 31 Jul 2025 13:55:06 -0700 Subject: [PATCH 30/39] Improve rank test skips --- pandas/tests/series/methods/test_rank.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 7d3aa8f171534..7d96f7f862fce 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -273,20 +273,20 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): + def test_rank_tie_methods( + self, ser, results, dtype, using_infer_string, using_nan_is_na + ): method, exp = results if ( dtype == "int64" - or dtype == "int64[pyarrow]" - or dtype == "uint64[pyarrow]" - or dtype == "float64[pyarrow]" + or (dtype in ["int64[pyarrow]", "uint64[pyarrow]"] and not using_nan_is_na) or (not using_infer_string and dtype == "str") ): pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - if dtype == "float64[pyarrow]": + if dtype == "float64[pyarrow]" and not using_nan_is_na: # the NaNs are not treated as NA exp = exp.copy() if method == "average": @@ -418,11 +418,13 @@ def test_rank_dense_method(self, dtype, ser, exp): expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype, using_infer_string): + def test_rank_descending( + self, ser, results, dtype, using_infer_string, using_nan_is_na + ): method, _ = results if ( dtype == "int64" - or dtype == "int64[pyarrow]" + or (dtype in ["int64[pyarrow]"] and not using_nan_is_na) or (not using_infer_string and dtype == "str") ): s = ser.dropna().astype(dtype) From c59b9de0d4978b49364f03ec78cd11a8170b724d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 08:35:37 -0700 Subject: [PATCH 31/39] ENH: mode.nan_is_na for numpy-nullable dtypes --- doc/source/user_guide/text.rst | 2 +- pandas/_libs/parsers.pyx | 5 +- pandas/core/algorithms.py | 11 +++- pandas/core/arrays/_utils.py | 3 +- pandas/core/arrays/arrow/array.py | 21 ++++--- pandas/core/arrays/masked.py | 15 +++-- pandas/core/arrays/numeric.py | 25 +++++++- pandas/core/config_init.py | 8 +-- pandas/core/dtypes/cast.py | 15 ++++- pandas/core/indexes/base.py | 10 ++++ pandas/core/internals/construction.py | 29 ++++++--- pandas/io/json/_json.py | 6 -- pandas/io/json/_table_schema.py | 5 +- pandas/io/parsers/arrow_parser_wrapper.py | 4 +- .../tests/arrays/floating/test_arithmetic.py | 30 ++++++---- .../tests/arrays/floating/test_comparison.py | 12 +++- .../arrays/floating/test_construction.py | 13 +++- pandas/tests/arrays/floating/test_contains.py | 7 ++- pandas/tests/arrays/floating/test_function.py | 46 +++++++++++---- pandas/tests/arrays/floating/test_to_numpy.py | 22 +++++-- .../tests/arrays/integer/test_arithmetic.py | 37 ++++++++---- .../tests/arrays/integer/test_construction.py | 46 ++++++++++++--- pandas/tests/arrays/integer/test_function.py | 59 ++++++++++++++----- pandas/tests/arrays/integer/test_reduction.py | 4 +- .../arrays/interval/test_interval_pyarrow.py | 2 +- pandas/tests/arrays/masked/test_function.py | 15 ++--- pandas/tests/arrays/string_/test_string.py | 3 +- pandas/tests/base/test_conversion.py | 5 +- pandas/tests/base/test_unique.py | 8 ++- pandas/tests/extension/base/interface.py | 18 +++++- pandas/tests/extension/test_masked.py | 16 +++-- pandas/tests/frame/methods/test_astype.py | 12 +++- pandas/tests/frame/methods/test_replace.py | 4 +- pandas/tests/frame/test_reductions.py | 12 +++- pandas/tests/groupby/methods/test_quantile.py | 6 +- pandas/tests/groupby/test_reductions.py | 5 +- .../tests/indexes/multi/test_constructors.py | 2 +- pandas/tests/indexes/numeric/test_indexing.py | 53 +++++++++++------ pandas/tests/indexing/test_iloc.py | 11 ++-- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/reshape/test_cut.py | 4 +- .../series/accessors/test_dt_accessor.py | 4 +- pandas/tests/series/methods/test_case_when.py | 3 +- pandas/tests/series/methods/test_clip.py | 13 ++-- .../series/methods/test_convert_dtypes.py | 11 ++++ pandas/tests/series/methods/test_rank.py | 28 +++++++-- 46 files changed, 480 insertions(+), 192 deletions(-) diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 3bb151a2dd339..11d5ab86e76ef 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -75,7 +75,7 @@ or convert from existing pandas data: .. ipython:: python - s1 = pd.Series([1, 2, np.nan], dtype="Int64") + s1 = pd.Series([1, 2, pd.NA], dtype="Int64") s1 s2 = s1.astype("string") s2 diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1f5813940c058..a25fedc8d33f4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -8,6 +8,8 @@ from csv import ( ) import warnings +from pandas._config import is_nan_na + from pandas.util._exceptions import find_stack_level from pandas import StringDtype @@ -43,7 +45,6 @@ from libc.string cimport ( strncpy, ) - import numpy as np cimport numpy as cnp @@ -1461,7 +1462,7 @@ def _maybe_upcast( if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy(na_value=None) - arr = ArrowExtensionArray(pa.array(arr)) + arr = ArrowExtensionArray(pa.array(arr, from_pandas=is_nan_na())) return arr diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 533b9b689af0b..c14ab2bc02da2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1065,7 +1065,16 @@ def rank( (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ is_datetimelike = needs_i8_conversion(values.dtype) - values = _ensure_data(values) + if ( + isinstance(values.dtype, BaseMaskedDtype) + and values._hasna + and values.dtype.kind in "iuf" + ): + # e.g. test_rank_ea_small_values + # TODO: bug in the object-dtype path that we would get without this special casting. + values = values.to_numpy(dtype=np.float64, na_value=np.nan) + else: + values = _ensure_data(values) if values.ndim == 1: ranks = algos.rank_1d( diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index e511b481887a9..67ce39de75b20 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -28,7 +28,6 @@ def to_numpy_dtype_inference( dtype: npt.DTypeLike | None, na_value, hasna: bool, - is_pyarrow: bool = True, ) -> tuple[npt.DTypeLike, Any]: if dtype is None and is_numeric_dtype(arr.dtype): dtype_given = False @@ -41,7 +40,7 @@ def to_numpy_dtype_inference( else: dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] if na_value is lib.no_default: - if is_pyarrow and not is_nan_na(): + if not is_nan_na(): na_value = NA dtype = np.dtype(object) else: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7aeeefbe2913a..8ec3d37236b17 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -352,9 +352,7 @@ def _from_sequence_of_strings( from pandas.core.tools.datetimes import to_datetime scalars = to_datetime(strings, errors="raise").date - - scalars = pa.array(scalars, mask=mask.view(bool), type=pa_type) - + scalars = pa.array(scalars, type=pa_type, mask=mask) elif pa.types.is_duration(pa_type): from pandas.core.tools.timedeltas import to_timedelta @@ -965,7 +963,10 @@ def __len__(self) -> int: def __contains__(self, key) -> bool: # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 if isna(key) and key is not self.dtype.na_value: - if self.dtype.kind == "f" and lib.is_float(key): + if lib.is_float(key) and is_nan_na(): + return self.dtype.na_value in self + elif self.dtype.kind == "f" and lib.is_float(key): + # Check specifically for NaN return pc.any(pc.is_nan(self._pa_array)).as_py() # e.g. date or timestamp types we do not allow None here to match pd.NA @@ -1512,9 +1513,7 @@ def to_numpy( na_value: object = lib.no_default, ) -> np.ndarray: original_na_value = na_value - dtype, na_value = to_numpy_dtype_inference( - self, dtype, na_value, self._hasna, is_pyarrow=True - ) + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna) pa_type = self._pa_array.type if not self._hasna or isna(na_value) or pa.types.is_null(pa_type): data = self @@ -2073,7 +2072,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") chunks = [ *self._pa_array[:key].chunks, - pa.array([value], type=self._pa_array.type), + pa.array([value], type=self._pa_array.type, from_pandas=is_nan_na()), *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() @@ -2127,7 +2126,7 @@ def _rank_calc( pa_type = pa.float64() else: pa_type = pa.uint64() - result = pa.array(ranked, type=pa_type) + result = pa.array(ranked, type=pa_type, from_pandas=is_nan_na()) return result data = self._pa_array.combine_chunks() @@ -2379,7 +2378,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: right, right_type = _to_numpy_and_type(right) pa_type = left_type or right_type result = np.where(cond, left, right) - return pa.array(result, type=pa_type) + return pa.array(result, type=pa_type, from_pandas=is_nan_na()) @classmethod def _replace_with_mask( @@ -2423,7 +2422,7 @@ def _replace_with_mask( result = np.array(values, dtype=object) result[mask] = replacements - return pa.array(result, type=values.type) + return pa.array(result, type=values.type, from_pandas=is_nan_na()) # ------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7f924db0dcc3b..4005138b54850 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -11,6 +11,8 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import ( algos as libalgos, lib, @@ -310,7 +312,9 @@ def __setitem__(self, key, value) -> None: def __contains__(self, key) -> bool: if isna(key) and key is not self.dtype.na_value: # GH#52840 - if self._data.dtype.kind == "f" and lib.is_float(key): + if lib.is_float(key) and is_nan_na(): + key = self.dtype.na_value + elif self._data.dtype.kind == "f" and lib.is_float(key): return bool((np.isnan(self._data) & ~self._mask).any()) return bool(super().__contains__(key)) @@ -497,9 +501,7 @@ def to_numpy( array([ True, False, False]) """ hasna = self._hasna - dtype, na_value = to_numpy_dtype_inference( - self, dtype, na_value, hasna, is_pyarrow=False - ) + dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) if dtype is None: dtype = object @@ -670,6 +672,8 @@ def reconstruct(x: np.ndarray): # reached in e.g. np.sqrt on BooleanArray # we don't support float16 x = x.astype(np.float32) + if is_nan_na(): + m[np.isnan(x)] = True return FloatingArray(x, m) else: x[mask] = np.nan @@ -875,6 +879,9 @@ def _maybe_mask_result( if result.dtype.kind == "f": from pandas.core.arrays import FloatingArray + if is_nan_na(): + mask[np.isnan(result)] = True + return FloatingArray(result, mask, copy=False) elif result.dtype.kind == "b": diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index f319a3cc05575..27ff4b7563ba9 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -8,6 +8,8 @@ import numpy as np +from pandas._config import is_nan_na + from pandas._libs import ( lib, missing as libmissing, @@ -101,6 +103,8 @@ def __from_arrow__( array = array.combine_chunks() data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype) + if data.dtype.kind == "f" and is_nan_na(): + mask[np.isnan(data)] = False return array_class(data.copy(), ~mask, copy=False) @classmethod @@ -195,9 +199,21 @@ def _coerce_to_data_and_mask( elif values.dtype.kind == "f": # np.isnan is faster than is_numeric_na() for floats # github issue: #60066 - mask = np.isnan(values) + if is_nan_na(): + mask = np.isnan(values) + else: + mask = np.zeros(len(values), dtype=np.bool_) + if dtype_cls.__name__.strip("_").startswith(("I", "U")): + wrong = np.isnan(values) + if wrong.any(): + raise ValueError("Cannot cast NaN value to Integer dtype.") else: - mask = libmissing.is_numeric_na(values) + if is_nan_na(): + mask = libmissing.is_numeric_na(values) + else: + # is_numeric_na will raise on non-numeric NAs + libmissing.is_numeric_na(values) + mask = libmissing.is_pdna_or_none(values) else: assert len(mask) == len(values) @@ -236,7 +252,6 @@ def _coerce_to_data_and_mask( values = values.astype(dtype, copy=copy) else: values = dtype_cls._safe_cast(values, dtype, copy=False) - return values, mask, dtype, inferred_type @@ -265,6 +280,10 @@ def __init__( # If we don't raise here, then accessing self.dtype would raise raise TypeError("FloatingArray does not support np.float16 dtype.") + # NB: if is_nan_na() is True + # then caller is responsible for ensuring + # assert mask[np.isnan(values)].all() + super().__init__(values, mask, copy=copy) @cache_readonly diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a8014afb225bb..1478380d90a7d 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -427,12 +427,12 @@ def is_terminal() -> bool: validator=is_one_of_factory([True, False, "warn"]), ) -with cf.config_prefix("mode"): cf.register_option( "nan_is_na", - True, - "Whether to make ArrowDtype arrays consistently treat NaN as " - "interchangeable with pd.NA", + os.environ.get("PANDAS_NAN_IS_NA", 0) == "1", + "Whether to treat NaN entries as interchangeable with pd.NA in " + "numpy-nullable and pyarrow float dtypes. See discussion in " + "https://github.com/pandas-dev/pandas/issues/32265", validator=is_one_of_factory([True, False]), ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 20fe9b92b4677..a8ff49fac543d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -18,7 +18,10 @@ import numpy as np -from pandas._config import using_string_dtype +from pandas._config import ( + is_nan_na, + using_string_dtype, +) from pandas._libs import ( Interval, @@ -1053,7 +1056,10 @@ def convert_dtypes( elif input_array.dtype.kind in "fcb": # TODO: de-dup with maybe_cast_to_integer_array? arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): + if len(arr) < len(input_array) and not is_nan_na(): + # In the presence of NaNs, we cannot convert to IntegerDtype + pass + elif (arr.astype(int) == arr).all(): inferred_dtype = target_int_dtype else: inferred_dtype = input_array.dtype @@ -1077,7 +1083,10 @@ def convert_dtypes( if convert_integer: # TODO: de-dup with maybe_cast_to_integer_array? arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): + if len(arr) < len(input_array) and not is_nan_na(): + # In the presence of NaNs, we can't convert to IntegerDtype + inferred_dtype = inferred_float_dtype + elif (arr.astype(int) == arr).all(): inferred_dtype = pandas_dtype_func("Int64") else: inferred_dtype = inferred_float_dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8c5a03a6de50..be1c8365e640c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -21,6 +21,7 @@ from pandas._config import ( get_option, + is_nan_na, using_string_dtype, ) @@ -161,6 +162,7 @@ ExtensionArray, TimedeltaArray, ) +from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.string_ import ( StringArray, StringDtype, @@ -6575,6 +6577,14 @@ def _maybe_cast_indexer(self, key): If we have a float key and are not a floating index, then try to cast to an int if equivalent. """ + if ( + is_float(key) + and np.isnan(key) + and isinstance(self.dtype, FloatingDtype) + and is_nan_na() + ): + # TODO: better place to do this? + key = self.dtype.na_value return key def _maybe_cast_listlike_indexer(self, target) -> Index: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 35de97d570bd3..7de508e5a30bc 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -17,6 +17,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas._libs.missing import NA from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( @@ -34,7 +35,10 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -364,7 +368,11 @@ def dict_to_mgr( if columns is not None: columns = ensure_index(columns) - arrays = [np.nan] * len(columns) + if dtype is not None and not isinstance(dtype, np.dtype): + # e.g. test_dataframe_from_dict_of_series + arrays = [NA] * len(columns) + else: + arrays = [np.nan] * len(columns) midxs = set() data_keys = ensure_index(data.keys()) # type: ignore[arg-type] data_values = list(data.values()) @@ -414,12 +422,14 @@ def dict_to_mgr( arrays = [ x.copy() if isinstance(x, ExtensionArray) - else x.copy(deep=True) - if ( - isinstance(x, Index) - or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) + else ( + x.copy(deep=True) + if ( + isinstance(x, Index) + or (isinstance(x, ABCSeries) and is_1d_only_ea_dtype(x.dtype)) + ) + else x ) - else x for x in arrays ] @@ -949,10 +959,13 @@ def convert_object_array( def convert(arr): if dtype != np.dtype("O"): + # e.g. if dtype is UInt32 then we want to cast Nones to NA instead of + # NaN in maybe_convert_objects. + to_nullable = dtype_backend != "numpy" or isinstance(dtype, BaseMaskedDtype) arr = lib.maybe_convert_objects( arr, try_float=coerce_float, - convert_to_nullable_dtype=dtype_backend != "numpy", + convert_to_nullable_dtype=to_nullable, ) # Notes on cases that get here 2023-02-15 # 1) we DO get here when arr is all Timestamps and dtype=None diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 1c79f24a9fd96..408a2f290c477 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -997,9 +997,6 @@ def _read_ujson(self) -> DataFrame | Series: obj = self._get_object_parser(self.data) if self.dtype_backend is not lib.no_default: with option_context("mode.nan_is_na", True): - # The construction above takes "null" to NaN, which we want to - # convert to NA. But .convert_dtypes to pyarrow doesn't allow - # that, so we do a 2-step conversion through numpy-nullable. return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) @@ -1078,9 +1075,6 @@ def __next__(self) -> DataFrame | Series: if self.dtype_backend is not lib.no_default: with option_context("mode.nan_is_na", True): - # The construction above takes "null" to NaN, which we want to - # convert to NA. But .convert_dtypes to pyarrow doesn't allow - # that, so we do a 2-step conversion through numpy-nullable. return obj.convert_dtypes( infer_objects=False, dtype_backend=self.dtype_backend ) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index feca60c6e28a2..5510036e542f5 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -13,6 +13,8 @@ ) import warnings +from pandas._config import option_context + from pandas._libs import lib from pandas._libs.json import ujson_loads from pandas._libs.tslibs import timezones @@ -384,7 +386,8 @@ def parse_table_schema(json, precise_float: bool) -> DataFrame: 'table="orient" can not yet read ISO-formatted Timedelta data' ) - df = df.astype(dtypes) + with option_context("mode.nan_is_na", True): + df = df.astype(dtypes) if "primaryKey" in table["schema"]: df = df.set_index(table["schema"]["primaryKey"]) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 09759d4127ac8..d48e888ae3838 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -19,9 +19,7 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ( - BaseMaskedDtype, -) +from pandas.core.dtypes.dtypes import BaseMaskedDtype from pandas.core.dtypes.inference import is_integer from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 777099e76fc73..e4e26383ae42c 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -35,21 +35,24 @@ def test_array_op(dtype, opname, exp): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(dtype, zero, negative): +def test_divide_by_zero(dtype, zero, negative, using_nan_is_na): # TODO pending NA/NaN discussion # https://github.com/pandas-dev/pandas/issues/32265/ a = pd.array([0, 1, -1, None], dtype=dtype) result = a / zero + exp_mask = np.array([False, False, False, True]) + if using_nan_is_na: + exp_mask[[0, -1]] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype), - np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(dtype): +def test_pow_scalar(dtype, using_nan_is_na): a = pd.array([-1, 0, 1, None, 2], dtype=dtype) result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) @@ -64,11 +67,14 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = a**np.nan - # TODO np.nan should be converted to pd.NA / missing before operation? - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), - mask=a._mask, - ) + if using_nan_is_na: + expected = pd.array([None, None, 1, None, None], dtype=dtype) + else: + # TODO np.nan should be converted to pd.NA / missing before operation? + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -87,9 +93,11 @@ def test_pow_scalar(dtype): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask - ) + if not using_nan_is_na: + # Otherwise the previous `expected` can be reused + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py index a429649f1ce1d..0990757964267 100644 --- a/pandas/tests/arrays/floating/test_comparison.py +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -38,11 +38,15 @@ def test_equals(): assert a1.equals(a2) is False -def test_equals_nan_vs_na(): +def test_equals_nan_vs_na(using_nan_is_na): # GH#44382 mask = np.zeros(3, dtype=bool) data = np.array([1.0, np.nan, 3.0], dtype=np.float64) + if using_nan_is_na: + # Under PDEP16, all callers of the FloatingArray constructor should + # ensure that mask[np.isnan(data)] = True + mask[1] = True left = FloatingArray(data, mask) assert left.equals(left) @@ -57,7 +61,11 @@ def test_equals_nan_vs_na(): assert right.equals(right) tm.assert_extension_array_equal(right, right) - assert not left.equals(right) + if not using_nan_is_na: + assert not left.equals(right) + else: + # the constructor will set the NaN locations to NA + assert left.equals(right) # with mask[1] = True, the only difference is data[1], which should # not matter for equals diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index e1d237205a753..9c383efa3216c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -85,9 +85,12 @@ def test_to_array(): ([np.nan], [pd.NA]), ], ) -def test_to_array_none_is_nan(a, b): +def test_to_array_none_is_nan(a, b, using_nan_is_na): result = pd.array(a, dtype="Float64") expected = pd.array(b, dtype="Float64") + if not using_nan_is_na and a[-1] is np.nan: + assert np.isnan(result[-1]) + expected._mask[-1] = False tm.assert_extension_array_equal(result, expected) @@ -189,13 +192,17 @@ def test_to_array_bool(bool_values, values, target_dtype, expected_dtype): tm.assert_extension_array_equal(result, expected) -def test_series_from_float(data): +def test_series_from_float(data, using_nan_is_na): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + np_res = data.to_numpy(na_value=np.nan, dtype="float") + if not using_nan_is_na: + np_res = np_res.astype(object) + np_res[data.isna()] = pd.NA + result = pd.Series(np_res, dtype=str(dtype)) tm.assert_series_equal(result, expected) # from list diff --git a/pandas/tests/arrays/floating/test_contains.py b/pandas/tests/arrays/floating/test_contains.py index 956642697bf32..5dff4b803d87d 100644 --- a/pandas/tests/arrays/floating/test_contains.py +++ b/pandas/tests/arrays/floating/test_contains.py @@ -3,10 +3,13 @@ import pandas as pd -def test_contains_nan(): +def test_contains_nan(using_nan_is_na): # GH#52840 arr = pd.array(range(5)) / 0 assert np.isnan(arr._data[0]) - assert not arr.isna()[0] + if using_nan_is_na: + assert arr.isna()[0] + else: + assert not arr.isna()[0] assert np.nan in arr diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index dffb2a1f6e1f5..e03e8f30197b9 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -10,10 +10,13 @@ @pytest.mark.parametrize("ufunc", [np.abs, np.sign]) # np.sign emits a warning with nans, @pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") -def test_ufuncs_single(ufunc): +def test_ufuncs_single(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Float64") result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) @@ -23,45 +26,66 @@ def test_ufuncs_single(ufunc): @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): +def test_ufuncs_single_float(ufunc, using_nan_is_na): a = pd.array([1.0, 0.2, 3.0, pd.NA], dtype="Float64") with np.errstate(invalid="ignore"): result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) with np.errstate(invalid="ignore"): result = ufunc(s) - expected = pd.Series(ufunc(s.astype(float)), dtype="Float64") + np_res = ufunc(s.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.Series(np_res, dtype="Float64") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ufunc", [np.add, np.subtract]) -def test_ufuncs_binary_float(ufunc): +def test_ufuncs_binary_float(ufunc, using_nan_is_na): # two FloatingArrays a = pd.array([1, 0.2, -3, pd.NA], dtype="Float64") result = ufunc(a, a) - expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64") + np_res = ufunc(a.astype(float), a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) # FloatingArray with numpy array arr = np.array([1, 2, 3, 4]) result = ufunc(a, arr) - expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64") + np_res = ufunc(a.astype(float), arr) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) - expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64") + np_res = ufunc(arr, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) # FloatingArray with scalar result = ufunc(a, 1) - expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64") + np_res = ufunc(a.astype(float), 1) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) result = ufunc(1, a) - expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64") + np_res = ufunc(1, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Float64") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index e954cecba417a..fc9e260923d32 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -7,18 +7,23 @@ @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) -def test_to_numpy(box): +def test_to_numpy(box, using_nan_is_na): con = pd.Series if box else pd.array # default (with or without missing values) -> object dtype arr = con([0.1, 0.2, 0.3], dtype="Float64") result = arr.to_numpy() expected = np.array([0.1, 0.2, 0.3], dtype="float64") + # TODO: should this be object with `not using_nan_is_na` to avoid + # values-dependent behavior? tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, np.nan], dtype="float64") + if using_nan_is_na: + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + else: + expected = np.array([0.1, 0.2, pd.NA], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -81,11 +86,18 @@ def test_to_numpy_na_value(box): tm.assert_numpy_array_equal(result, expected) -def test_to_numpy_na_value_with_nan(): +def test_to_numpy_na_value_with_nan(using_nan_is_na): # array with both NaN and NA -> only fill NA with `na_value` - arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + mask = np.array([False, False, True]) + if using_nan_is_na: + mask[1] = True + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), mask) result = arr.to_numpy(dtype="float64", na_value=-1) - expected = np.array([0.0, np.nan, -1.0], dtype="float64") + if using_nan_is_na: + # the NaN passed to the constructor is considered as NA + expected = np.array([0.0, -1.0, -1.0], dtype="float64") + else: + expected = np.array([0.0, np.nan, -1.0], dtype="float64") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index aeceb9b8a3cb1..e16ab6f23b417 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -52,13 +52,16 @@ def test_div(dtype): @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) -def test_divide_by_zero(zero, negative): +def test_divide_by_zero(zero, negative, using_nan_is_na): # https://github.com/pandas-dev/pandas/issues/27398, GH#22793 a = pd.array([0, 1, -1, None], dtype="Int64") result = a / zero + exp_mask = np.array([False, False, False, True]) + if using_nan_is_na: + exp_mask[0] = True expected = FloatingArray( np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"), - np.array([False, False, False, True]), + exp_mask, ) if negative: expected *= -1 @@ -99,7 +102,7 @@ def test_mod(dtype): tm.assert_extension_array_equal(result, expected) -def test_pow_scalar(): +def test_pow_scalar(using_nan_is_na): a = pd.array([-1, 0, 1, None, 2], dtype="Int64") result = a**0 expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") @@ -114,10 +117,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = a**np.nan - expected = FloatingArray( - np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), - np.array([False, False, False, True, False]), - ) + if using_nan_is_na: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"), + np.array([False, False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) # reversed @@ -136,10 +142,13 @@ def test_pow_scalar(): tm.assert_extension_array_equal(result, expected) result = np.nan**a - expected = FloatingArray( - np.array([1, np.nan, np.nan, np.nan], dtype="float64"), - np.array([False, False, True, False]), - ) + if using_nan_is_na: + expected = expected.astype("Float64") + else: + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype="float64"), + np.array([False, False, True, False]), + ) tm.assert_extension_array_equal(result, expected) @@ -212,7 +221,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): # TODO test unsigned overflow -def test_arith_coerce_scalar(data, all_arithmetic_operators): +def test_arith_coerce_scalar(data, all_arithmetic_operators, using_nan_is_na): op = tm.get_op_from_name(all_arithmetic_operators) s = pd.Series(data) other = 0.01 @@ -220,9 +229,11 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): result = op(s, other) expected = op(s.astype(float), other) expected = expected.astype("Float64") + if not using_nan_is_na: + expected[s.isna()] = pd.NA # rmod results in NaN that wasn't NA in original nullable Series -> unmask it - if all_arithmetic_operators == "__rmod__": + if all_arithmetic_operators == "__rmod__" and not using_nan_is_na: mask = (s == 0).fillna(False).to_numpy(bool) expected.array._mask[mask] = False diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 8eaa9ace027c9..ab52fbec45f79 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -26,14 +26,20 @@ def test_uses_pandas_na(): assert a[1] is pd.NA -def test_from_dtype_from_float(data): +def test_from_dtype_from_float(data, using_nan_is_na): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) - tm.assert_series_equal(result, expected) + arr = data.to_numpy(na_value=np.nan, dtype="float") + if using_nan_is_na: + result = pd.Series(arr, dtype=str(dtype)) + tm.assert_series_equal(result, expected) + else: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + pd.Series(arr, dtype=str(dtype)) # from int / list expected = pd.Series(data) @@ -116,10 +122,15 @@ def test_integer_array_constructor_copy(): ([np.nan, np.nan], [np.nan, np.nan]), ], ) -def test_to_integer_array_none_is_nan(a, b): - result = pd.array(a, dtype="Int64") - expected = pd.array(b, dtype="Int64") - tm.assert_extension_array_equal(result, expected) +def test_to_integer_array_none_is_nan(a, b, using_nan_is_na): + if using_nan_is_na: + result = pd.array(a, dtype="Int64") + expected = pd.array(b, dtype="Int64") + tm.assert_extension_array_equal(result, expected) + else: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + pd.array(b, dtype="Int64") @pytest.mark.parametrize( @@ -139,6 +150,7 @@ def test_to_integer_array_error(values): # error in converting existing arrays to IntegerArrays msg = "|".join( [ + "cannot convert float NaN to integer", # with not using_nan_is_na r"cannot be converted to IntegerDtype", r"invalid literal for int\(\) with base 10:", r"values must be a 1D list-like", @@ -214,8 +226,16 @@ def test_to_integer_array_str(): ], ) def test_to_integer_array_bool( - constructor, bool_values, int_values, target_dtype, expected_dtype + constructor, bool_values, int_values, target_dtype, expected_dtype, using_nan_is_na ): + if not using_nan_is_na and np.isnan(bool_values[-1]): + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + constructor(bool_values, dtype=target_dtype) + with pytest.raises(ValueError, match=msg): + pd.array(int_values, dtype=target_dtype) + return + result = constructor(bool_values, dtype=target_dtype) assert result.dtype == expected_dtype expected = pd.array(int_values, dtype=target_dtype) @@ -230,8 +250,16 @@ def test_to_integer_array_bool( (np.array([1, np.nan]), "int8", Int8Dtype), ], ) -def test_to_integer_array(values, to_dtype, result_dtype): +def test_to_integer_array(values, to_dtype, result_dtype, using_nan_is_na): # convert existing arrays to IntegerArrays + if not using_nan_is_na and np.isnan(values[-1]): + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + IntegerArray._from_sequence(values, dtype=to_dtype) + with pytest.raises(ValueError, match=msg): + pd.array(values, dtype=result_dtype()) + return + result = IntegerArray._from_sequence(values, dtype=to_dtype) assert result.dtype == result_dtype() expected = pd.array(values, dtype=result_dtype()) diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 77a0dd12534cc..892a7a2be7b5c 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -9,24 +9,33 @@ @pytest.mark.parametrize("ufunc", [np.abs, np.sign]) # np.sign emits a warning with nans, @pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning") -def test_ufuncs_single_int(ufunc): +def test_ufuncs_single_int(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Int64") result = ufunc(a) - expected = pd.array(ufunc(a.astype(float)), dtype="Int64") + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[-1] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) s = pd.Series(a) result = ufunc(s) - expected = pd.Series(pd.array(ufunc(a.astype(float)), dtype="Int64")) + np_res = ufunc(a.astype(float)) + np_res = np_res.astype(object) + np_res[-1] = pd.NA + expected = pd.Series(pd.array(np_res, dtype="Int64")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt]) -def test_ufuncs_single_float(ufunc): +def test_ufuncs_single_float(ufunc, using_nan_is_na): a = pd.array([1, 2, -3, pd.NA], dtype="Int64") with np.errstate(invalid="ignore"): result = ufunc(a) - expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) + if using_nan_is_na: + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + else: + expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask) tm.assert_extension_array_equal(result, expected) s = pd.Series(a) @@ -41,34 +50,56 @@ def test_ufuncs_binary_int(ufunc): # two IntegerArrays a = pd.array([1, 2, -3, pd.NA], dtype="Int64") result = ufunc(a, a) - expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Int64") + np_res = ufunc(a.astype(float), a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) # IntegerArray with numpy array arr = np.array([1, 2, 3, 4]) result = ufunc(a, arr) - expected = pd.array(ufunc(a.astype(float), arr), dtype="Int64") + np_res = ufunc(a.astype(float), arr) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) result = ufunc(arr, a) - expected = pd.array(ufunc(arr, a.astype(float)), dtype="Int64") + np_res = ufunc(arr, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) # IntegerArray with scalar result = ufunc(a, 1) - expected = pd.array(ufunc(a.astype(float), 1), dtype="Int64") + np_res = ufunc(a.astype(float), 1) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) result = ufunc(1, a) - expected = pd.array(ufunc(1, a.astype(float)), dtype="Int64") + np_res = ufunc(1, a.astype(float)) + np_res = np_res.astype(object) + np_res[a.isna()] = pd.NA + expected = pd.array(np_res, dtype="Int64") tm.assert_extension_array_equal(result, expected) -def test_ufunc_binary_output(): - a = pd.array([1, 2, np.nan]) +def test_ufunc_binary_output(using_nan_is_na): + a = pd.array([1, 2, pd.NA], dtype="Int64") result = np.modf(a) - expected = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) - expected = (pd.array(expected[0]), pd.array(expected[1])) + np_res = np.modf(a.to_numpy(na_value=np.nan, dtype="float")) + + np_res = list(np_res) + np_res[0] = np_res[0].astype(object) + np_res[1] = np_res[1].astype(object) + np_res[0][-1] = pd.NA + np_res[1][-1] = pd.NA + + expected = (pd.array(np_res[0]), pd.array(np_res[1])) assert isinstance(result, tuple) assert len(result) == 2 diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 1c91cd25ba69c..f456d06a49fe5 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -96,8 +96,8 @@ def test_groupby_reductions(op, expected): ["median", Series([2, 2], index=["B", "C"], dtype="Float64")], ["var", Series([2, 2], index=["B", "C"], dtype="Float64")], ["std", Series([2**0.5, 2**0.5], index=["B", "C"], dtype="Float64")], - ["skew", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], - ["kurt", Series([pd.NA, pd.NA], index=["B", "C"], dtype="Float64")], + ["skew", Series([np.nan, pd.NA], index=["B", "C"], dtype="Float64")], + ["kurt", Series([np.nan, pd.NA], index=["B", "C"], dtype="Float64")], ["any", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], diff --git a/pandas/tests/arrays/interval/test_interval_pyarrow.py b/pandas/tests/arrays/interval/test_interval_pyarrow.py index ef8701be81e2b..c8692bb98f346 100644 --- a/pandas/tests/arrays/interval/test_interval_pyarrow.py +++ b/pandas/tests/arrays/interval/test_interval_pyarrow.py @@ -51,7 +51,7 @@ def test_arrow_array(): pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) -def test_arrow_array_missing(): +def test_arrow_array_missing(using_nan_is_na): pa = pytest.importorskip("pyarrow") from pandas.core.arrays.arrow.extension_types import ArrowIntervalType diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index b4b1761217826..38a9488e5707d 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -38,17 +38,18 @@ def numpy_dtype(data): def test_round(data, numpy_dtype): # No arguments result = data.round() - expected = pd.array( - np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)), dtype=data.dtype - ) + np_result = np.round(data.to_numpy(dtype=numpy_dtype, na_value=None)) + exp_np = np_result.astype(object) + exp_np[data.isna()] = pd.NA + expected = pd.array(exp_np, dtype=data.dtype) tm.assert_extension_array_equal(result, expected) # Decimals argument result = data.round(decimals=2) - expected = pd.array( - np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2), - dtype=data.dtype, - ) + np_result = np.round(data.to_numpy(dtype=numpy_dtype, na_value=None), decimals=2) + exp_np = np_result.astype(object) + exp_np[data.isna()] = pd.NA + expected = pd.array(exp_np, dtype=data.dtype) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 96e1cc05e284c..06a910aa06108 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -527,7 +527,8 @@ def test_astype_float(dtype, any_float_dtype): # Don't compare arrays (37974) ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) result = ser.astype(any_float_dtype) - expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_dtype) + item = np.nan if isinstance(result.dtype, np.dtype) else pd.NA + expected = pd.Series([1.1, item, 3.3], dtype=any_float_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 821f51ee95ad3..cdf3b549bddee 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -355,7 +355,10 @@ def test_array_multiindex_raises(): ), ], ) -def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array, using_nan_is_na): + if not using_nan_is_na and arr[-1] is pd.NA: + expected = np.array([0, pd.NA], dtype=object) + box = index_or_series_or_array with tm.assert_produces_warning(None): diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 7f094db6ea524..6e55531bbce8f 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -30,7 +30,7 @@ def test_unique(index_or_series_obj): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize("null_obj", [np.nan, None]) -def test_unique_null(null_obj, index_or_series_obj): +def test_unique_null(null_obj, index_or_series_obj, using_nan_is_na): obj = index_or_series_obj if not allow_na_ops(obj): @@ -39,6 +39,12 @@ def test_unique_null(null_obj, index_or_series_obj): pytest.skip("Test doesn't make sense on empty data") elif isinstance(obj, pd.MultiIndex): pytest.skip(f"MultiIndex can't hold '{null_obj}'") + elif ( + null_obj is not None + and not using_nan_is_na + and obj.dtype in ["Int64", "UInt16", "Float32"] + ): + pytest.skip("NaN is not a valid NA for this dtype.") values = obj._values values[0:2] = null_obj diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 79eb64b5a654f..3e4075911f735 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -31,7 +31,7 @@ def test_can_hold_na_valid(self, data): # GH-20761 assert data._can_hold_na is True - def test_contains(self, data, data_missing): + def test_contains(self, data, data_missing, using_nan_is_na): # GH-37867 # Tests for membership checks. Membership checks for nan-likes is tricky and # the settled on rule is: `nan_like in arr` is True if nan_like is @@ -55,7 +55,21 @@ def test_contains(self, data, data_missing): # type check for e.g. two instances of Decimal("NAN") continue assert na_value_obj not in data - assert na_value_obj not in data_missing + if ( + using_nan_is_na + and isinstance(na_value_obj, float) + and isinstance( + data, + ( + pd.core.arrays.BaseMaskedArray, + pd.core.arrays.ArrowExtensionArray, + ), + ) + ): + # TODO: wrong place for this override + assert na_value_obj in data_missing + else: + assert na_value_obj not in data_missing def test_memory_usage(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 0e9ffce07bf98..07e1c1d96a196 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -176,20 +176,23 @@ def skip_if_doesnt_support_2d(self, dtype, request): # override becomes unnecessary. @pytest.mark.parametrize("na_action", [None, "ignore"]) - def test_map(self, data_missing, na_action): + def test_map(self, data_missing, na_action, using_nan_is_na): result = data_missing.map(lambda x: x, na_action=na_action) - if data_missing.dtype == Float32Dtype(): + if data_missing.dtype == Float32Dtype() and using_nan_is_na: # map roundtrips through objects, which converts to float64 expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) else: expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_map_na_action_ignore(self, data_missing_for_sorting): + def test_map_na_action_ignore(self, data_missing_for_sorting, using_nan_is_na): zero = data_missing_for_sorting[2] result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") if data_missing_for_sorting.dtype.kind == "b": expected = np.array([False, pd.NA, False], dtype=object) + elif not using_nan_is_na: + # TODO: would we prefer to get NaN in this case to get a non-object? + expected = np.array([zero, pd.NA, zero], dtype=object) else: expected = np.array([zero, np.nan, zero]) tm.assert_numpy_array_equal(result, expected) @@ -220,8 +223,7 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): if sdtype.kind in "iu": if op_name in ("__rtruediv__", "__truediv__", "__div__"): - filled = expected.fillna(np.nan) - expected = filled.astype("Float64") + expected = expected.astype("Float64") else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(sdtype) @@ -392,7 +394,9 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.Series( pd.array( getattr(ser.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, + dtype="Float64", ) ) + expected[np.isnan(expected)] = pd.NA + expected = expected.astype(expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index c428bd1820cb1..d25cb2d4b8e6e 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -365,12 +365,22 @@ def test_astype_extension_dtypes_1d(self, any_int_ea_dtype): tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) @pytest.mark.parametrize("dtype", ["category", "Int64"]) - def test_astype_extension_dtypes_duplicate_col(self, dtype): + def test_astype_extension_dtypes_duplicate_col(self, dtype, using_nan_is_na): # GH#24704 a1 = Series([0, np.nan, 4], name="a") a2 = Series([np.nan, 3, 5], name="a") df = concat([a1, a2], axis=1) + if dtype == "Int64" and not using_nan_is_na: + msg = "Cannot cast NaN value to Integer dtype" + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + with pytest.raises(ValueError, match=msg): + a1.astype(dtype) + with pytest.raises(ValueError, match=msg): + a2.astype(dtype) + return + result = df.astype(dtype) expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9e302dc5f94ee..41f72d17ebef7 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -643,7 +643,7 @@ def test_replace_mixed3(self): def test_replace_nullable_int_with_string_doesnt_cast(self): # GH#25438 don't cast df['a'] to float64 - df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df = DataFrame({"a": [1, 2, 3, pd.NA], "b": ["some", "strings", "here", "he"]}) df["a"] = df["a"].astype("Int64") res = df.replace("", np.nan) @@ -681,7 +681,7 @@ def test_replace_simple_nested_dict_with_nonexistent_value(self): def test_replace_NA_with_None(self): # gh-45601 - df = DataFrame({"value": [42, None]}).astype({"value": "Int64"}) + df = DataFrame({"value": [42, pd.NA]}, dtype="Int64") result = df.replace({pd.NA: None}) expected = DataFrame({"value": [42, None]}, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index cc23c292b66dc..5aacd2df11873 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -2121,7 +2121,9 @@ def test_fails_on_non_numeric(kernel): ], ) @pytest.mark.parametrize("min_count", [0, 2]) -def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): +def test_numeric_ea_axis_1( + method, skipna, min_count, any_numeric_ea_dtype, using_nan_is_na +): # GH 54341 df = DataFrame( { @@ -2170,5 +2172,11 @@ def test_numeric_ea_axis_1(method, skipna, min_count, any_numeric_ea_dtype): result = getattr(df, method)(axis=1, **kwargs) expected = getattr(expected_df, method)(axis=1, **kwargs) if method not in ("idxmax", "idxmin"): - expected = expected.astype(expected_dtype) + if using_nan_is_na: + expected = expected.astype(expected_dtype) + else: + mask = np.isnan(expected) + expected[mask] = 0 + expected = expected.astype(expected_dtype) + expected[mask] = pd.NA tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 28cb25b515ed2..815513fe96009 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -255,7 +255,9 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): def test_groupby_quantile_NA_float(any_float_dtype): # GH#42849 - df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + dtype = pd.Series([], dtype=any_float_dtype).dtype + item = np.nan if isinstance(dtype, np.dtype) else pd.NA + df = DataFrame({"x": [1, 1], "y": [0.2, item]}, dtype=any_float_dtype) result = df.groupby("x")["y"].quantile(0.5) exp_index = Index([1.0], dtype=any_float_dtype, name="x") @@ -353,7 +355,7 @@ def test_groupby_quantile_allNA_column(dtype): df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) expected = pd.Series( - [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" + [pd.NA], dtype=dtype, index=Index([1.0], dtype=dtype), name="y" ) expected.index.name = "x" tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index e60e7d6bc05d4..977d98f81e0f3 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -186,9 +186,10 @@ def test_masked_kleene_logic(all_boolean_reductions, skipna, data): ) def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2): # GH#37506 - data = [1.0, np.nan] + data1 = [1.0, np.nan] if dtype1.startswith("f") else [1.0, pd.NA] + data2 = [1.0, np.nan] if dtype2.startswith("f") else [1.0, pd.NA] df = DataFrame( - {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)} + {"col1": pd.array(data1, dtype=dtype1), "col2": pd.array(data2, dtype=dtype2)} ) result = df.groupby([1, 1]).agg("all", skipna=False) diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index cf5fc2977a28f..c134e44681122 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -671,7 +671,7 @@ def test_from_frame_missing_values_multiIndex(): multi_indexed = MultiIndex.from_frame(df) expected = MultiIndex.from_arrays( [ - Series([1, 2, None]).astype("Int64"), + Series([1, 2, None], dtype="Int64"), pd.Float64Dtype().__from_arrow__(pa.array([0.2, None, None])), ], names=["a", "b"], diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index b29f783203177..2f37b15ca74f5 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -339,35 +339,50 @@ def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype): with pytest.raises(KeyError, match="NA"): idx.get_loc(NA) - def test_get_loc_masked_na_and_nan(self): + def test_get_loc_masked_na_and_nan(self, using_nan_is_na): # GH#39133 - idx = Index( - FloatingArray( - np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) - ) - ) - result = idx.get_loc(NA) - assert result == 2 - result = idx.get_loc(np.nan) - assert result == 3 + mask = np.array([False, False, True, False]) + if using_nan_is_na: + mask[-1] = True + + idx = Index(FloatingArray(np.array([1, 2, 1, np.nan]), mask=mask)) + if using_nan_is_na: + # NaN and NA are consistently treated as the same + result = idx.get_loc(NA) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) + result = idx.get_loc(np.nan) + tm.assert_numpy_array_equal(result, expected) + else: + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 idx = Index( FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) ) result = idx.get_loc(NA) assert result == 2 - with pytest.raises(KeyError, match="nan"): - idx.get_loc(np.nan) + if using_nan_is_na: + result = idx.get_loc(np.nan) + assert result == 2 + else: + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) - idx = Index( - FloatingArray( - np.array([1, 2, np.nan]), mask=np.array([False, False, False]) - ) - ) + mask = np.array([False, False, False]) + if using_nan_is_na: + mask[-1] = True + idx = Index(FloatingArray(np.array([1, 2, np.nan]), mask=mask)) result = idx.get_loc(np.nan) assert result == 2 - with pytest.raises(KeyError, match="NA"): - idx.get_loc(NA) + if using_nan_is_na: + result = idx.get_loc(NA) + assert result == 2 + else: + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) @pytest.mark.parametrize("val", [4, 2]) def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c04ea129590bc..5414389f52fc5 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -10,6 +10,7 @@ from pandas.errors import IndexingError from pandas import ( + NA, Categorical, CategoricalDtype, DataFrame, @@ -794,9 +795,9 @@ def test_iloc_mask(self): idx is None or (idx == "index" and method != ".iloc") ) and "0b" in expected_result: # For successful numeric results, exact match is needed - assert expected_result == answer, ( - f"[{key}] does not match [{answer}]" - ) + assert ( + expected_result == answer + ), f"[{key}] does not match [{answer}]" else: # For error messages, substring match is sufficient assert expected_result in answer, f"[{key}] not found in [{answer}]" @@ -1480,8 +1481,10 @@ def test_iloc_setitem_pure_position_based(self): def test_iloc_nullable_int64_size_1_nan(self): # GH 31861 result = DataFrame({"a": ["test"], "b": [np.nan]}) + + ser = Series([NA], name="b", dtype="Int64") with pytest.raises(TypeError, match="Invalid value"): - result.loc[:, "b"] = result.loc[:, "b"].astype("Int64") + result.loc[:, "b"] = ser def test_iloc_arrow_extension_array(self): # GH#61311 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ccb58aae2783f..3aa2eb2e42f91 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2929,7 +2929,7 @@ def test_loc_getitem_multiindex_tuple_level(): def test_loc_getitem_nullable_index_with_duplicates(): # GH#34497 df = DataFrame( - data=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, np.nan, np.nan]]).T, + data=np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, pd.NA, pd.NA]]).T, columns=["a", "b", "c"], dtype="Int64", ) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 63332fe4658e5..10335ff716c1f 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -656,8 +656,10 @@ def test_cut_incorrect_labels(labels): def test_cut_nullable_integer(bins, right, include_lowest): a = np.random.default_rng(2).integers(0, 10, size=50).astype(float) a[::2] = np.nan + b = a.astype(object) + b[::2] = pd.NA result = cut( - pd.array(a, dtype="Int64"), bins, right=right, include_lowest=include_lowest + pd.array(b, dtype="Int64"), bins, right=right, include_lowest=include_lowest ) expected = cut(a, bins, right=right, include_lowest=include_lowest) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 2c441a6ed91c1..3e8c13685aca1 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -732,9 +732,9 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): "input_series, expected_output", [ [["2020-01-01"], [[2020, 1, 3]]], - [[pd.NaT], [[np.nan, np.nan, np.nan]]], + [[pd.NaT], [[None, None, None]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], - [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.nan, np.nan, np.nan]]], + [["2010-01-01", pd.NaT], [[2009, 53, 5], [None, None, None]]], # see GH#36032 [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], diff --git a/pandas/tests/series/methods/test_case_when.py b/pandas/tests/series/methods/test_case_when.py index acfc58bea728e..7cb60a11644a3 100644 --- a/pandas/tests/series/methods/test_case_when.py +++ b/pandas/tests/series/methods/test_case_when.py @@ -2,7 +2,6 @@ import pytest from pandas import ( - NA, DataFrame, Series, array as pd_array, @@ -100,7 +99,7 @@ def test_case_when_multiple_conditions_replacement_extension_dtype(df): (df["a"].gt(1) & df["b"].eq(5), pd_array([1, 2, 3], dtype="Int64")), ], ) - expected = Series([1, 2, NA], dtype="Float64") + expected = Series([1, 2, np.nan], dtype="Float64") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 8ed422fc118dc..c1ee7f8c9e008 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -43,21 +43,16 @@ def test_clip_types_and_nulls(self): assert list(isna(s)) == list(isna(lower)) assert list(isna(s)) == list(isna(upper)) - def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixture): + def test_series_clipping_with_na_values(self, any_numeric_ea_dtype): # Ensure that clipping method can handle NA values with out failing # GH#40581 - if nulls_fixture is pd.NaT: - # constructor will raise, see - # test_constructor_mismatched_null_nullable_dtype - pytest.skip("See test_constructor_mismatched_null_nullable_dtype") - - ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + ser = Series([pd.NA, 1.0, 3.0], dtype=any_numeric_ea_dtype) s_clipped_upper = ser.clip(upper=2.0) s_clipped_lower = ser.clip(lower=2.0) - expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) - expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) + expected_upper = Series([pd.NA, 1.0, 2.0], dtype=any_numeric_ea_dtype) + expected_lower = Series([pd.NA, 2.0, 3.0], dtype=any_numeric_ea_dtype) tm.assert_series_equal(s_clipped_upper, expected_upper) tm.assert_series_equal(s_clipped_lower, expected_lower) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 324e03894e92c..e36baba5e0108 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -182,6 +182,7 @@ def test_convert_dtypes( expected_other, params, using_infer_string, + using_nan_is_na, ): if ( hasattr(data, "dtype") @@ -224,6 +225,16 @@ def test_convert_dtypes( # If convert_string=False and infer_objects=True, we end up with the # default string dtype instead of preserving object for string data expected_dtype = pd.StringDtype(na_value=np.nan) + if ( + not using_nan_is_na + and expected_dtype == "Int64" + and isinstance(data[1], float) + and np.isnan(data[1]) + ): + if params_dict["convert_floating"]: + expected_dtype = "Float64" + else: + expected_dtype = "float64" expected = pd.Series(data, dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 7d96f7f862fce..55ee660d09067 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -279,7 +279,10 @@ def test_rank_tie_methods( method, exp = results if ( dtype == "int64" - or (dtype in ["int64[pyarrow]", "uint64[pyarrow]"] and not using_nan_is_na) + or ( + dtype in ["int64[pyarrow]", "uint64[pyarrow]", "Int64"] + and not using_nan_is_na + ) or (not using_infer_string and dtype == "str") ): pytest.skip("int64/str does not support NaN") @@ -300,6 +303,11 @@ def test_rank_tie_methods( elif method == "first": exp[np.isnan(ser)] = [9, 10] + if dtype == "string[pyarrow]" and not using_nan_is_na: + mask = np.isnan(exp) + exp = exp.astype(object) + exp[mask] = NA + expected = Series(exp, dtype=expected_dtype(dtype, method)) tm.assert_series_equal(result, expected) @@ -320,7 +328,15 @@ def test_rank_tie_methods( ], ) def test_rank_tie_methods_on_infs_nans( - self, rank_method, na_option, ascending, dtype, na_value, pos_inf, neg_inf + self, + rank_method, + na_option, + ascending, + dtype, + na_value, + pos_inf, + neg_inf, + using_nan_is_na, ): pytest.importorskip("scipy") if dtype == "float64[pyarrow]": @@ -352,7 +368,7 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] - elif dtype == "float64[pyarrow]": + elif dtype == "float64[pyarrow]" and not using_nan_is_na: order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] @@ -424,7 +440,7 @@ def test_rank_descending( method, _ = results if ( dtype == "int64" - or (dtype in ["int64[pyarrow]"] and not using_nan_is_na) + or (dtype in ["int64[pyarrow]", "Int64", "Float64"] and not using_nan_is_na) or (not using_infer_string and dtype == "str") ): s = ser.dropna().astype(dtype) @@ -436,6 +452,8 @@ def test_rank_descending( expected = (s.astype("float64").max() - s.astype("float64")).rank() else: expected = (s.max() - s).rank() + if dtype == "string[pyarrow]" and not using_nan_is_na: + expected = expected.replace(np.nan, NA) tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) if dtype.startswith("str"): @@ -445,6 +463,8 @@ def test_rank_descending( else: expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) + if dtype == "string[pyarrow]" and not using_nan_is_na: + expected = expected.replace(np.nan, NA) tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): From 95f9ad958504e9deff2b1d4114a7027170074182 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 09:37:39 -0700 Subject: [PATCH 32/39] update style test --- pandas/tests/io/formats/style/test_highlight.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/style/test_highlight.py b/pandas/tests/io/formats/style/test_highlight.py index 5d19e9c14d534..98c1f70f08e89 100644 --- a/pandas/tests/io/formats/style/test_highlight.py +++ b/pandas/tests/io/formats/style/test_highlight.py @@ -15,8 +15,10 @@ @pytest.fixture(params=[(None, "float64"), (NA, "Int64")]) def df(request): # GH 45804 + dtype = request.param[1] + item = np.nan if dtype == "float64" else NA return DataFrame( - {"A": [0, np.nan, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1] + {"A": [0, item, 10], "B": [1, request.param[0], 2]}, dtype=request.param[1] ) From 1e3d105240d4d28640e8b82f4dd2837eba01c134 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 09:43:18 -0700 Subject: [PATCH 33/39] update asvs, mypy ignores --- asv_bench/benchmarks/algorithms.py | 4 ++-- asv_bench/benchmarks/frame_methods.py | 3 +++ asv_bench/benchmarks/groupby.py | 4 ++++ pandas/core/algorithms.py | 7 ++++--- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 933e8fbc175d8..422ba5201bc4e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -199,8 +199,8 @@ class SortIntegerArray: params = [10**3, 10**5] def setup(self, N): - data = np.arange(N, dtype=float) - data[40] = np.nan + data = np.arange(N, dtype=float).astype(object) + data[40] = pd.NA self.array = pd.array(data, dtype="Int64") def time_argsort(self, N): diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index cd7851acae3f2..14fa64c01f1a5 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -4,6 +4,7 @@ import numpy as np from pandas import ( + NA, DataFrame, Index, MultiIndex, @@ -445,6 +446,8 @@ def setup(self, inplace, dtype): values[::2] = np.nan if dtype == "Int64": values = values.round() + values = values.astype(object) + values[::2] = NA self.df = DataFrame(values, dtype=dtype) self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 19c556dfe9d1f..7c1d6457eea15 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -689,6 +689,10 @@ def setup(self, dtype, method, with_nans): null_vals = vals.astype(float, copy=True) null_vals[::2, :] = np.nan null_vals[::3, :] = np.nan + if dtype in ["Int64", "Float64"]: + null_vals = null_vals.astype(object) + null_vals[::2, :] = NA + null_vals[::3, :] = NA df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) df["key"] = keys self.df = df diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c14ab2bc02da2..aa950022c5d1e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1067,12 +1067,13 @@ def rank( is_datetimelike = needs_i8_conversion(values.dtype) if ( isinstance(values.dtype, BaseMaskedDtype) - and values._hasna + and values._hasna # type: ignore[union-attr] and values.dtype.kind in "iuf" ): # e.g. test_rank_ea_small_values - # TODO: bug in the object-dtype path that we would get without this special casting. - values = values.to_numpy(dtype=np.float64, na_value=np.nan) + # TODO: bug in the object-dtype path that we would get without + # this special casting. + values = values.to_numpy(dtype=np.float64, na_value=np.nan) # type: ignore[union-attr] else: values = _ensure_data(values) From cefeb6b2ab85175ca729cd9d78c2e7a280418e5f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 09:48:50 -0700 Subject: [PATCH 34/39] pre-commit fixup --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 1478380d90a7d..65f3b3f179e4c 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -429,7 +429,7 @@ def is_terminal() -> bool: cf.register_option( "nan_is_na", - os.environ.get("PANDAS_NAN_IS_NA", 0) == "1", + os.environ.get("PANDAS_NAN_IS_NA", "0") == "1", "Whether to treat NaN entries as interchangeable with pd.NA in " "numpy-nullable and pyarrow float dtypes. See discussion in " "https://github.com/pandas-dev/pandas/issues/32265", From bc9c8891b2374c3db362a0332bba3c440a414c81 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 10:11:08 -0700 Subject: [PATCH 35/39] doc fixup --- doc/source/whatsnew/v0.24.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0f40f5bfa5fc9..27d5a65a08467 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -50,7 +50,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` .. ipython:: python - s = pd.Series([1, 2, np.nan], dtype='Int64') + s = pd.Series([1, 2, pd.NA], dtype='Int64') s @@ -166,7 +166,7 @@ See the :ref:`dtypes docs ` for more on extension arrays. .. ipython:: python - pd.array([1, 2, np.nan], dtype='Int64') + pd.array([1, 2, pd.NA], dtype='Int64') pd.array(['a', 'b', 'c'], dtype='category') Passing data for which there isn't dedicated extension type (e.g. float, integer, etc.) From 74ab221d3bf6f1946eb7ee83b80dee952ee1b98f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 13:41:09 -0700 Subject: [PATCH 36/39] Remove special-casing --- pandas/core/algorithms.py | 12 +----------- pandas/tests/series/methods/test_rank.py | 22 ++++++++-------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aa950022c5d1e..533b9b689af0b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1065,17 +1065,7 @@ def rank( (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ is_datetimelike = needs_i8_conversion(values.dtype) - if ( - isinstance(values.dtype, BaseMaskedDtype) - and values._hasna # type: ignore[union-attr] - and values.dtype.kind in "iuf" - ): - # e.g. test_rank_ea_small_values - # TODO: bug in the object-dtype path that we would get without - # this special casting. - values = values.to_numpy(dtype=np.float64, na_value=np.nan) # type: ignore[union-attr] - else: - values = _ensure_data(values) + values = _ensure_data(values) if values.ndim == 1: ranks = algos.rank_1d( diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 55ee660d09067..ee221c0b72e65 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -288,20 +288,14 @@ def test_rank_tie_methods( pytest.skip("int64/str does not support NaN") ser = ser if dtype is None else ser.astype(dtype) + if dtype in ["float64[pyarrow]", "Float64"] and not using_nan_is_na: + # TODO: use ser.replace(np.nan, NA) once that works + ser[np.isnan(ser.to_numpy(dtype=np.float64, na_value=np.nan))] = NA + mask = np.isnan(exp) + exp = exp.astype(object) + exp[mask] = NA + result = ser.rank(method=method) - if dtype == "float64[pyarrow]" and not using_nan_is_na: - # the NaNs are not treated as NA - exp = exp.copy() - if method == "average": - exp[np.isnan(ser)] = 9.5 - elif method == "dense": - exp[np.isnan(ser)] = 6 - elif method == "max": - exp[np.isnan(ser)] = 10 - elif method == "min": - exp[np.isnan(ser)] = 9 - elif method == "first": - exp[np.isnan(ser)] = [9, 10] if dtype == "string[pyarrow]" and not using_nan_is_na: mask = np.isnan(exp) @@ -368,7 +362,7 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] - elif dtype == "float64[pyarrow]" and not using_nan_is_na: + elif dtype in ("float64[pyarrow]", "Float64") and not using_nan_is_na: order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] From 0773a86df7c14690ab63c3c720e6fc653db2ca0d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 13:58:37 -0700 Subject: [PATCH 37/39] comment --- pandas/tests/series/methods/test_rank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index ee221c0b72e65..357894cbd0fe3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -280,6 +280,7 @@ def test_rank_tie_methods( if ( dtype == "int64" or ( + # TODO: these can work but need to update ser construction. dtype in ["int64[pyarrow]", "uint64[pyarrow]", "Int64"] and not using_nan_is_na ) From 35a3c144b2671ca978e31c491463d8386ce010d6 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Aug 2025 19:13:34 -0700 Subject: [PATCH 38/39] ruff format --- pandas/tests/indexing/test_iloc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 5414389f52fc5..bbff484341ad5 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -795,9 +795,9 @@ def test_iloc_mask(self): idx is None or (idx == "index" and method != ".iloc") ) and "0b" in expected_result: # For successful numeric results, exact match is needed - assert ( - expected_result == answer - ), f"[{key}] does not match [{answer}]" + assert expected_result == answer, ( + f"[{key}] does not match [{answer}]" + ) else: # For error messages, substring match is sufficient assert expected_result in answer, f"[{key}] not found in [{answer}]" From 71d1c03766fbb17e7d7a1325be7170f08bac03e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 18:10:41 -0700 Subject: [PATCH 39/39] Set default to True --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 65f3b3f179e4c..9b317b51cabdc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -429,7 +429,7 @@ def is_terminal() -> bool: cf.register_option( "nan_is_na", - os.environ.get("PANDAS_NAN_IS_NA", "0") == "1", + os.environ.get("PANDAS_NAN_IS_NA", "1") == "1", "Whether to treat NaN entries as interchangeable with pd.NA in " "numpy-nullable and pyarrow float dtypes. See discussion in " "https://github.com/pandas-dev/pandas/issues/32265",