diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec5027840dfd5..0f8e026761db0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -814,6 +814,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. 
(:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8cadde1ad6537..09759d4127ac8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,6 +3,10 @@ from typing import TYPE_CHECKING import warnings +import numpy as np + +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -11,9 +15,17 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, +) from pandas.core.dtypes.inference import is_integer +from pandas.core.arrays.string_ import StringDtype + from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase @@ -140,20 +152,7 @@ def handle_warning(invalid_row) -> str: "encoding": self.encoding, } - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: - """ - Processes data read in based on kwargs. - - Parameters - ---------- - frame: DataFrame - The DataFrame to process. - - Returns - ------- - DataFrame - The processed DataFrame. - """ + def _finalize_column_names(self, frame: DataFrame) -> DataFrame: num_cols = len(frame.columns) multi_index_named = True if self.header is None: @@ -196,6 +195,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: if self.header is None and not multi_index_named: frame.index.names = [None] * len(frame.index.names) + return frame + + def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + """ + Processes data read in based on kwargs. + + Parameters + ---------- + frame: DataFrame + The DataFrame to process. 
+ + Returns ------- + DataFrame + The processed DataFrame. + """ + if self.dtype is not None: # Ignore non-existent columns from dtype mapping # like other parsers do @@ -282,6 +298,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) + workaround = False + pass_backend = dtype_backend + if self.dtype is not None and dtype_backend != "pyarrow": + # We pass dtype_backend="numpy_nullable" and subsequently cast + # to avoid lossy conversion e.g. GH#56136 + workaround = True + pass_backend = "numpy_nullable" + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -289,7 +313,49 @@ def read(self) -> DataFrame: DeprecationWarning, ) frame = arrow_table_to_pandas( - table, dtype_backend=dtype_backend, null_to_int64=True + table, dtype_backend=pass_backend, null_to_int64=True ) + frame = self._finalize_column_names(frame) + + if workaround and dtype_backend != "numpy_nullable": + old_dtype = self.dtype + if not isinstance(old_dtype, dict): + # e.g. test_categorical_dtype_utf16 + old_dtype = dict.fromkeys(frame.columns, old_dtype) + + # _finalize_pandas_output will call astype, but we need to make + # sure all keys are populated appropriately. + new_dtype = {} + for key in frame.columns: + ser = frame[key] + if isinstance(ser.dtype, BaseMaskedDtype): + new_dtype[key] = ser.dtype.numpy_dtype + if ( + key in old_dtype + and not using_string_dtype() + and is_string_dtype(old_dtype[key]) + and not isinstance(old_dtype[key], StringDtype) + and ser.array._hasna + ): + # Cast to make sure we get "NaN" string instead of "NA" + frame[key] = ser.astype(old_dtype[key]) + frame.loc[ser.isna(), key] = np.nan + old_dtype[key] = object # Avoid re-casting + elif isinstance(ser.dtype, StringDtype): + # We cast here in case the user passed "category" in + # order to get the correct dtype.categories.dtype + # e.g. 
test_categorical_dtype_utf16 + if not using_string_dtype(): + sdt = np.dtype(object) + frame[key] = ser.astype(sdt) + frame.loc[ser.isna(), key] = np.nan + else: + sdt = StringDtype(na_value=np.nan) # type: ignore[assignment] + frame[key] = frame[key].astype(sdt) + new_dtype[key] = sdt + + new_dtype.update(old_dtype) + self.dtype = new_dtype + return self._finalize_pandas_output(frame) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 75b7cf0d42cb8..e4563afc631c5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) -# pyarrow engine failing: -# https://github.com/pandas-dev/pandas/issues/56136 -@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 213fa2c01cef4..5f08f5ef466cf 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) -def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request): # see gh-20377 parser = all_parsers + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched shape") + request.applymarker(mark) 
+ data = "a,b,c\n1,,3\n4,5,6" # na_filter=True --> missing value becomes NaN. @@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers): True False """ - with pytest.raises(ValueError, match="convert|NoneType"): + msg = ( + "cannot safely convert passed user dtype of int(64|32) for " + " dtyped data in column 0 due to NA values" + ) + if parser.engine == "python": + msg = "Unable to convert column 0 to type int(64|32)" + elif parser.engine == "pyarrow": + msg = r"cannot convert NA to integer" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype="int")