From 7cbebaf981e45648358943e626edc25677c2104d Mon Sep 17 00:00:00 2001
From: Uddeshya Singh
Date: Thu, 21 Jun 2018 15:09:18 +0530
Subject: [PATCH 01/55] Update merging.rst (#21568)

---
 doc/source/merging.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 1161656731f88..4d7cd0bdadef7 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -279,7 +279,7 @@ need to be:
 Ignoring indexes on the concatenation axis
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For ``DataFrame``s which don't have a meaningful index, you may wish to append
+For ``DataFrame`` s which don't have a meaningful index, you may wish to append
 them and ignore the fact that they may have overlapping indexes. To do this,
 use the ``ignore_index`` argument:
 
@@ -314,7 +314,7 @@ This is also a valid argument to :meth:`DataFrame.append`:
 Concatenating with mixed ndims
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-You can concatenate a mix of ``Series`` and ``DataFrame``s. The
+You can concatenate a mix of ``Series`` and ``DataFrame`` s. The
 ``Series`` will be transformed to ``DataFrame`` with the column name as
 the name of the ``Series``.
 

From 3b65b9572a1fc8a2b232544d4e194b7d9eacdaa6 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 11 Jun 2018 17:15:29 -0700
Subject: [PATCH 02/55] DOC: Add 0.23.2 whatsnew template (#21433)

(cherry picked from commit 879b15f3476d81d51f236d13684444579bafb8fd)
---
 doc/source/whatsnew/v0.23.2.txt | 82 +++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 doc/source/whatsnew/v0.23.2.txt

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
new file mode 100644
index 0000000000000..ec2eddcfd4d41
--- /dev/null
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -0,0 +1,82 @@
+.. _whatsnew_0232:
+
+v0.23.2
+-------
+
+This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
+and bug fixes. We recommend that all users upgrade to this version.
+
+.. contents:: What's new in v0.23.2
+    :local:
+    :backlinks: none
+
+.. _whatsnew_0232.enhancements:
+
+New features
+~~~~~~~~~~~~
+
+
+.. _whatsnew_0232.deprecations:
+
+Deprecations
+~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_0232.performance:
+
+Performance Improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+-
+-
+
+Documentation Changes
+~~~~~~~~~~~~~~~~~~~~~
+
+-
+-
+
+.. _whatsnew_0232.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+
+-
+-
+
+Conversion
+^^^^^^^^^^
+
+-
+-
+
+Indexing
+^^^^^^^^
+
+-
+-
+
+I/O
+^^^
+
+-
+-
+
+Plotting
+^^^^^^^^
+
+-
+-
+
+Reshaping
+^^^^^^^^^
+
+-
+-
+
+Categorical
+^^^^^^^^^^^
+
+-

From 22c5145861fcf21567e46dcb7fb608b08cdd66a1 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 11 Jun 2018 17:16:36 -0700
Subject: [PATCH 03/55] MAINT: More friendly error msg on Index overflow
 (#21377)

* MAINT: More useful error msg on Index overflow

Display a more friendly error message when there is an
OverflowError during Index construction.

Partially addresses gh-15832.

* DOC: Clarify how Index.__new__ handles dtype

Partially addresses gh-15823.

(cherry picked from commit defdb34bafa3900069d399ce597c0abbd4a2b0cc)
---
 pandas/core/indexes/base.py       | 12 +++++++++++-
 pandas/tests/indexes/test_base.py |  7 +++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 90238af9b3632..5fdb8fc59deca 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -187,6 +187,9 @@ class Index(IndexOpsMixin, PandasObject):
     ----------
     data : array-like (1-dimensional)
     dtype : NumPy dtype (default: object)
+        If dtype is None, we find the dtype that best fits the data.
+        If an actual dtype is provided, we coerce to that dtype if it's safe.
+        Otherwise, an error will be raised.
     copy : bool
         Make a copy of input ndarray
     name : object
@@ -312,7 +315,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
             if is_integer_dtype(dtype):
                 inferred = lib.infer_dtype(data)
                 if inferred == 'integer':
-                    data = np.array(data, copy=copy, dtype=dtype)
+                    try:
+                        data = np.array(data, copy=copy, dtype=dtype)
+                    except OverflowError:
+                        # gh-15823: a more user-friendly error message
+                        raise OverflowError(
+                            "the elements provided in the data cannot "
+                            "all be casted to the dtype {dtype}"
+                            .format(dtype=dtype))
                 elif inferred in ['floating', 'mixed-integer-float']:
                     if isna(data).any():
                         raise ValueError('cannot convert float '
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 1e4dd2921b3f5..19acfb294762c 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -455,6 +455,13 @@ def test_constructor_nonhashable_name(self, indices):
         tm.assert_raises_regex(TypeError, message,
                                indices.set_names, names=renamed)
 
+    def test_constructor_overflow_int64(self):
+        # see gh-15832
+        msg = ("the elements provided in the data cannot "
+               "all be casted to the dtype int64")
+        with tm.assert_raises_regex(OverflowError, msg):
+            Index([np.iinfo(np.uint64).max - 1], dtype="int64")
+
     def test_view_with_args(self):
 
         restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex',

From 191767168dfa21639d16a16319245969a8e974ad Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 12 Jun 2018 09:54:11 +0200
Subject: [PATCH 04/55] DOC: follow 0.23.1 template for 0.23.2 whatsnew
 (#21435)

(cherry picked from commit 1275f91b74d8a48671eb8e705807bf852a8806a8)
---
 doc/source/whatsnew/v0.23.2.txt | 36 +++++++++++++++-------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index ec2eddcfd4d41..c636e73fbd6c2 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -10,16 +10,11 @@ and bug fixes. We recommend that all users upgrade to this version.
     :local:
     :backlinks: none
 
-.. _whatsnew_0232.enhancements:
 
-New features
-~~~~~~~~~~~~
+.. _whatsnew_0232.fixed_regressions:
 
-
-.. _whatsnew_0232.deprecations:
-
-Deprecations
-~~~~~~~~~~~~
+Fixed Regressions
+~~~~~~~~~~~~~~~~~
 
 -
 -
@@ -43,40 +38,41 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 
+**Groupby/Resample/Rolling**
+
 -
 -
 
-Conversion
-^^^^^^^^^^
+**Conversion**
+
 -
 -
 
-Indexing
-^^^^^^^^
+**Indexing**
 
 -
 -
 
-I/O
-^^^
+**I/O**
 
 -
 -
 
-Plotting
-^^^^^^^^
+**Plotting**
 
 -
 -
 
-Reshaping
-^^^^^^^^^
+**Reshaping**
 
 -
 -
 
-Categorical
-^^^^^^^^^^^
+**Categorical**
+
+-
+
+**Other**
 
 -

From 475c8bcfde52545b7f46d3035691f20487415160 Mon Sep 17 00:00:00 2001
From: alimcmaster1
Date: Wed, 13 Jun 2018 11:25:58 +0100
Subject: [PATCH 05/55] Fix tests fragile to PATH (#21453)

(cherry picked from commit 7a49449b8c95fed027af1da35970743f23a93dff)
---
 pandas/tests/plotting/test_converter.py | 3 ++-
 pandas/tests/test_downstream.py         | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index 47cded19f5300..bb976a1e3e81c 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -1,4 +1,5 @@
 import subprocess
+import sys
 import pytest
 
 from datetime import datetime, date
@@ -27,7 +28,7 @@ def test_register_by_default(self):
                 "import pandas as pd; "
                 "units = dict(matplotlib.units.registry); "
                 "assert pd.Timestamp in units)'")
-        call = ['python', '-c', code]
+        call = [sys.executable, '-c', code]
         assert subprocess.check_call(call) == 0
 
     def test_warns(self):
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index afd7993fefc70..cf98cff97669a 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -3,6 +3,7 @@
 Testing that we work in the downstream packages
 """
 import subprocess
+import sys
 
 import pytest
 import numpy as np  # noqa
@@ -57,7 +58,7 @@ def test_xarray(df):
 
 def test_oo_optimizable():
     # GH 21071
-    subprocess.check_call(["python", "-OO", "-c", "import pandas"])
+    subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
 
 
 @tm.network

From d4c48aaadfa2a6cbf2375631101b79752504f004 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Wed, 13 Jun 2018 03:51:41 -0700
Subject: [PATCH 06/55] BUG: Construct Timestamp with tz correctly near DST
 border (#21407)

(cherry picked from commit bc4ccd7dfaceb92ac2c6dc345c1bc4489407108f)
---
 doc/source/whatsnew/v0.23.2.txt               |  4 ++++
 pandas/_libs/tslibs/conversion.pyx            | 22 ++++---------------
 pandas/tests/frame/test_timezones.py          | 10 +++++++++
 .../indexes/datetimes/test_construction.py    |  9 ++++++++
 .../indexes/datetimes/test_date_range.py      | 14 ++++++++++++
 .../tests/scalar/timestamp/test_timestamp.py  |  8 +++++++
 6 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index c636e73fbd6c2..1de44ffeb4160 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -73,6 +73,10 @@ Bug Fixes
 
 -
 
+**Timezones**
+- Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`)
+- Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`)
+
 **Other**
 
 -
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index f4841e6abb7e8..3cbef82437544 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -347,25 +347,11 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz,
     if tz is not None:
         tz = maybe_get_tz(tz)
 
-        # sort of a temporary hack
        if ts.tzinfo is not None:
-            if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'):
-                ts = tz.normalize(ts)
-                obj.value = pydatetime_to_dt64(ts, &obj.dts)
-                obj.tzinfo = ts.tzinfo
-            else:
-                # tzoffset
-                try:
-                    tz = ts.astimezone(tz).tzinfo
-                except:
-                    pass
-                obj.value = pydatetime_to_dt64(ts, &obj.dts)
-                ts_offset = get_utcoffset(ts.tzinfo, ts)
-                obj.value -= int(ts_offset.total_seconds() * 1e9)
-                tz_offset = get_utcoffset(tz, ts)
-                obj.value += int(tz_offset.total_seconds() * 1e9)
-                dt64_to_dtstruct(obj.value, &obj.dts)
-                obj.tzinfo = tz
+            # Convert the current timezone to the passed timezone
+            ts = ts.astimezone(tz)
+            obj.value = pydatetime_to_dt64(ts, &obj.dts)
+            obj.tzinfo = ts.tzinfo
         elif not is_utc(tz):
             ts = _localize_pydatetime(ts, tz)
             obj.value = pydatetime_to_dt64(ts, &obj.dts)
diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py
index fa589a0aa4817..3956968173070 100644
--- a/pandas/tests/frame/test_timezones.py
+++ b/pandas/tests/frame/test_timezones.py
@@ -133,3 +133,13 @@ def test_frame_reset_index(self, tz):
         xp = df.index.tz
         rs = roundtripped.index.tz
         assert xp == rs
+
+    @pytest.mark.parametrize('tz', [None, 'America/New_York'])
+    def test_boolean_compare_transpose_tzindex_with_dst(self, tz):
+        # GH 19970
+        idx = date_range('20161101', '20161130', freq='4H', tz=tz)
+        df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))},
+                       index=idx)
+        result = df.T == df.T
+        expected = DataFrame(True, index=list('ab'), columns=idx)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py
index dae69a86910af..b138b79caac76 100644
--- a/pandas/tests/indexes/datetimes/test_construction.py
+++ b/pandas/tests/indexes/datetimes/test_construction.py
@@ -469,6 +469,15 @@ def test_constructor_with_non_normalized_pytz(self, tz):
         result = DatetimeIndex(['2010'], tz=non_norm_tz)
         assert pytz.timezone(tz) is result.tz
 
+    def test_constructor_timestamp_near_dst(self):
+        # GH 20854
+        ts = [Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki'),
+              Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')]
+        result = DatetimeIndex(ts)
+        expected = DatetimeIndex([ts[0].to_pydatetime(),
+                                  ts[1].to_pydatetime()])
+        tm.assert_index_equal(result, expected)
+
 
 class TestTimeSeries(object):
 
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index 193804b66395b..ec37bbbcb6c02 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -278,6 +278,20 @@ def test_wom_len(self, periods):
         res = date_range(start='20110101', periods=periods, freq='WOM-1MON')
         assert len(res) == periods
 
+    def test_construct_over_dst(self):
+        # GH 20854
+        pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+                                                               ambiguous=True)
+        pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific',
+                                                               ambiguous=False)
+        expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'),
+                       pre_dst,
+                       pst_dst]
+        expected = DatetimeIndex(expect_data)
+        result = date_range(start='2010-11-7', periods=3,
+                            freq='H', tz='US/Pacific')
+        tm.assert_index_equal(result, expected)
+
 
 class TestGenRangeGeneration(object):
 
diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py
index ab87d98fca8eb..4689c7bea626f 100644
--- a/pandas/tests/scalar/timestamp/test_timestamp.py
+++ b/pandas/tests/scalar/timestamp/test_timestamp.py
@@ -528,6 +528,14 @@ def test_disallow_setting_tz(self, tz):
         with pytest.raises(AttributeError):
             ts.tz = tz
 
+    @pytest.mark.parametrize('offset', ['+0300', '+0200'])
+    def test_construct_timestamp_near_dst(self, offset):
+        # GH 20854
+        expected = Timestamp('2016-10-30 03:00:00{}'.format(offset),
+                             tz='Europe/Helsinki')
+        result = Timestamp(expected, tz='Europe/Helsinki')
+        assert result == expected
+
 
 class TestTimestamp(object):
 

From 14e5f3d4e604a9abe1ebefe9a136b026add6a7fc Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Wed, 13 Jun 2018 15:24:01 +0200
Subject: [PATCH 07/55] BUG: fix get_indexer_non_unique with CategoricalIndex
 key (#21457)

closes #21448

(cherry picked from commit 576d5c6b76e039a411a7cc4c0de29813e2de0149)
---
 doc/source/whatsnew/v0.23.2.txt           |  2 +-
 pandas/core/indexes/base.py               |  3 +++
 pandas/core/indexes/category.py           |  7 ++++++-
 pandas/tests/categorical/test_indexing.py | 20 +++++++++++++++++++-
 4 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 1de44ffeb4160..3e4326dea2ecc 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -51,7 +51,7 @@ Bug Fixes
 
 **Indexing**
 
--
+- Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 -
 
 **I/O**
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 5fdb8fc59deca..a85a0ea88855c 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -31,6 +31,7 @@
     is_dtype_equal,
     is_dtype_union_equal,
     is_object_dtype,
+    is_categorical,
     is_categorical_dtype,
     is_interval_dtype,
     is_period_dtype,
@@ -3357,6 +3358,8 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance):
     @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
     def get_indexer_non_unique(self, target):
         target = _ensure_index(target)
+        if is_categorical(target):
+            target = target.astype(target.dtype.categories.dtype)
         pself, ptarget = self._maybe_promote(target)
         if pself is not self or ptarget is not target:
             return pself.get_indexer_non_unique(ptarget)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 150eca32e229d..587090fa72def 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -598,7 +598,12 @@ def get_indexer_non_unique(self, target):
         target = ibase._ensure_index(target)
 
         if isinstance(target, CategoricalIndex):
-            target = target.categories
+            # Indexing on codes is more efficient if categories are the same:
+            if target.categories is self.categories:
+                target = target.codes
+                indexer, missing = self._engine.get_indexer_non_unique(target)
+                return _ensure_platform_int(indexer), missing
+            target = target.values
 
         codes = self.categories.get_indexer(target)
         indexer, missing = self._engine.get_indexer_non_unique(codes)
diff --git a/pandas/tests/categorical/test_indexing.py b/pandas/tests/categorical/test_indexing.py
index 9c27b1101e5ca..cf7b5cfa55882 100644
--- a/pandas/tests/categorical/test_indexing.py
+++ b/pandas/tests/categorical/test_indexing.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 import pandas.util.testing as tm
-from pandas import Categorical, Index, PeriodIndex
+from pandas import Categorical, Index, CategoricalIndex, PeriodIndex
 
 from pandas.tests.categorical.common import TestCategorical
 
@@ -103,3 +103,21 @@ def f():
             s.categories = [1, 2]
 
         pytest.raises(ValueError, f)
+
+    # Combinations of sorted/unique:
+    @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4],
+                                            [1, 3, 3, 4], [1, 2, 2, 4]])
+    # Combinations of missing/unique
+    @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
+    @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
+    def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
+        # GH 21448
+        key = key_class(key_values, categories=range(1, 5))
+        # Test for flat index and CategoricalIndex with same/different cats:
+        for dtype in None, 'category', key.dtype:
+            idx = Index(idx_values, dtype=dtype)
+            expected, exp_miss = idx.get_indexer_non_unique(key_values)
+            result, res_miss = idx.get_indexer_non_unique(key)
+
+            tm.assert_numpy_array_equal(expected, result)
+            tm.assert_numpy_array_equal(exp_miss, res_miss)

From 2272ef4d7d99018f6f570317f7ec3a3d0cd92580 Mon Sep 17 00:00:00 2001
From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com>
Date: Thu, 14 Jun 2018 15:53:14 +0530
Subject: [PATCH 08/55] CLN: Comparison methods for MultiIndex should have
 consistent behaviour for all nlevels (GH21149) (#21195)

(cherry picked from commit a8738ba69cd817f7d57c8c25957d2a59621e875f)
---
 doc/source/whatsnew/v0.23.2.txt    |  1 +
 pandas/core/indexes/base.py        |  3 ++-
 pandas/tests/indexes/test_multi.py | 17 +++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 3e4326dea2ecc..0d3f9cb8dd3b6 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -52,6 +52,7 @@ Bug Fixes
 **Indexing**
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
+- Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
 -
 
 **I/O**
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a85a0ea88855c..a2e237c8cc45d 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -97,7 +97,8 @@ def cmp_method(self, other):
         if needs_i8_conversion(self) and needs_i8_conversion(other):
             return self._evaluate_compare(other, op)
 
-        if is_object_dtype(self) and self.nlevels == 1:
+        from .multi import MultiIndex
+        if is_object_dtype(self) and not isinstance(self, MultiIndex):
             # don't pass MultiIndex
             with np.errstate(all='ignore'):
                 result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 182dbdf2cf4e4..df506ae9486ee 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -3295,3 +3295,20 @@ def test_duplicate_multiindex_labels(self):
         with pytest.raises(ValueError):
             ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
                            inplace=True)
+
+    def test_multiindex_compare(self):
+        # GH 21149
+        # Ensure comparison operations for MultiIndex with nlevels == 1
+        # behave consistently with those for MultiIndex with nlevels > 1
+
+        midx = pd.MultiIndex.from_product([[0, 1]])
+
+        # Equality self-test: MultiIndex object vs self
+        expected = pd.Series([True, True])
+        result = pd.Series(midx == midx)
+        tm.assert_series_equal(result, expected)
+
+        # Greater than comparison: MultiIndex object vs self
+        expected = pd.Series([False, False])
+        result = pd.Series(midx > midx)
+        tm.assert_series_equal(result, expected)

From e4e48f8f34adcf1fe6e37ead4cfd2b0b55547f74 Mon Sep 17 00:00:00 2001
From: Jeremy Schendel
Date: Fri, 15 Jun 2018 11:21:36 -0600
Subject: [PATCH 09/55] BUG: Fix Series.nlargest for integer boundary values
 (#21432)

(cherry picked from commit ec5956ed350d33ac2cee07bf9a24ea5315529443)
---
 doc/source/whatsnew/v0.23.2.txt       |   1 +
 pandas/conftest.py                    |  71 +++++++++
 pandas/core/algorithms.py             |   5 +-
 pandas/tests/frame/test_analytics.py  |  78 +++++-----
 pandas/tests/series/test_analytics.py | 209 ++++++++++++++++++++++++++
 5 files changed, 321 insertions(+), 43 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 0d3f9cb8dd3b6..d839a72323c78 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -80,4 +80,5 @@ Bug Fixes
 
 **Other**
 
+- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
 -
diff --git a/pandas/conftest.py b/pandas/conftest.py
index d5f399c7cd63d..9d806a91f37f7 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -129,6 +129,14 @@ def join_type(request):
     return request.param
 
 
+@pytest.fixture(params=['nlargest', 'nsmallest'])
+def nselect_method(request):
+    """
+    Fixture for trying all nselect methods
+    """
+    return request.param
+
+
 @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
 def nulls_fixture(request):
     """
@@ -170,3 +178,66 @@ def string_dtype(request):
     * 'U'
     """
     return request.param
+
+
+@pytest.fixture(params=["float32", "float64"])
+def float_dtype(request):
+    """
+    Parameterized fixture for float dtypes.
+
+    * float32
+    * float64
+    """
+
+    return request.param
+
+
+UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
+SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"]
+ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES
+
+
+@pytest.fixture(params=SIGNED_INT_DTYPES)
+def sint_dtype(request):
+    """
+    Parameterized fixture for signed integer dtypes.
+
+    * int8
+    * int16
+    * int32
+    * int64
+    """
+
+    return request.param
+
+
+@pytest.fixture(params=UNSIGNED_INT_DTYPES)
+def uint_dtype(request):
+    """
+    Parameterized fixture for unsigned integer dtypes.
+
+    * uint8
+    * uint16
+    * uint32
+    * uint64
+    """
+
+    return request.param
+
+
+@pytest.fixture(params=ALL_INT_DTYPES)
+def any_int_dtype(request):
+    """
+    Parameterized fixture for any integer dtypes.
+
+    * int8
+    * uint8
+    * int16
+    * uint16
+    * int32
+    * uint32
+    * int64
+    * uint64
+    """
+
+    return request.param
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 88bc497f9f22d..bcde32696c1ff 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -1131,9 +1131,12 @@ def compute(self, method):
             return dropped[slc].sort_values(ascending=ascending).head(n)
 
         # fast method
-        arr, _, _ = _ensure_data(dropped.values)
+        arr, pandas_dtype, _ = _ensure_data(dropped.values)
        if method == 'nlargest':
             arr = -arr
+            if is_integer_dtype(pandas_dtype):
+                # GH 21426: ensure reverse ordering at boundaries
+                arr -= 1
 
         if self.keep == 'last':
             arr = arr[::-1]
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index d1a4a5f615b86..90d7c46f7554f 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -12,7 +12,7 @@
 from numpy.random import randn
 import numpy as np
 
-from pandas.compat import lrange, product, PY35
+from pandas.compat import lrange, PY35
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
                     _np_version_under1p12, _np_version_under1p15)
@@ -2240,54 +2240,49 @@ class TestNLargestNSmallest(object):
 
     # ----------------------------------------------------------------------
     # Top / bottom
-    @pytest.mark.parametrize(
-        'method, n, order',
-        product(['nsmallest', 'nlargest'], range(1, 11),
-                [['a'],
-                 ['c'],
-                 ['a', 'b'],
-                 ['a', 'c'],
-                 ['b', 'a'],
-                 ['b', 'c'],
-                 ['a', 'b', 'c'],
-                 ['c', 'a', 'b'],
-                 ['c', 'b', 'a'],
-                 ['b', 'c', 'a'],
-                 ['b', 'a', 'c'],
-
-                 # dups!
-                 ['b', 'c', 'c'],
-
-                 ]))
-    def test_n(self, df_strings, method, n, order):
+    @pytest.mark.parametrize('order', [
+        ['a'],
+        ['c'],
+        ['a', 'b'],
+        ['a', 'c'],
+        ['b', 'a'],
+        ['b', 'c'],
+        ['a', 'b', 'c'],
+        ['c', 'a', 'b'],
+        ['c', 'b', 'a'],
+        ['b', 'c', 'a'],
+        ['b', 'a', 'c'],
+
+        # dups!
+        ['b', 'c', 'c']])
+    @pytest.mark.parametrize('n', range(1, 11))
+    def test_n(self, df_strings, nselect_method, n, order):
         # GH10393
         df = df_strings
         if 'b' in order:
 
             error_msg = self.dtype_error_msg_template.format(
-                column='b', method=method, dtype='object')
+                column='b', method=nselect_method, dtype='object')
             with tm.assert_raises_regex(TypeError, error_msg):
-                getattr(df, method)(n, order)
+                getattr(df, nselect_method)(n, order)
         else:
-            ascending = method == 'nsmallest'
-            result = getattr(df, method)(n, order)
+            ascending = nselect_method == 'nsmallest'
+            result = getattr(df, nselect_method)(n, order)
             expected = df.sort_values(order, ascending=ascending).head(n)
             tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.parametrize(
-        'method, columns',
-        product(['nsmallest', 'nlargest'],
-                product(['group'], ['category_string', 'string'])
-                ))
-    def test_n_error(self, df_main_dtypes, method, columns):
+    @pytest.mark.parametrize('columns', [
+        ('group', 'category_string'), ('group', 'string')])
+    def test_n_error(self, df_main_dtypes, nselect_method, columns):
         df = df_main_dtypes
+        col = columns[1]
         error_msg = self.dtype_error_msg_template.format(
-            column=columns[1], method=method, dtype=df[columns[1]].dtype)
+            column=col, method=nselect_method, dtype=df[col].dtype)
         # escape some characters that may be in the repr
         error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
                      .replace("[", "\\[").replace("]", "\\]"))
         with tm.assert_raises_regex(TypeError, error_msg):
-            getattr(df, method)(2, columns)
+            getattr(df, nselect_method)(2, columns)
 
     def test_n_all_dtypes(self, df_main_dtypes):
         df = df_main_dtypes
@@ -2308,15 +2303,14 @@ def test_n_identical_values(self):
         expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.parametrize(
-        'n, order',
-        product([1, 2, 3, 4, 5],
-                [['a', 'b', 'c'],
-                 ['c', 'b', 'a'],
-                 ['a'],
-                 ['b'],
-                 ['a', 'b'],
-                 ['c', 'b']]))
+    @pytest.mark.parametrize('order', [
+        ['a', 'b', 'c'],
+        ['c', 'b', 'a'],
+        ['a'],
+        ['b'],
+        ['a', 'b'],
+        ['c', 'b']])
+    @pytest.mark.parametrize('n', range(1, 6))
     def test_n_duplicate_index(self, df_duplicates, n, order):
         # GH 13412
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 6ea40329f4bc3..7a78b562ac1fa 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1866,6 +1866,189 @@ def s_main_dtypes():
     return df
 
 
+class TestMode(object):
+
+    @pytest.mark.parametrize('dropna, expected', [
+        (True, Series([], dtype=np.float64)),
+        (False, Series([], dtype=np.float64))
+    ])
+    def test_mode_empty(self, dropna, expected):
+        s = Series([], dtype=np.float64)
+        result = s.mode(dropna)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('dropna, data, expected', [
+        (True, [1, 1, 1, 2], [1]),
+        (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
+        (False, [1, 1, 1, 2], [1]),
+        (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]),
+    ])
+    @pytest.mark.parametrize(
+        'dt',
+        list(np.typecodes['AllInteger'] + np.typecodes['Float'])
+    )
+    def test_mode_numerical(self, dropna, data, expected, dt):
+        s = Series(data, dtype=dt)
+        result = s.mode(dropna)
+        expected = Series(expected, dtype=dt)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('dropna, expected', [
+        (True, [1.0]),
+        (False, [1, np.nan]),
+    ])
+    def test_mode_numerical_nan(self, dropna, expected):
+        s = Series([1, 1, 2, np.nan, np.nan])
+        result = s.mode(dropna)
+        expected = Series(expected)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [
+        (True, ['b'], ['bar'], ['nan']),
+        (False, ['b'], [np.nan], ['nan'])
+    ])
+    def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
+        # Test string and object types.
+        data = ['a'] * 2 + ['b'] * 3
+
+        s = Series(data, dtype='c')
+        result = s.mode(dropna)
+        expected1 = Series(expected1, dtype='c')
+        tm.assert_series_equal(result, expected1)
+
+        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
+
+        s = Series(data, dtype=object)
+        result = s.mode(dropna)
+        expected2 = Series(expected2, dtype=object)
+        tm.assert_series_equal(result, expected2)
+
+        data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
+
+        s = Series(data, dtype=object).astype(str)
+        result = s.mode(dropna)
+        expected3 = Series(expected3, dtype=str)
+        tm.assert_series_equal(result, expected3)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2', [
+        (True, ['foo'], ['foo']),
+        (False, ['foo'], [np.nan])
+    ])
+    def test_mode_mixeddtype(self, dropna, expected1, expected2):
+        s = Series([1, 'foo', 'foo'])
+        result = s.mode(dropna)
+        expected = Series(expected1)
+        tm.assert_series_equal(result, expected)
+
+        s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan])
+        result = s.mode(dropna)
+        expected = Series(expected2, dtype=object)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2', [
+        (True, ['1900-05-03', '2011-01-03', '2013-01-02'],
+               ['2011-01-03', '2013-01-02']),
+        (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']),
+    ])
+    def test_mode_datetime(self, dropna, expected1, expected2):
+        s = Series(['2011-01-03', '2013-01-02',
+                    '1900-05-03', 'nan', 'nan'], dtype='M8[ns]')
+        result = s.mode(dropna)
+        expected1 = Series(expected1, dtype='M8[ns]')
+        tm.assert_series_equal(result, expected1)
+
+        s = Series(['2011-01-03', '2013-01-02', '1900-05-03',
+                    '2011-01-03', '2013-01-02', 'nan', 'nan'],
+                   dtype='M8[ns]')
+        result = s.mode(dropna)
+        expected2 = Series(expected2, dtype='M8[ns]')
+        tm.assert_series_equal(result, expected2)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2', [
+        (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']),
+        (False, [np.nan], [np.nan, '2 min', '1 day']),
+    ])
+    def test_mode_timedelta(self, dropna, expected1, expected2):
+        # gh-5986: Test timedelta types.
+
+        s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'],
+                   dtype='timedelta64[ns]')
+        result = s.mode(dropna)
+        expected1 = Series(expected1, dtype='timedelta64[ns]')
+        tm.assert_series_equal(result, expected1)
+
+        s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min',
+                    '2 min', '2 min', 'nan', 'nan'],
+                   dtype='timedelta64[ns]')
+        result = s.mode(dropna)
+        expected2 = Series(expected2, dtype='timedelta64[ns]')
+        tm.assert_series_equal(result, expected2)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [
+        (True, Categorical([1, 2], categories=[1, 2]),
+         Categorical(['a'], categories=[1, 'a']),
+         Categorical([3, 1], categories=[3, 2, 1], ordered=True)),
+        (False, Categorical([np.nan], categories=[1, 2]),
+         Categorical([np.nan, 'a'], categories=[1, 'a']),
+         Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)),
+    ])
+    def test_mode_category(self, dropna, expected1, expected2, expected3):
+        s = Series(Categorical([1, 2, np.nan, np.nan]))
+        result = s.mode(dropna)
+        expected1 = Series(expected1, dtype='category')
+        tm.assert_series_equal(result, expected1)
+
+        s = Series(Categorical([1, 'a', 'a', np.nan, np.nan]))
+        result = s.mode(dropna)
+        expected2 = Series(expected2, dtype='category')
+        tm.assert_series_equal(result, expected2)
+
+        s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan],
+                               categories=[3, 2, 1], ordered=True))
+        result = s.mode(dropna)
+        expected3 = Series(expected3, dtype='category')
+        tm.assert_series_equal(result, expected3)
+
+    @pytest.mark.parametrize('dropna, expected1, expected2', [
+        (True, [2**63], [1, 2**63]),
+        (False, [2**63], [1, 2**63])
+    ])
+    def test_mode_intoverflow(self, dropna, expected1, expected2):
+        # Test for uint64 overflow.
+        s = Series([1, 2**63, 2**63], dtype=np.uint64)
+        result = s.mode(dropna)
+        expected1 = Series(expected1, dtype=np.uint64)
+        tm.assert_series_equal(result, expected1)
+
+        s = Series([1, 2**63], dtype=np.uint64)
+        result = s.mode(dropna)
+        expected2 = Series(expected2, dtype=np.uint64)
+        tm.assert_series_equal(result, expected2)
+
+    @pytest.mark.skipif(not compat.PY3, reason="only PY3")
+    def test_mode_sortwarning(self):
+        # Check for the warning that is raised when the mode
+        # results cannot be sorted
+
+        expected = Series(['foo', np.nan])
+        s = Series([1, 'foo', 'foo', np.nan, np.nan])
+
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            result = s.mode(dropna=False)
+            result = result.sort_values().reset_index(drop=True)
+
+        tm.assert_series_equal(result, expected)
+
+
+def assert_check_nselect_boundary(vals, dtype, method):
+    # helper function for 'test_boundary_{dtype}' tests
+    s = Series(vals, dtype=dtype)
+    result = getattr(s, method)(3)
+    expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
+    expected = s.loc[expected_idxr]
+    tm.assert_series_equal(result, expected)
+
+
 class TestNLargestNSmallest(object):
 
     @pytest.mark.parametrize(
@@ -1950,6 +2133,32 @@ def test_n(self, n):
         expected = s.sort_values().head(n)
         assert_series_equal(result, expected)
 
+    def test_boundary_integer(self, nselect_method, any_int_dtype):
+        # GH 21426
+        dtype_info = np.iinfo(any_int_dtype)
+        min_val, max_val = dtype_info.min, dtype_info.max
+        vals = [min_val, min_val + 1, max_val - 1, max_val]
+        assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)
+
+    def test_boundary_float(self, nselect_method, float_dtype):
+        # GH 21426
+        dtype_info = np.finfo(float_dtype)
+        min_val, max_val = dtype_info.min, dtype_info.max
+        min_2nd, max_2nd = np.nextafter(
+            [min_val, max_val], 0, dtype=float_dtype)
+        vals = [min_val, min_2nd, max_2nd, max_val]
+        assert_check_nselect_boundary(vals, float_dtype, nselect_method)
+
+    @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
+    def test_boundary_datetimelike(self, nselect_method, dtype):
+        # GH 21426
+        # use int64 bounds and +1 to min_val since true minimum is NaT
+        # (include min_val/NaT at end to maintain same expected_idxr)
+        dtype_info = np.iinfo('int64')
+        min_val, max_val = dtype_info.min, dtype_info.max
+        vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
+        assert_check_nselect_boundary(vals, dtype, nselect_method)
+
 
 class TestCategoricalSeriesAnalytics(object):
 

From e9ee3a10f8d2eb0ef927e7ad5007fac6d64217ae Mon Sep 17 00:00:00 2001
From: Pietro Battiston
Date: Mon, 18 Jun 2018 23:42:59 +0200
Subject: [PATCH 10/55] PERF: remove useless overrides (#21523)

closes #21522

(cherry picked from commit ea54d390ac69a4421f8e88810dd058e9894daf26)
---
 pandas/core/indexes/multi.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index fbcf06a28c1e5..c8332d762f7ef 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -852,14 +852,6 @@ def _has_complex_internals(self):
         # to disable groupby tricks
         return True
 
-    @cache_readonly
-    def is_monotonic(self):
-        """
-        return if the index is monotonic increasing (only equal or
-        increasing) values.
-        """
-        return self.is_monotonic_increasing
-
     @cache_readonly
     def is_monotonic_increasing(self):
         """
@@ -887,10 +879,6 @@ def is_monotonic_decreasing(self):
         # monotonic decreasing if and only if reverse is monotonic increasing
         return self[::-1].is_monotonic_increasing
 
-    @cache_readonly
-    def is_unique(self):
-        return not self.duplicated().any()
-
     @cache_readonly
     def _have_mixed_levels(self):
         """ return a boolean list indicated if we have mixed levels """

From 76551c2540a51c028193a16843b7e6d9fcbe47ba Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Mon, 18 Jun 2018 17:39:39 -0500
Subject: [PATCH 11/55] BUG: Timedelta.__bool__ (#21485)

Closes #21484

(cherry picked from commit d5a1232da14e86dea2b3db8b61741f3f9b56e55a)
---
 doc/source/whatsnew/v0.23.2.txt                 |  9 ++++++---
 pandas/_libs/tslibs/timedeltas.pyx              |  3 +++
 pandas/tests/scalar/timedelta/test_timedelta.py | 14 ++++++++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index d839a72323c78..ea6d8620289f8 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -43,10 +43,13 @@ Bug Fixes
 -
 -
 
-**Conversion**
+**Timedelta**
 
--
+- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`)
+
+**Conversion**
+
+- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
 -
 
 **Indexing**
@@ -75,10 +78,10 @@ Bug Fixes
 -
 
 **Timezones**
+
 - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`)
 - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`)
 
 **Other**
 
-- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
 -
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index e2b0b33053f83..769f3ca5fa8bf 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -899,6 +899,9 @@ cdef class _Timedelta(timedelta):
     def __str__(self):
         return self._repr_base(format='long')
 
+    def __bool__(self):
+        return self.value != 0
+
     def isoformat(self):
         """
         Format Timedelta as ISO 8601 Duration like
diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py
index 205fdf49d3e91..6472bd4245622 100644
--- a/pandas/tests/scalar/timedelta/test_timedelta.py
+++ b/pandas/tests/scalar/timedelta/test_timedelta.py
@@ -588,3 +588,17 @@ def test_components(self):
         result = s.dt.components
         assert not result.iloc[0].isna().all()
         assert result.iloc[1].isna().all()
+
+
+@pytest.mark.parametrize('value, expected', [
+    (Timedelta('10S'), True),
+    (Timedelta('-10S'), True),
+    (Timedelta(10, unit='ns'), True),
+    (Timedelta(0, unit='ns'), False),
+    (Timedelta(-10, unit='ns'), True),
+    (Timedelta(None), True),
+    (pd.NaT, True),
+])
+def test_truthiness(value, expected):
+    # https://github.com/pandas-dev/pandas/issues/21484
+    assert bool(value) is expected

From eb6f3681557f61aca378dd81ad92ff09fb05ad15 Mon Sep 17 00:00:00 2001
From: David Krych
Date: Mon, 18 Jun 2018 18:43:27 -0400
Subject: [PATCH 12/55] BUG: Fix Index construction when given empty generator
 (#21470). (#21481)

(cherry picked from commit 076635ac3a33b819f4ae0fb1f95106bf8e4bf329)
---
 doc/source/whatsnew/v0.23.2.txt   |  3 ++-
 pandas/core/arrays/categorical.py |  5 ++---
 pandas/core/indexes/base.py       | 10 ++++++----
 pandas/tests/indexes/test_base.py | 19 +++++++++++--------
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index ea6d8620289f8..2af89c15bb8fb 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -49,8 +49,9 @@ Bug Fixes
 
 **Conversion**
 
+- Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`)
 - Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`)
--
+
 
 **Indexing**
 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index a1a8f098b582e..b587a4c0bc722 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -3,7 +3,6 @@
 import numpy as np
 from warnings import warn
 import textwrap
-import types
 
 from pandas import compat
 from pandas.compat import u, lzip
@@ -28,7 +27,7 @@
     is_categorical,
     is_categorical_dtype,
     is_list_like, is_sequence,
-    is_scalar,
+    is_scalar, is_iterator,
     is_dict_like)
 
 from pandas.core.algorithms import factorize, take_1d, unique1d, take
@@ -2473,7 +2472,7 @@ def _convert_to_list_like(list_like):
     if isinstance(list_like, list):
         return list_like
     if (is_sequence(list_like) or isinstance(list_like, tuple) or
-            isinstance(list_like, types.GeneratorType)):
+            is_iterator(list_like)):
         return list(list_like)
     elif is_scalar(list_like):
         return [list_like]
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index a2e237c8cc45d..4dacec6a93c68 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -436,12 +436,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
         elif data is None or is_scalar(data):
             cls._scalar_data_error(data)
         else:
-            if tupleize_cols and is_list_like(data) and data:
+            if tupleize_cols and is_list_like(data):
+                # GH21470: convert iterable to list before determining if empty
                 if is_iterator(data):
                     data = list(data)
-                # we must be all tuples, otherwise don't construct
-                # 10697
-                if all(isinstance(e, tuple) for e in data):
+
+                if data and all(isinstance(e, tuple) for e in data):
+                    # we must be all tuples, otherwise don't construct
+                    # 10697
                     from .multi import MultiIndex
                     return MultiIndex.from_tuples(
                         data, names=name or kwargs.get('names'))
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 19acfb294762c..a0d6907055a2e 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -419,21 +419,24 @@ def test_constructor_dtypes_timedelta(self, attr, klass):
             result = klass(list(values), dtype=dtype)
             tm.assert_index_equal(result, index)
 
-    def test_constructor_empty_gen(self):
-        skip_index_keys = ["repeats", "periodIndex", "rangeIndex",
-                           "tuples"]
-        for key, index in self.generate_index_types(skip_index_keys):
-            empty = index.__class__([])
-            assert isinstance(empty, index.__class__)
-            assert not len(empty)
+    @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])])
+    @pytest.mark.parametrize("klass",
+                             [Index, Float64Index, Int64Index, UInt64Index,
+                              CategoricalIndex, DatetimeIndex, TimedeltaIndex])
+    def test_constructor_empty(self, value, klass):
+        empty = klass(value)
+        assert isinstance(empty, klass)
+        assert not len(empty)
 
     @pytest.mark.parametrize("empty,klass", [
         (PeriodIndex([], freq='B'), PeriodIndex),
+        (PeriodIndex(iter([]), freq='B'), PeriodIndex),
+        (PeriodIndex((x for x in []), freq='B'), PeriodIndex),
         (RangeIndex(step=1), pd.RangeIndex),
         (MultiIndex(levels=[[1, 2], ['blue', 'red']],
                     labels=[[], []]), MultiIndex)
    ])
-    def test_constructor_empty(self, empty, klass):
+    def test_constructor_empty_special(self, empty, klass):
         assert isinstance(empty, klass)
         assert not len(empty)
 

From 2292005d0e780036939a258d09c8a6db16ecdd74 Mon Sep 17 00:00:00 2001
From: Ming Li <14131823+minggli@users.noreply.github.com>
Date: Mon, 18 Jun 2018 23:45:25 +0100
Subject: [PATCH 13/55] BUG/REG: file-handle object handled incorrectly in
 to_csv (#21478)

(cherry picked from commit 91451cb7dbaaf6fb3f9bdfca73fe6adc2ee68cce)
---
 doc/source/whatsnew/v0.23.2.txt   |  2 +-
 pandas/io/common.py               |  4 +++
 pandas/io/formats/csvs.py         | 59 ++++++++++++++++++++-----------
 pandas/tests/frame/test_to_csv.py | 16 +++++----
 pandas/tests/series/test_io.py    | 18 +++++-----
 pandas/tests/test_common.py       | 34 +++++++++++++-----
 6 files changed, 87 insertions(+), 46 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 2af89c15bb8fb..e3205aecee121 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
 Fixed Regressions
 ~~~~~~~~~~~~~~~~~
 
--
+- Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
 -
 
 .. _whatsnew_0232.performance:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index a492b7c0b8e8e..ac9077f2db50e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
     def write(self, data):
         super(BytesZipFile, self).writestr(self.filename, data)
 
+    @property
+    def closed(self):
+        return self.fp is None
+
 
 class MMapWrapper(BaseIterator):
     """
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 7f660e2644fa4..60518f596e9af 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -5,11 +5,13 @@
 
 from __future__ import print_function
 
+import warnings
+
 import csv as csvlib
+from zipfile import ZipFile
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
-from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -128,19 +130,31 @@ def save(self):
         else:
             encoding = self.encoding
 
-        # PR 21300 uses string buffer to receive csv writing and dump into
-        # file-like output with compression as option. GH 21241, 21118
-        f = StringIO()
-        if not is_file_like(self.path_or_buf):
-            # path_or_buf is path
-            path_or_buf = self.path_or_buf
-        elif hasattr(self.path_or_buf, 'name'):
-            # path_or_buf is file handle
-            path_or_buf = self.path_or_buf.name
-        else:
-            # path_or_buf is file-like IO objects.
+        # GH 21227 internal compression is not used when file-like passed.
+        if self.compression and hasattr(self.path_or_buf, 'write'):
+            msg = ("compression has no effect when passing file-like "
+                   "object as input.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+        # when zip compression is called.
+        is_zip = isinstance(self.path_or_buf, ZipFile) or (
+            not hasattr(self.path_or_buf, 'write')
+            and self.compression == 'zip')
+
+        if is_zip:
+            # zipfile doesn't support writing string to archive. uses string
+            # buffer to receive csv writing and dump into zip compression
+            # file handle. GH 21241, 21118
+            f = StringIO()
+            close = False
+        elif hasattr(self.path_or_buf, 'write'):
             f = self.path_or_buf
-            path_or_buf = None
+            close = False
+        else:
+            f, handles = _get_handle(self.path_or_buf, self.mode,
+                                     encoding=encoding,
+                                     compression=self.compression)
+            close = True
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -157,13 +171,18 @@ def save(self):
 
             self._save()
 
         finally:
-            # GH 17778 handles zip compression for byte strings separately.
-            buf = f.getvalue()
-            if path_or_buf:
-                f, handles = _get_handle(path_or_buf, self.mode,
-                                         encoding=encoding,
-                                         compression=self.compression)
-                f.write(buf)
+            if is_zip:
+                # GH 17778 handles zip compression separately.
+                buf = f.getvalue()
+                if hasattr(self.path_or_buf, 'write'):
+                    self.path_or_buf.write(buf)
+                else:
+                    f, handles = _get_handle(self.path_or_buf, self.mode,
+                                             encoding=encoding,
+                                             compression=self.compression)
+                    f.write(buf)
+                    close = True
+            if close:
                 f.close()
                 for _fh in handles:
                     _fh.close()
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 60dc336a85388..3ad25ae73109e 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.compat import (lmap, range, lrange, StringIO, u)
+from pandas.io.common import _get_handle
 import pandas.core.common as com
 from pandas.errors import ParserError
 from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
@@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
         with ensure_clean() as filename:
             df.to_csv(filename, compression=compression, encoding=encoding)
 
-            # test the round trip - to_csv -> read_csv
             result = read_csv(filename, compression=compression,
                               index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
 
-            with open(filename, 'w') as fh:
-                df.to_csv(fh, compression=compression, encoding=encoding)
-
-            result_fh = read_csv(filename, compression=compression,
-                                 index_col=0, encoding=encoding)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                df.to_csv(f, encoding=encoding)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_frame_equal(df, result)
-            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 76dd4bc1f3d4a..90f37053ce17e 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -11,6 +11,7 @@
 from pandas import Series, DataFrame
 
 from pandas.compat import StringIO, u
+from pandas.io.common import _get_handle
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
@@ -152,20 +153,19 @@ def test_to_csv_compression(self, s, encoding, compression):
             s.to_csv(filename, compression=compression, encoding=encoding,
                      header=True)
 
-            # test the round trip - to_csv -> read_csv
             result = pd.read_csv(filename, compression=compression,
                                  encoding=encoding, index_col=0,
                                  squeeze=True)
+            assert_series_equal(s, result)
 
-            with open(filename, 'w') as fh:
-                s.to_csv(fh, compression=compression, encoding=encoding,
-                         header=True)
-
-            result_fh = pd.read_csv(filename, compression=compression,
-                                    encoding=encoding, index_col=0,
-                                    squeeze=True)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                s.to_csv(f, encoding=encoding, header=True)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_series_equal(s, result)
-            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 3443331e3d4ba..576239e49455e 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -11,6 +11,7 @@
 from pandas.compat import range, lmap
 import pandas.core.common as com
 from pandas.core import ops
+from pandas.io.common import _get_handle
 import pandas.util.testing as tm
 
 
@@ -248,19 +249,34 @@ def test_compression_size(obj, method, compression):
                     [12.32112, 123123.2, 321321.2]],
                    columns=['X', 'Y', 'Z']),
     Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
-@pytest.mark.parametrize('method', ['to_csv'])
+@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
 def test_compression_size_fh(obj, method, compression_only):
 
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=compression_only)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         compressed = os.path.getsize(filename)
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=None)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=None)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         uncompressed = os.path.getsize(filename)
         assert uncompressed > compressed
+
+
+# GH 21227
+def test_compression_warning(compression_only):
+    df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                          [12.32112, 123123.2, 321321.2]],
+                   columns=['X', 'Y', 'Z'])
+    with tm.ensure_clean() as filename:
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with tm.assert_produces_warning(RuntimeWarning,
+                                        check_stacklevel=False):
+            with f:
+                df.to_csv(f, compression=compression_only)

From 030a0589cdc8479c65223669b5bbf0d10a95f31c Mon Sep 17 00:00:00 2001
From: Jacopo Rota
Date: Tue, 19 Jun 2018 13:26:48 +0200
Subject: [PATCH 14/55] BUG: Handle read_csv corner case (#21176)

Closes gh-21141

(cherry picked from commit c2da06c8eea4cc0339717aa09acdd6765bc3d673)
---
 doc/source/whatsnew/v0.23.2.txt  |  1 +
 pandas/io/parsers.py             | 12 +++++++++++-
 pandas/tests/io/parser/common.py | 15 +++++++++++++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index e3205aecee121..f7c04ba9cfa9f 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -61,6 +61,7 @@ Bug Fixes
 
 **I/O**
 
+- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`)
 -
 -
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 2c8f98732c92f..65df2bffb4abf 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -3209,12 +3209,22 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
             col = columns[k] if is_integer(k) else k
             dtype[col] = v
 
-    if index_col is None or index_col is False:
+    # Even though we have no data, the "index" of the empty DataFrame
+    # could for example still be an empty MultiIndex. Thus, we need to
+    # check whether we have any index columns specified, via either:
+    #
+    # 1) index_col (column indices)
+    # 2) index_names (column names)
+    #
+    # Both must be non-null to ensure a successful construction. Otherwise,
+    # we have to create a generic emtpy Index.
+    if (index_col is None or index_col is False) or index_names is None:
         index = Index([])
     else:
         data = [Series([], dtype=dtype[name]) for name in index_names]
         index = _ensure_index_from_sequences(data, names=index_names)
         index_col.sort()
+
         for i, n in enumerate(index_col):
             columns.pop(n - i)
 
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 2b7ff1f5a9879..b39122e5e7906 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -238,6 +238,21 @@ def test_csv_mixed_type(self):
         out = self.read_csv(StringIO(data))
         tm.assert_frame_equal(out, expected)
 
+    def test_read_csv_low_memory_no_rows_with_index(self):
+        if self.engine == "c" and not self.low_memory:
+            pytest.skip("This is a low-memory specific test")
+
+        # see gh-21141
+        data = """A,B,C
+1,1,1,2
+2,2,3,4
+3,3,4,5
+"""
+        out = self.read_csv(StringIO(data), low_memory=True,
+                            index_col=0, nrows=0)
+        expected = DataFrame(columns=["A", "B", "C"])
+        tm.assert_frame_equal(out, expected)
+
     def test_read_csv_dataframe(self):
         df = self.read_csv(self.csv1, index_col=0, parse_dates=True)
         df2 = self.read_table(self.csv1, sep=',', index_col=0,

From d44fddb12ff0ff3991dfaa81b52d8f63b0f3d308 Mon Sep 17 00:00:00 2001
From: Kalyan Gokhale <4734245+KalyanGokhale@users.noreply.github.com>
Date: Wed, 20 Jun 2018 16:03:07 +0530
Subject: [PATCH 15/55] REGR: Fixes first_valid_index when DataFrame or Series
 has duplicate row index (GH21441) (#21497)

(cherry picked from commit ec2020735d72ff73e0a6a607689281aad173c702)
---
 doc/source/whatsnew/v0.23.2.txt       |  3 ++-
 pandas/core/generic.py                | 23 +++++++++++------------
 pandas/tests/frame/test_timeseries.py | 15 ++++++++++++++-
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index f7c04ba9cfa9f..7d870fefba651 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -17,7 +17,8 @@ Fixed Regressions
 ~~~~~~~~~~~~~~~~~
 
 - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
--
+- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
+-
 
 .. _whatsnew_0232.performance:
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 9e4eda1bc4dc7..b03e598dcc52c 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -8969,18 +8969,17 @@ def _find_valid_index(self, how):
             is_valid = is_valid.any(1)  # reduce axis 1
 
         if how == 'first':
-            # First valid value case
-            i = is_valid.idxmax()
-            if not is_valid[i]:
-                return None
-            return i
-
-        elif how == 'last':
-            # Last valid value case
-            i = is_valid.values[::-1].argmax()
-            if not is_valid.iat[len(self) - i - 1]:
-                return None
-            return self.index[len(self) - i - 1]
+            idxpos = is_valid.values[::].argmax()
+
+        if how == 'last':
+            idxpos = len(self) - 1 - is_valid.values[::-1].argmax()
+
+        chk_notna = is_valid.iat[idxpos]
+        idx = self.index[idxpos]
+
+        if not chk_notna:
+            return None
+        return idx
 
     @Appender(_shared_docs['valid_index'] % {'position': 'first',
                                              'klass': 'NDFrame'})
diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py
index 90fbc6e628369..fb9bd74d9876d 100644
--- a/pandas/tests/frame/test_timeseries.py
+++ b/pandas/tests/frame/test_timeseries.py
@@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self):
         actual_series = ts.asfreq(freq='1S', fill_value=9.0)
         assert_series_equal(expected_series, actual_series)
 
-    def test_first_last_valid(self):
+    @pytest.mark.parametrize("data,idx,expected_first,expected_last", [
+        ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2),
+        ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2),
+        ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'),
+        ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2),
+        ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2),
+        ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)])
+    def test_first_last_valid(self, data, idx,
+                              expected_first, expected_last):
         N = len(self.frame.index)
         mat = randn(N)
         mat[:5] = nan
@@ -539,6 +547,11 @@ def test_first_last_valid(self):
         assert frame.first_valid_index().freq == frame.index.freq
         assert frame.last_valid_index().freq == frame.index.freq
 
+        # GH 21441
+        df = DataFrame(data, index=idx)
+        assert expected_first == df.first_valid_index()
+        assert expected_last == df.last_valid_index()
+
     def test_first_subset(self):
         ts = tm.makeTimeDataFrame(freq='12h')
         result = ts.first('10d')

From 172c5159ba7c1a1c0a398af4ee2ac77f00c1ef85 Mon Sep 17 00:00:00 2001
From: Michael Odintsov
Date: Thu, 21 Jun 2018 05:54:23 +0300
Subject: [PATCH 16/55] BUG: Fix group index calculation to prevent hitting
 maximum recursion depth (#21541)

(cherry picked from commit f91a7049d1730aa1924584a07a1265d9f57a2f35)
---
 doc/source/whatsnew/v0.23.2.txt      |  1 +
 pandas/core/sorting.py               | 29 ++++++++++++++++------------
 pandas/tests/frame/test_analytics.py | 17 ++++++++++++++++
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 7d870fefba651..a1b71ba5cbc43 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -58,6 +58,7 @@ Bug Fixes
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
+- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`).
 -
 
 **I/O**
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index e550976d1deeb..212f44e55c489 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -52,7 +52,21 @@ def _int64_cut_off(shape):
                 return i
         return len(shape)
 
-    def loop(labels, shape):
+    def maybe_lift(lab, size):
+        # promote nan values (assigned -1 label in lab array)
+        # so that all output values are non-negative
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    labels = list(labels)
+    shape = list(shape)
+
+    # Iteratively process all the labels in chunks sized so less
+    # than _INT64_MAX unique int ids will be required for each chunk
+    while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)
 
@@ -74,7 +88,7 @@ def loop(labels, shape):
             out[mask] = -1
 
         if nlev == len(shape):  # all levels done!
-            return out
+            break
 
         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
@@ -83,16 +97,7 @@ def loop(labels, shape):
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]
 
-        return loop(labels, shape)
-
-    def maybe_lift(lab, size):  # pormote nan values
-        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
-    labels = map(_ensure_int64, labels)
-    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    return loop(list(labels), list(shape))
+    return out
 
 
 def get_compressed_ids(labels, sizes):
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 90d7c46f7554f..4197339ff6e03 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -1507,6 +1507,23 @@ def test_duplicated_with_misspelled_column_name(self, subset):
         with pytest.raises(KeyError):
             df.drop_duplicates(subset)
 
+    @pytest.mark.slow
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
+        # Given the wide dataframe with a lot of columns
+        # with different (important!) values
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
+        df = pd.DataFrame(data).T
+        result = df.duplicated()
+
+        # Then duplicates produce the bool pd.Series as a result
+        # and don't fail during calculation.
+        # Actual values doesn't matter here, though usually
+        # it's all False in this case
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([

From a2199d2c01241d325bbff9474a94c47a8a7a4b82 Mon Sep 17 00:00:00 2001
From: alimcmaster1
Date: Thu, 21 Jun 2018 09:13:01 +0100
Subject: [PATCH 17/55] BUG: Fix passing empty label to df drop (#21515)

Closes #21494

(cherry picked from commit f4fba9e90f6a7e27af984acc77403139ef600d8f)
---
 doc/source/whatsnew/v0.23.2.txt               |   1 +
 pandas/core/generic.py                        |  21 ++--
 pandas/core/indexes/base.py                   |   4 +-
 pandas/core/indexes/multi.py                  |   1 -
 .../tests/frame/test_axis_select_reindex.py   |  15 +++
 .../tests/series/indexing/test_alter_index.py | 106 ++++++++++++------
 6 files changed, 98 insertions(+), 50 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index a1b71ba5cbc43..20d427335a47f 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -58,6 +58,7 @@ Bug Fixes
 
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
+- Bug in :meth:`DataFrame.drop` behaviour is not consistent for unique and non-unique indexes (:issue:`21494`)
 - Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`).
 -
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b03e598dcc52c..612ee7cb42021 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3129,7 +3129,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'):
         """
         axis = self._get_axis_number(axis)
         axis_name = self._get_axis_name(axis)
-        axis, axis_ = self._get_axis(axis), axis
+        axis = self._get_axis(axis)
 
         if axis.is_unique:
             if level is not None:
@@ -3138,24 +3138,25 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'):
                 new_axis = axis.drop(labels, level=level, errors=errors)
             else:
                 new_axis = axis.drop(labels, errors=errors)
-            dropped = self.reindex(**{axis_name: new_axis})
-            try:
-                dropped.axes[axis_].set_names(axis.names, inplace=True)
-            except AttributeError:
-                pass
-            result = dropped
+            result = self.reindex(**{axis_name: new_axis})
 
+        # Case for non-unique axis
         else:
             labels = _ensure_object(com._index_labels_to_array(labels))
             if level is not None:
                 if not isinstance(axis, MultiIndex):
                     raise AssertionError('axis must be a MultiIndex')
                 indexer = ~axis.get_level_values(level).isin(labels)
+
+                # GH 18561 MultiIndex.drop should raise if label is absent
+                if errors == 'raise' and indexer.all():
+                    raise KeyError('{} not found in axis'.format(labels))
             else:
                 indexer = ~axis.isin(labels)
-
-            if errors == 'raise' and indexer.all():
-                raise KeyError('{} not found in axis'.format(labels))
+                # Check if label doesn't exist along axis
+                labels_missing = (axis.get_indexer_for(labels) == -1).any()
+                if errors == 'raise' and labels_missing:
+                    raise KeyError('{} not found in axis'.format(labels))
 
             slicer = [slice(None)] * self.ndim
             slicer[self._get_axis_number(axis_name)] = indexer
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 4dacec6a93c68..59527afe6c1f7 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4392,7 +4392,7 @@ def drop(self, labels, errors='raise'):
 
         Raises
        ------
        KeyError
            If none of
the labels are found in the selected axis + If not all of the labels are found in the selected axis """ arr_dtype = 'object' if self.dtype == 'object' else None labels = com._index_labels_to_array(labels, dtype=arr_dtype) @@ -4401,7 +4401,7 @@ def drop(self, labels, errors='raise'): if mask.any(): if errors != 'ignore': raise KeyError( - 'labels %s not contained in axis' % labels[mask]) + '{} not found in axis'.format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c8332d762f7ef..80bf73cfe7dd3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1707,7 +1707,6 @@ def drop(self, labels, level=None, errors='raise'): if errors != 'ignore': raise ValueError('labels %s not contained in axis' % labels[mask]) - indexer = indexer[~mask] except Exception: pass diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 28e82f7585850..0e0d6598f5101 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -1151,3 +1151,18 @@ def test_raise_on_drop_duplicate_index(self, actual): expected_no_err = actual.T.drop('c', axis=1, level=level, errors='ignore') assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + frame = pd.DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH 21494 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 999ed5f26daee..2fdf198596ce2 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -463,54 +463,86 @@ def test_rename(): assert result.name == expected.name -def test_drop(): - # unique - s = Series([1, 2], index=['one', 'two']) - expected = Series([1], index=['one']) - result = s.drop(['two']) - assert_series_equal(result, expected) - result = s.drop('two', axis='rows') - assert_series_equal(result, expected) - - # non-unique - # GH 5248 - s = Series([1, 1, 2], index=['one', 'two', 'one']) - expected = Series([1, 2], index=['one', 'one']) - result = s.drop(['two'], axis=0) - assert_series_equal(result, expected) - result = s.drop('two') - assert_series_equal(result, expected) - - expected = Series([1], index=['two']) - result = s.drop(['one']) - assert_series_equal(result, expected) - result = s.drop('one') - assert_series_equal(result, expected) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, expected_data, expected_index', + [ + # Unique Index + ([1, 2], ['one', 'two'], ['two'], + 0, [1], ['one']), + ([1, 2], ['one', 'two'], ['two'], + 'rows', [1], ['one']), + ([1, 1, 2], ['one', 'two', 'one'], ['two'], + 0, [1, 2], ['one', 'one']), + + # GH 5248 Non-Unique Index + ([1, 1, 2], ['one', 'two', 'one'], 'two', + 0, [1, 2], ['one', 'one']), + ([1, 1, 2], ['one', 'two', 'one'], ['one'], + 0, [1], ['two']), + ([1, 1, 
2], ['one', 'two', 'one'], 'one', + 0, [1], ['two'])]) +def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels, + expected_data, expected_index): + + s = Series(data=data, index=index) + result = s.drop(drop_labels, axis=axis) + expected = Series(data=expected_data, index=expected_index) + tm.assert_series_equal(result, expected) - # single string/tuple-like - s = Series(range(3), index=list('abc')) - pytest.raises(KeyError, s.drop, 'bc') - pytest.raises(KeyError, s.drop, ('a',)) +@pytest.mark.parametrize( + 'data, index, drop_labels,' + ' axis, error_type, error_desc', + [ + # single string/tuple-like + (range(3), list('abc'), 'bc', + 0, KeyError, 'not found in axis'), + + # bad axis + (range(3), list('abc'), ('a',), + 0, KeyError, 'not found in axis'), + (range(3), list('abc'), 'one', + 'columns', ValueError, 'No axis named columns')]) +def test_drop_exception_raised(data, index, drop_labels, + axis, error_type, error_desc): + + with tm.assert_raises_regex(error_type, error_desc): + Series(data, index=index).drop(drop_labels, axis=axis) + + +def test_drop_with_ignore_errors(): # errors='ignore' s = Series(range(3), index=list('abc')) result = s.drop('bc', errors='ignore') - assert_series_equal(result, s) + tm.assert_series_equal(result, s) result = s.drop(['a', 'd'], errors='ignore') expected = s.iloc[1:] - assert_series_equal(result, expected) - - # bad axis - pytest.raises(ValueError, s.drop, 'one', axis='columns') + tm.assert_series_equal(result, expected) # GH 8522 s = Series([2, 3], index=[True, False]) assert s.index.is_object() result = s.drop(True) expected = Series([3], index=[False]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + - # GH 16877 - s = Series([2, 3], index=[0, 1]) - with tm.assert_raises_regex(KeyError, 'not contained in axis'): - s.drop([False, True]) +@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]]) +@pytest.mark.parametrize('drop_labels', [[], [1], [3]]) +def test_drop_empty_list(index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + series = pd.Series(index=index).drop(drop_labels) + tm.assert_series_equal(series, pd.Series(index=expected_index)) + + +@pytest.mark.parametrize('data, index, drop_labels', [ + (None, [1, 2, 3], [1, 4]), + (None, [1, 2, 2], [1, 4]), + ([2, 3], [0, 1], [False, True]) +]) +def test_drop_non_empty_list(data, index, drop_labels): + # GH 21494 and GH 16877 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.Series(data=data, index=index).drop(drop_labels) From 4b1a68776aa20bb2dc081bb77093adb6c47957f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 21 Jun 2018 03:18:53 -0700 Subject: [PATCH 18/55] fix hashing string-casting error (#21187) (cherry picked from commit e24da6c9f92d2b04ffb39a7fe0db85015af7ff3f) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/_libs/hashing.pyx | 7 ++----- pandas/tests/series/test_repr.py | 30 ++++++++++++++++++++++++++++++ pandas/util/testing.py | 22 ++++++++++++++++++++++ 4 files changed, 55 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 20d427335a47f..60376f416aeb7 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -80,6 +80,7 @@ Bug Fixes **Categorical** +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) - **Timezones** diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 
c6f182ac5003f..4489847518a1d 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -8,8 +8,7 @@ import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t from util cimport _checknull -from cpython cimport (PyString_Check, - PyBytes_Check, +from cpython cimport (PyBytes_Check, PyUnicode_Check) from libc.stdlib cimport malloc, free @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): cdef list datas = [] for i in range(n): val = arr[i] - if PyString_Check(val): - data = val.encode(encoding) - elif PyBytes_Check(val): + if PyBytes_Check(val): data = val elif PyUnicode_Check(val): data = val.encode(encoding) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 97236f028b1c4..730c2b7865f1f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -11,6 +11,7 @@ from pandas import (Index, Series, DataFrame, date_range, option_context, Categorical, period_range, timedelta_range) from pandas.core.index import MultiIndex +from pandas.core.base import StringMixin from pandas.compat import lrange, range, u from pandas import compat @@ -202,6 +203,35 @@ def test_latex_repr(self): class TestCategoricalRepr(object): + def test_categorical_repr_unicode(self): + # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', + # and we are working in PY2, then rendering a Categorical could raise + # UnicodeDecodeError by trying to decode when it shouldn't + + class County(StringMixin): + name = u'San Sebastián' + state = u'PR' + + def __unicode__(self): + return self.name + u', ' + self.state + + cat = pd.Categorical([County() for n in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + if compat.PY3: + # no reloading of sys, just check that the default (utf8) works + # as expected + repr(ser) + str(ser) + + else: + # set sys.defaultencoding to ascii, then change it back after + # the test + with tm.set_defaultencoding('ascii'): + repr(ser) + str(ser) + def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 233eba6490937..6384eca9849f6 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize): # Stdout / stderr decorators +@contextmanager +def set_defaultencoding(encoding): + """ + Set default encoding (as given by sys.getdefaultencoding()) to the given + encoding; restore on exit. 
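+
+    Only usable under Python 2; on Python 3 this raises ``ValueError``.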
+ + Parameters + ---------- + encoding : str + """ + if not PY2: + raise ValueError("set_defaultencoding context is only available " + "in Python 2.") + orig = sys.getdefaultencoding() + reload(sys) # noqa:F821 + sys.setdefaultencoding(encoding) + try: + yield + finally: + sys.setdefaultencoding(orig) + + def capture_stdout(f): """ Decorator to capture stdout in a buffer so that it can be checked From 2d2f6aa9e368e3d97d8a8d24a802357e4ac3a919 Mon Sep 17 00:00:00 2001 From: Jacopo Rota Date: Sat, 23 Jun 2018 01:04:38 +0200 Subject: [PATCH 19/55] add test case when to_csv argument is sys.stdout (#21572) (cherry picked from commit 66fea91e915ca5e3f096055f3ad0f07335483e3f) --- pandas/tests/io/formats/test_to_csv.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index dfa3751bff57a..36c4ae547ad4e 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,3 +285,18 @@ def test_to_csv_string_array_utf8(self): df.to_csv(path, encoding='utf-8') with open(path, 'r') as f: assert f.read() == expected_utf8 + + @tm.capture_stdout + def test_to_csv_stdout_file(self): + # GH 21561 + df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], + columns=['name_1', 'name_2']) + expected_ascii = '''\ +,name_1,name_2 +0,foo,bar +1,baz,qux +''' + df.to_csv(sys.stdout, encoding='ascii') + output = sys.stdout.getvalue() + assert output == expected_ascii + assert not sys.stdout.closed From cf0a55f86eb73782d0d76cc9208ca56d374c9a5e Mon Sep 17 00:00:00 2001 From: Vu Le Date: Sat, 23 Jun 2018 06:07:21 +0700 Subject: [PATCH 20/55] BUG: Fix json_normalize throwing TypeError (#21536) (#21540) (cherry picked from commit 5fdaa9717f7550c5293d421205bfa19011278396) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/io/json/normalize.py | 8 +++++++- pandas/tests/io/json/test_normalize.py | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 60376f416aeb7..53ca4c0d1c144 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -65,7 +65,7 @@ Bug Fixes **I/O** - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) -- +- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) - **Plotting** diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index b845a43b9ca9e..2004a24c2ec5a 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -170,6 +170,11 @@ def json_normalize(data, record_path=None, meta=None, 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH + >>> data = {'A': [1, 2]} + >>> json_normalize(data, 'A', record_prefix='Prefix.') + Prefix.0 + 0 1 + 1 2 """ def _pull_field(js, spec): result = js @@ -259,7 +264,8 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result.rename(columns=lambda x: record_prefix + x, inplace=True) + result = result.rename( + columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 395c2c90767d3..200a853c48900 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py 
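As a hedged sketch of the user-facing behaviour the hunk below asserts (the
data and prefix are illustrative):

    from pandas.io.json import json_normalize

    # record_prefix is now applied with str.format, so integer column
    # labels no longer raise TypeError when a prefix is supplied
    result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
    # -> a single column named 'Prefix.0' holding the values 1 and 2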
@@ -123,6 +123,12 @@ def test_simple_normalize_with_separator(self, deep_nested): 'country', 'states_name']).sort_values() assert result.columns.sort_values().equals(expected) + def test_value_array_record_prefix(self): + # GH 21536 + result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.') + expected = DataFrame([[1], [2]], columns=['Prefix.0']) + tm.assert_frame_equal(result, expected) + def test_more_deeply_nested(self, deep_nested): result = json_normalize(deep_nested, ['states', 'cities'], From 176695fde32e872478d303ab21965bd49416aae4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 13:48:59 +0200 Subject: [PATCH 21/55] Remove incorrectly added TestMode class The tests were incorrectly added from https://github.com/pandas-dev/pandas/commit/f1631bec96dd9a1dc4890677b9c5475d0677e102#diff-dc347bc3d0448ea297bed67dc7ff3437 when fixing merge conflicts during cherry-picking --- pandas/tests/series/test_analytics.py | 174 -------------------------- 1 file changed, 174 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 7a78b562ac1fa..1e6ea96a5de51 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1866,180 +1866,6 @@ def s_main_dtypes(): return df -class TestMode(object): - - @pytest.mark.parametrize('dropna, expected', [ - (True, Series([], dtype=np.float64)), - (False, Series([], dtype=np.float64)) - ]) - def test_mode_empty(self, dropna, expected): - s = Series([], dtype=np.float64) - result = s.mode(dropna) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, data, expected', [ - (True, [1, 1, 1, 2], [1]), - (True, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - (False, [1, 1, 1, 2], [1]), - (False, [1, 1, 1, 2, 3, 3, 3], [1, 3]), - ]) - @pytest.mark.parametrize( - 'dt', - list(np.typecodes['AllInteger'] + np.typecodes['Float']) - ) - def test_mode_numerical(self, dropna, data, expected, dt): - s = Series(data, dtype=dt) - result = s.mode(dropna) - expected = Series(expected, dtype=dt) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected', [ - (True, [1.0]), - (False, [1, np.nan]), - ]) - def test_mode_numerical_nan(self, dropna, expected): - s = Series([1, 1, 2, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, ['b'], ['bar'], ['nan']), - (False, ['b'], [np.nan], ['nan']) - ]) - def test_mode_str_obj(self, dropna, expected1, expected2, expected3): - # Test string and object types. 
- data = ['a'] * 2 + ['b'] * 3 - - s = Series(data, dtype='c') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='c') - tm.assert_series_equal(result, expected1) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected2) - - data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan] - - s = Series(data, dtype=object).astype(str) - result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['foo'], ['foo']), - (False, ['foo'], [np.nan]) - ]) - def test_mode_mixeddtype(self, dropna, expected1, expected2): - s = Series([1, 'foo', 'foo']) - result = s.mode(dropna) - expected = Series(expected1) - tm.assert_series_equal(result, expected) - - s = Series([1, 'foo', 'foo', np.nan, np.nan, np.nan]) - result = s.mode(dropna) - expected = Series(expected2, dtype=object) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['1900-05-03', '2011-01-03', '2013-01-02'], - ['2011-01-03', '2013-01-02']), - (False, [np.nan], [np.nan, '2011-01-03', '2013-01-02']), - ]) - def test_mode_datetime(self, dropna, expected1, expected2): - s = Series(['2011-01-03', '2013-01-02', - '1900-05-03', 'nan', 'nan'], dtype='M8[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='M8[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['2011-01-03', '2013-01-02', '1900-05-03', - '2011-01-03', '2013-01-02', 'nan', 'nan'], - dtype='M8[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='M8[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, ['-1 days', '0 days', '1 days'], ['2 min', '1 day']), - (False, [np.nan], [np.nan, '2 min', '1 day']), - ]) - def test_mode_timedelta(self, dropna, expected1, expected2): - # gh-5986: Test timedelta types. 
- - s = Series(['1 days', '-1 days', '0 days', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected1 = Series(expected1, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected1) - - s = Series(['1 day', '1 day', '-1 day', '-1 day 2 min', - '2 min', '2 min', 'nan', 'nan'], - dtype='timedelta64[ns]') - result = s.mode(dropna) - expected2 = Series(expected2, dtype='timedelta64[ns]') - tm.assert_series_equal(result, expected2) - - @pytest.mark.parametrize('dropna, expected1, expected2, expected3', [ - (True, Categorical([1, 2], categories=[1, 2]), - Categorical(['a'], categories=[1, 'a']), - Categorical([3, 1], categories=[3, 2, 1], ordered=True)), - (False, Categorical([np.nan], categories=[1, 2]), - Categorical([np.nan, 'a'], categories=[1, 'a']), - Categorical([np.nan, 3, 1], categories=[3, 2, 1], ordered=True)), - ]) - def test_mode_category(self, dropna, expected1, expected2, expected3): - s = Series(Categorical([1, 2, np.nan, np.nan])) - result = s.mode(dropna) - expected1 = Series(expected1, dtype='category') - tm.assert_series_equal(result, expected1) - - s = Series(Categorical([1, 'a', 'a', np.nan, np.nan])) - result = s.mode(dropna) - expected2 = Series(expected2, dtype='category') - tm.assert_series_equal(result, expected2) - - s = Series(Categorical([1, 1, 2, 3, 3, np.nan, np.nan], - categories=[3, 2, 1], ordered=True)) - result = s.mode(dropna) - expected3 = Series(expected3, dtype='category') - tm.assert_series_equal(result, expected3) - - @pytest.mark.parametrize('dropna, expected1, expected2', [ - (True, [2**63], [1, 2**63]), - (False, [2**63], [1, 2**63]) - ]) - def test_mode_intoverflow(self, dropna, expected1, expected2): - # Test for uint64 overflow. - s = Series([1, 2**63, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected1 = Series(expected1, dtype=np.uint64) - tm.assert_series_equal(result, expected1) - - s = Series([1, 2**63], dtype=np.uint64) - result = s.mode(dropna) - expected2 = Series(expected2, dtype=np.uint64) - tm.assert_series_equal(result, expected2) - - @pytest.mark.skipif(not compat.PY3, reason="only PY3") - def test_mode_sortwarning(self): - # Check for the warning that is raised when the mode - # results cannot be sorted - - expected = Series(['foo', np.nan]) - s = Series([1, 'foo', 'foo', np.nan, np.nan]) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - result = s.mode(dropna=False) - result = result.sort_values().reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - def assert_check_nselect_boundary(vals, dtype, method): # helper function for 'test_boundary_{dtype}' tests s = Series(vals, dtype=dtype) From 8c7996d2211a95cf67ff2d465dd3c1517b90a310 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 17:25:33 +0200 Subject: [PATCH 22/55] DOC: fix spaces in 0.23.1 whatsnew file Take from https://github.com/pandas-dev/pandas/commit/e92b78603e1404e49d6bcb19873d2d24225a8e50 (could not be cherry-picked in its totality) --- doc/source/whatsnew/v0.23.1.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index db25bcf8113f5..a52ba22cf36d2 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -97,8 +97,8 @@ Bug Fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`) -- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely 
round the float precision (:issue: `14156`) +- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) **Sparse** @@ -110,12 +110,12 @@ Bug Fixes - Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`) - Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) -- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) **Plotting** -- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue: `20968`) +- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue:`20968`) **I/O** From d0f664a20d581919b6d5d6efef9704e540b013b8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Jun 2018 09:57:43 -0500 Subject: [PATCH 23/55] CI: Test against Python 3.7 (#21604) (cherry picked from commit 7829ad3290dc6894d24c1c853ffc4dabef50294a) --- .travis.yml | 5 +++++ ci/travis-37.yaml | 14 ++++++++++++++ doc/source/install.rst | 2 +- doc/source/whatsnew/v0.23.2.txt | 6 ++++++ pandas/compat/__init__.py | 9 +++++---- pandas/tests/tseries/offsets/test_offsets.py | 10 ++++++++-- setup.py | 1 + 7 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 ci/travis-37.yaml diff --git a/.travis.yml b/.travis.yml index 4e25380a7d941..2d2a0bc019c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,11 @@ matrix: language: generic env: - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty + env: + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 0000000000000..8b255c9e6ec72 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,14 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist diff --git a/doc/source/install.rst b/doc/source/install.rst index 6054be112f52c..846170f9f0fa5 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -43,7 +43,7 @@ For more information, see the `Python 3 statement`_ and the `Porting to Python 3 Python version support ---------------------- -Officially 
Python 2.7, 3.5, and 3.6.
+Officially Python 2.7, 3.5, 3.6, and 3.7.
 
 Installing pandas
 -----------------
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
index 53ca4c0d1c144..5d196c4fe8d15 100644
--- a/doc/source/whatsnew/v0.23.2.txt
+++ b/doc/source/whatsnew/v0.23.2.txt
@@ -6,6 +6,12 @@ v0.23.2
 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes
 and bug fixes. We recommend that all users upgrade to this version.
 
+.. note::
+
+   Pandas 0.23.2 is the first pandas release that's compatible with
+   Python 3.7 (:issue:`20552`)
+
+
 .. contents:: What's new in v0.23.2
:local: :backlinks: none +.. _whatsnew_0232.enhancements: + +Logical Reductions over Entire DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) + df.all(axis=None) + + +This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. +With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis: + +.. code-block:: python + + >>> # NumPy 1.15, pandas 0.23.1 + >>> np.any(pd.DataFrame({"A": [False], "B": [False]})) + A False + B False + dtype: bool + +With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. + +.. ipython:: python + + np.any(pd.DataFrame({"A": [False], "B": [False]})) + .. _whatsnew_0232.fixed_regressions: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9f6e834f0a25f..2a40dd28a6fd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6844,13 +6844,18 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - axis = self._get_axis_number(axis) + if axis is None and filter_type == 'bool': + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - labels = self._get_agg_axis(axis) - # exclude timedelta/datetime unless we are uniform types if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: numeric_only = True @@ -6859,6 +6864,13 @@ def f(x): try: values = self.values result = f(values) + + if (filter_type == 'bool' and is_object_dtype(values) and + axis is None): + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) except Exception as e: # try by-column first @@ -6925,7 +6937,9 @@ def f(x): if axis == 0: result = coerce_to_dtypes(result, self.dtypes) - return Series(result, index=labels) + if constructor is not None: + result = Series(result, index=labels) + return result def nunique(self, axis=0, dropna=True): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 612ee7cb42021..50a5c10a6865f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8729,6 +8729,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) @@ -9055,8 +9057,15 @@ def _doc_parms(cls): Parameters ---------- -axis : int, default 0 - Select the axis which can be 0 for indices and 1 for columns. +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. 
+ skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -9078,9 +9087,9 @@ def _doc_parms(cls): %(examples)s""" _all_doc = """\ -Return whether all elements are True over series or dataframe axis. +Return whether all elements are True, potentially over an axis. -Returns True if all elements within a series or along a dataframe +Returns True if all elements within a series or along a Dataframe axis are non-zero, not-empty or not-False.""" _all_examples = """\ @@ -9093,7 +9102,7 @@ def _doc_parms(cls): >>> pd.Series([True, False]).all() False -Dataframes +DataFrames Create a dataframe from a dictionary. @@ -9110,12 +9119,17 @@ def _doc_parms(cls): col2 False dtype: bool -Adding axis=1 argument will check if row-wise values all return True. +Specify ``axis='columns'`` to check if row-wise values all return True. ->>> df.all(axis=1) +>>> df.all(axis='columns') 0 True 1 False dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False """ _all_see_also = """\ @@ -9481,6 +9495,11 @@ def _doc_parms(cls): 1 False dtype: bool +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + `any` for an empty DataFrame is an empty Series. >>> pd.DataFrame([]).any() @@ -9651,22 +9670,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) - def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number if level is not None: if bool_only is not None: raise NotImplementedError("Option bool_only is not " "implemented with option level.") return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce(f, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool', - name=name) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') return set_function_name(logical_func, name, cls) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 16e64192fdb20..bad0dd79aaedd 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) + if axis is None and filter_type == 'bool': + # labels = None + # constructor = None + axis_number = None + axis_name = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + # labels = self._get_agg_axis(axis) + # constructor = self._constructor + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) with np.errstate(all='ignore'): result = f(self.values) + if axis is None and filter_type == 'bool': + return np.bool_(result) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: result = result.T diff --git a/pandas/core/series.py b/pandas/core/series.py index 6975dd8fc918e..6b005c673c7cd 100644 --- 
a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3212,7 +3212,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' is consistent with Series's single axis. - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4197339ff6e03..437d3a9d24730 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -15,7 +15,7 @@ from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15) + _np_version_under1p12) import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms @@ -1139,11 +1139,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) - df = DataFrame(randn(10, 4)) > 0 - df.any(1) - df.all(1) - df.any(1, bool_only=True) - df.all(1, bool_only=True) + def test_any_all_extra(self): + df = DataFrame({ + 'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, True, True], + }, index=['a', 'b', 'c']) + result = df[['A', 'B']].any(1) + expected = Series([True, True, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df[['A', 'B']].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result = df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[['C']].all(axis=None).item() + assert result is True # skip pathological failure cases # class CantNonzero(object): @@ -1165,6 +1189,86 @@ def test_any_all(self): # df.any(1, bool_only=True) # df.all(1, bool_only=True) + @pytest.mark.parametrize('func, data, expected', [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {'A': []}, False), + (np.all, {'A': []}, True), + (np.any, {'A': [False, False]}, False), + (np.all, {'A': [False, False]}, False), + (np.any, {'A': [True, False]}, True), + (np.all, {'A': [True, False]}, False), + (np.any, {'A': [True, True]}, True), + (np.all, {'A': [True, True]}, True), + + (np.any, {'A': [False], 'B': [False]}, False), + (np.all, {'A': [False], 'B': [False]}, False), + + (np.any, {'A': [False, False], 'B': [False, True]}, True), + (np.all, {'A': [False, False], 'B': [False, True]}, False), + + # other types + (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), + (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), + (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), + (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], 
dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), + (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), + (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), + (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), + + # # Mix + # GH-21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ]) + def test_any_all_np_func(self, func, data, expected): + # https://github.com/pandas-dev/pandas/issues/19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # https://github.com/pandas-dev/pandas/issues/19976 + result = np.all(DataFrame(columns=['a', 'b'])).item() + assert result is True + + result = np.any(DataFrame(columns=['a', 'b'])).item() + assert result is False + + @pytest.mark.parametrize('method', ['any', 'all']) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], + names=['out', 'in']) + ) + xpr = "Must specify 'axis' when aggregating by level." 
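+        # aggregating by level requires a concrete axis, so axis=None must raise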
+ with tm.assert_raises_regex(ValueError, xpr): + getattr(df, method)(axis=None, level='out') + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: @@ -2071,9 +2175,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) - @pytest.mark.xfail( - not _np_version_under1p15, - reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 7973b27601237..128ab0572ba55 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2717,3 +2717,10 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)], names=['time', 'panel']) tm.assert_index_equal(index, expected) + + +def test_panel_np_all(): + with catch_warnings(record=True): + wp = Panel({"A": DataFrame({'b': [1, 2]})}) + result = np.all(wp) + assert result == np.bool_(True) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 89d90258f58e0..27c24e3a68079 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -30,6 +30,7 @@ def test_foo(): from pandas.compat import (is_platform_windows, is_platform_32bit, PY3, import_lzma) +from pandas.compat.numpy import _np_version_under1p15 from pandas.core.computation.expressions import (_USE_NUMEXPR, _NUMEXPR_INSTALLED) @@ -160,6 +161,9 @@ def decorated_func(func): skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(), reason="Missing matplotlib dependency") + +skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15, + reason="NumPy 1.15 or greater required") skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(), From 01bb92127abd4a23005e780eb1e9b09cacfbb748 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 26 Jun 2018 08:26:21 -0400 Subject: [PATCH 25/55] TST: xfail flaky 3.7 test, xref #21636 (#21637) (cherry picked from commit dbd102c863adb36d07b999c2fc26403717c4bc32) --- pandas/tests/groupby/test_categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..0fec6a8f96a24 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.compat import PY37 from pandas import (Index, MultiIndex, CategoricalIndex, DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -205,6 +206,7 @@ def test_level_get_group(observed): assert_frame_equal(result, expected) +@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636") @pytest.mark.parametrize('ordered', [True, False]) def test_apply(ordered): # GH 10138 From 417e87372831c4c5f906a99e19227e1d5ab7d2b3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Jun 2018 10:02:17 -0500 Subject: [PATCH 26/55] PKG: Exclude data test files. 
(#19535) (cherry picked from commit 36422a88474396148bd7d5d38aa238ea844d9555) --- MANIFEST.in | 34 ++++--- ci/script_single.sh | 8 +- doc/source/whatsnew/v0.23.2.txt | 5 + pandas/conftest.py | 42 +++++++++ pandas/tests/indexes/test_multi.py | 8 +- pandas/tests/io/conftest.py | 21 ++--- pandas/tests/io/formats/test_format.py | 4 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 8 +- pandas/tests/io/parser/common.py | 25 +++-- pandas/tests/io/parser/compression.py | 4 +- pandas/tests/io/parser/dtypes.py | 6 +- pandas/tests/io/parser/test_network.py | 53 +++++------ pandas/tests/io/parser/test_parsers.py | 6 +- pandas/tests/io/parser/test_textreader.py | 5 +- pandas/tests/io/sas/test_sas7bdat.py | 43 ++++----- pandas/tests/io/sas/test_xport.py | 6 +- pandas/tests/io/test_common.py | 54 +++++------ pandas/tests/io/test_excel.py | 12 +-- pandas/tests/io/test_html.py | 92 +++++++++++-------- pandas/tests/io/test_packers.py | 51 +++++----- pandas/tests/io/test_pickle.py | 38 ++++---- pandas/tests/io/test_pytables.py | 23 +++-- pandas/tests/io/test_sql.py | 63 +++++++------ pandas/tests/io/test_stata.py | 9 +- pandas/tests/plotting/common.py | 5 - pandas/tests/plotting/test_deprecated.py | 5 +- pandas/tests/plotting/test_misc.py | 16 ++-- pandas/tests/reshape/merge/test_merge_asof.py | 33 +++---- pandas/tests/reshape/test_tile.py | 6 +- pandas/tests/tseries/offsets/test_offsets.py | 16 ++-- pandas/tests/util/test_testing.py | 13 +++ pandas/util/_test_decorators.py | 1 - pandas/util/testing.py | 10 -- setup.cfg | 3 +- setup.py | 6 +- 36 files changed, 393 insertions(+), 347 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e0..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/ci/script_single.sh b/ci/script_single.sh index f376c920ac71b..60e2fbb33ee5d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -25,12 +25,12 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas 
--cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index f5a520216b2be..b3da4d1c4e288 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -70,6 +70,11 @@ Documentation Changes - - +Build Changes +------------- + +- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) + .. _whatsnew_0232.bug_fixes: Bug Fixes diff --git a/pandas/conftest.py b/pandas/conftest.py index 9d806a91f37f7..ead357747666d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,5 +1,8 @@ +import os + import pytest +import pandas import numpy as np import pandas as pd from pandas.compat import PY3 @@ -15,6 +18,8 @@ def pytest_addoption(parser): help="run high memory tests") parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption("--strict-data-files", action="store_true", + help="Fail if a test is skipped for missing data file.") def pytest_runtest_setup(item): @@ -129,6 +134,43 @@ def join_type(request): return request.param +@pytest.fixture +def datapath(request): + """Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path : path including ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + def deco(*args): + path = os.path.join('pandas', 'tests', *args) + if not os.path.exists(path): + if request.config.getoption("--strict-data-files"): + msg = "Could not find file {} and --strict-data-files is set." + raise ValueError(msg.format(path)) + else: + msg = "Could not find {}." 
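+                # without --strict-data-files, a missing data file skips
+                # the test rather than failing it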
+ pytest.skip(msg.format(path)) + return path + return deco + + +@pytest.fixture +def iris(datapath): + """The iris dataset as a DataFrame.""" + return pandas.read_csv(datapath('data', 'iris.csv')) + + @pytest.fixture(params=['nlargest', 'nsmallest']) def nselect_method(request): """ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index df506ae9486ee..3ede83b5969ce 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1181,12 +1181,12 @@ def test_iter(self): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] assert result == expected - def test_legacy_pickle(self): + def test_legacy_pickle(self, datapath): if PY3: pytest.skip("testing for legacy pickles not " "support on py3") - path = tm.get_data_path('multiindex_v1.pickle') + path = datapath('indexes', 'data', 'multiindex_v1.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) @@ -1202,10 +1202,10 @@ def test_legacy_pickle(self): assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) - def test_legacy_v2_unpickle(self): + def test_legacy_v2_unpickle(self, datapath): # 0.7.3 -> 0.8.0 format manage - path = tm.get_data_path('mindex_073.pickle') + path = datapath('indexes', 'data', 'mindex_073.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8deb51e190bab..7623587803b41 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,32 +1,23 @@ -import os - import pytest from pandas.io.parsers import read_table -from pandas.util import testing as tm - - -@pytest.fixture -def parser_data(request): - return os.path.join(tm.get_data_path(), '..', 'parser', 'data') @pytest.fixture -def tips_file(parser_data): +def tips_file(datapath): """Path to the tips dataset""" - return os.path.join(parser_data, 'tips.csv') + return datapath('io', 'parser', 'data', 'tips.csv') @pytest.fixture -def jsonl_file(parser_data): +def jsonl_file(datapath): """Path a JSONL dataset""" - return os.path.join(parser_data, 'items.jsonl') + return datapath('io', 'parser', 'data', 'items.jsonl') @pytest.fixture -def salaries_table(parser_data): +def salaries_table(datapath): """DataFrame with the salaries dataset""" - path = os.path.join(parser_data, 'salaries.csv') - return read_table(path) + return read_table(datapath('io', 'parser', 'data', 'salaries.csv')) @pytest.fixture diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f221df93dd412..63b7cb3459069 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) - def test_string_repr_encoding(self): - filepath = tm.get_data_path('unicode_series.csv') + def test_string_repr_encoding(self, datapath): + filepath = datapath('io', 'formats', 'data', 'unicode_series.csv') df = pd.read_csv(filepath, header=None, encoding='latin1') repr(df) repr(df[1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..05ceace20f5a4 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_read_zipped_json(): - uncompressed_path = 
tm.get_data_path("tsframe_v012.json") +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) - compressed_path = tm.get_data_path("tsframe_v012.json.zip") + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") compressed_df = pd.read_json(compressed_path, compression='zip') assert_frame_equal(uncompressed_df, compressed_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7e497c395266f..bcbac4400c953 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -37,8 +37,9 @@ class TestPandasContainer(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(scope="function", autouse=True) + def setup(self, datapath): + self.dirpath = datapath("io", "json", "data") self.ts = tm.makeTimeSeries() self.ts.name = 'ts' @@ -59,7 +60,8 @@ def setup_method(self, method): self.mixed_frame = _mixed_frame.copy() self.categorical = _cat_frame.copy() - def teardown_method(self, method): + yield + del self.dirpath del self.ts diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index b39122e5e7906..fb510f1a74556 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -77,7 +77,7 @@ def test_read_csv(self): else: prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) + fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self): tm.assert_frame_equal(df, expected) @tm.network - def test_url(self): + def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow - def test_file(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + def test_file(self, datapath): + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self): tm.assert_frame_equal(result, expected) - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') + def test_utf16_example(self, datapath): + path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! 
and is the right length result = self.read_table(path, encoding='utf-16') @@ -767,8 +765,8 @@ def test_utf16_example(self): result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') + def test_unicode_encoding(self, datapath): + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) - def test_sub_character(self): + def test_sub_character(self, datapath): # see gh-16893 - dirpath = tm.get_data_path() - filename = os.path.join(dirpath, "sub_char.csv") + filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index e84db66561c49..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self): tm.assert_frame_equal(expected, df) - def test_read_csv_compressed_utf16_example(self): + def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 - path = tm.get_data_path('utf16_ex_small.zip') + path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') result = self.read_csv(path, encoding='utf-16', compression='zip', sep='\t') diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index b91ce04673e29..8060ebf2fbcd4 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -125,9 +125,9 @@ def test_categorical_dtype_high_cardinality_numeric(self): np.sort(actual.a.cat.categories), ordered=True) tm.assert_frame_equal(actual, expected) - def test_categorical_dtype_encoding(self): + def test_categorical_dtype_encoding(self, datapath): # GH 10153 - pth = tm.get_data_path('unicode_series.csv') + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') encoding = 'latin-1' expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) @@ -135,7 +135,7 @@ def test_categorical_dtype_encoding(self): dtype={1: 'category'}) tm.assert_frame_equal(actual, expected) - pth = tm.get_data_path('utf16_ex.txt') + pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') encoding = 'utf-16' expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index fdf45f307e953..e2243b8087a5b 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -48,10 +48,16 @@ def check_compressed_urls(salaries_table, compression, extension, mode, tm.assert_frame_equal(url_table, salaries_table) +@pytest.fixture +def tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + + @pytest.mark.usefixtures("s3_resource") class TestS3(object): - def test_parse_public_s3_bucket(self): + def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') # more of an integration test due to the not-public contents portion # can probably mock this though. 
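
The comment above notes that these S3 reads "can probably mock this
though". A minimal sketch of that idea, assuming the third-party moto and
boto3 packages are available; the bucket name and CSV body here are
hypothetical illustrations, not part of this patch:

    import boto3
    import pandas as pd
    from moto import mock_s3

    @mock_s3
    def test_parse_mocked_s3_bucket():
        # moto patches botocore in-process: the bucket below exists only in
        # memory, so no network access or AWS credentials are needed.
        client = boto3.client('s3', region_name='us-east-1')
        client.create_bucket(Bucket='pandas-test')
        client.put_object(Bucket='pandas-test', Key='tips.csv',
                          Body=b'total_bill,tip\n16.99,1.01\n10.34,1.66\n')
        df = pd.read_csv('s3://pandas-test/tips.csv')  # requires s3fs
        assert not df.empty
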
@@ -60,45 +66,40 @@ def test_parse_public_s3_bucket(self): ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self): + def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self): + def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self): + def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self): + def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp) @@ -109,14 +110,13 @@ def test_parse_public_s3_bucket_chunked(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp, @@ -127,36 +127,33 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, tips_df): for ext in ['', '.gz', '.bz2']: df = 
read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 7717102b64fc5..b6f13039641a2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import pytest import pandas.util.testing as tm from pandas import read_csv, read_table, DataFrame @@ -45,8 +46,9 @@ def read_table(self, *args, **kwargs): def float_precision_choices(self): raise com.AbstractMethodError(self) - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e8d9d8b52164b..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -28,8 +28,9 @@ class TestTextReader(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b80263021c269..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -11,8 +11,9 @@ class TestSAS7BDAT(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: @@ -123,9 +124,8 @@ def test_iterator_read_too_much(self): rdr.close() -def test_encoding_options(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test1.sas7bdat") +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: @@ -143,43 +143,39 @@ def test_encoding_options(): assert(x == y.decode()) -def test_productsales(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "productsales.sas7bdat") +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') - fname = os.path.join(dirpath, "productsales.csv") + fname = datapath("io", "sas", "data", 
"productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) -def test_12659(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test_12659.sas7bdat") +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "test_12659.csv") + fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) -def test_airline(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "airline.sas7bdat") +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "airline.csv") + fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) -def test_date_time(): +def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "datetime.sas7bdat") + fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "datetime.csv") + fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) # GH 19732: Timestamps imported from sas will incur floating point errors @@ -187,9 +183,8 @@ def test_date_time(): tm.assert_frame_equal(df, df0) -def test_zero_variables(): +def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "zero_variables.sas7bdat") + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index de31c3e36a8d5..6e5b2ab067aa5 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,3 +1,4 @@ +import pytest import pandas as pd import pandas.util.testing as tm from pandas.io.sas.sasreader import read_sas @@ -18,8 +19,9 @@ def numeric_as_float(data): class TestXport(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a89156db38ae3..5c9739be73393 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -149,27 +149,22 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): reader(path) @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - 
(pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), + (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), + (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), + (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), + (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), + (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) - def test_read_fspath_all(self, reader, module, path): + def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) + path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) @@ -232,13 +227,14 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) -class TestMMapWrapper(object): +@pytest.fixture +def mmap_file(datapath): + return datapath('io', 'data', 'test_mmap.csv') + - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') +class TestMMapWrapper(object): - def test_constructor_bad_file(self): + def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 @@ -252,15 +248,15 @@ def test_constructor_bad_file(self): tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - target = open(self.mmap_file, 'r') + target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: + def test_get_attr(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -273,8 +269,8 @@ def test_get_attr(self): assert not hasattr(wrapper, 'foo') - def test_next(self): - with open(self.mmap_file, 'r') as target: + def test_next(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 05423474f330a..4e2b2af0ebfe7 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -39,8 +39,9 @@ @td.skip_if_no('xlrd', '0.9') class SharedItems(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -49,7 +50,6 @@ def setup_method(self, method): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. 
- Test data path is defined by pandas.util.testing.get_data_path() Parameters ---------- @@ -68,8 +68,7 @@ def get_csv_refdf(self, basename): def get_excelfile(self, basename, ext): """ - Return test data ExcelFile instance. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data ExcelFile instance. Parameters ---------- @@ -86,8 +85,7 @@ def get_excelfile(self, basename, ext): def get_exceldf(self, basename, ext, *args, **kwds): """ - Return test data DataFrame. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data DataFrame. Parameters ---------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a56946b82b027..9c6a8de7ed446 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1,6 +1,5 @@ from __future__ import print_function -import glob import os import re import threading @@ -25,8 +24,18 @@ import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network +HERE = os.path.dirname(__file__) -DATA_PATH = tm.get_data_path() + +@pytest.fixture(params=[ + 'chinese_utf-16.html', + 'chinese_utf-32.html', + 'chinese_utf-8.html', + 'letz_latin1.html', +]) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -44,11 +53,11 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): @td.skip_if_no('bs4') -def test_bs4_version_fails(monkeypatch): +def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): - read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): @@ -59,8 +68,8 @@ def test_invalid_flavor(): @td.skip_if_no('bs4') @td.skip_if_no('lxml') -def test_same_ordering(): - filename = os.path.join(DATA_PATH, 'valid_markup.html') +def test_same_ordering(datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -72,11 +81,14 @@ def test_same_ordering(): pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): - spam_data = os.path.join(DATA_PATH, 'spam.html') - spam_data_kwargs = {} - if PY3: - spam_data_kwargs['encoding'] = 'UTF-8' - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data_kwargs = {} + if PY3: + self.spam_data_kwargs['encoding'] = 'UTF-8' + self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -272,7 +284,8 @@ def test_invalid_url(self): @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), 'First', + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), + 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: @@ -326,7 +339,7 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = 
self.read_html(file_path_to_url(url), + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) @@ -352,9 +365,9 @@ def test_python_docs_table(self): assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow - def test_thousands_macau_stats(self): + def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] @@ -362,9 +375,9 @@ def test_thousands_macau_stats(self): assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow - def test_thousands_macau_index_col(self): + def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -518,8 +531,8 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) - def test_nyse_wsj_commas_table(self): - data = os.path.join(DATA_PATH, 'nyse_wsj.html') + def test_nyse_wsj_commas_table(self, datapath): + data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] @@ -530,7 +543,7 @@ def test_nyse_wsj_commas_table(self): tm.assert_index_equal(df.columns, columns) @pytest.mark.slow - def test_banklist_header(self): + def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -541,7 +554,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] - ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape @@ -658,19 +671,19 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + def test_computer_sales_page(self, datapath): + data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self): - data = os.path.join(DATA_PATH, 'wikipedia_states.html') + def test_wikipedia_states_table(self, datapath): + data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] @@ -784,15 +797,15 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self): - filename = os.path.join(DATA_PATH, 'valid_markup.html') + def test_works_on_valid_markup(self, datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) 
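         # read_html returns a list with one DataFrame per matched <table>,
         # hence the list/DataFrame assertions below.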
assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self): - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + def test_fallback_success(self, datapath): + banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): @@ -835,22 +848,23 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table - @pytest.mark.parametrize("f", glob.glob( - os.path.join(DATA_PATH, 'html_encoding', '*.html'))) - def test_encode(self, f): - _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') + def test_encode(self, html_encoding_file): + _, encoding = os.path.splitext( + os.path.basename(html_encoding_file) + )[0].split('_') try: - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() - from_filename = self.read_html(f, encoding=encoding, + from_filename = self.read_html(html_encoding_file, + encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) @@ -906,7 +920,7 @@ def seekable(self): assert self.read_html(bad) @pytest.mark.slow - def test_importcheck_thread_safety(self): + def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): @@ -921,7 +935,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = os.path.join(DATA_PATH, 'valid_markup.html') + filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index cfac77291803d..491d5fe33cc33 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import os import datetime +import glob import numpy as np from distutils.version import LooseVersion @@ -837,13 +838,13 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -def legacy_packers_versions(): - # yield the packers versions - path = tm.get_data_path('legacy_msgpack') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - yield v +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_msgpack", "*", "*.msgpack")) + + +@pytest.fixture(params=files) +def legacy_packer(request, datapath): + return datapath(request.param) class TestMsgpack(object): @@ -920,24 +921,20 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('version', legacy_packers_versions()) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - version): - - pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): - continue - vf = os.path.join(pth, f) - try: - with catch_warnings(record=True): - 
self.compare(current_packers_data, all_packers_data, - vf, version) - except ImportError: - # blosc not installed - continue - n += 1 - assert n > 0, 'Msgpack files are not tested' + legacy_packer, datapath): + + version = os.path.basename(os.path.dirname(legacy_packer)) + + # GH12142 0.17 files packed in P2 can't be read in P3 + if (compat.PY3 and version.startswith('0.17.') and + legacy_packer.split('.')[-4][-1] == '2'): + msg = "Files packed in Py2 can't be read in Py3 ({})" + pytest.skip(msg.format(version)) + try: + with catch_warnings(record=True): + self.compare(current_packers_data, all_packers_data, + legacy_packer, version) + except ImportError: + # blosc not installed + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..45cbbd43cd6a8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,7 +12,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ - +import glob import pytest from warnings import catch_warnings @@ -184,27 +184,25 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_pickle", "*", "*.pickle")) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + # --------------------- # tests # --------------------- -def legacy_pickle_versions(): - # yield the pickle versions - path = tm.get_data_path('legacy_pickle') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - for f in os.listdir(p): - yield (v, f) - - -@pytest.mark.parametrize('version, f', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version, f): +def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + version = os.path.basename(os.path.dirname(legacy_pickle)) with catch_warnings(record=True): - compare(current_pickle_data, vf, version) + compare(current_pickle_data, legacy_pickle, version) def test_round_trip_current(current_pickle_data): @@ -260,12 +258,11 @@ def python_unpickler(path): compare_element(result, expected, typ) -def test_pickle_v0_14_1(): +def test_pickle_v0_14_1(datapath): cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -275,14 +272,13 @@ def test_pickle_v0_14_1(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -def test_pickle_v0_15_2(): +def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5ac91c15047ff..9cbb62f72f0a0 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ 
-4452,28 +4452,27 @@ def f(): store.select('df') tm.assert_raises_regex(ClosedFileError, 'file is not open', f) - def test_pytables_native_read(self): - + def test_pytables_native_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native.h5'), + datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] assert isinstance(d2, DataFrame) @pytest.mark.skipif(PY35 and is_platform_windows(), reason="native2 read fails oddly on windows / 3.5") - def test_pytables_native2_read(self): + def test_pytables_native2_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native2.h5'), + datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] assert isinstance(d1, DataFrame) - def test_legacy_table_read(self): + def test_legacy_table_read(self, datapath): # legacy table types with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_table.h5'), + datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'), mode='r') as store: with catch_warnings(record=True): @@ -5120,7 +5119,7 @@ def test_fspath(self): with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) - def test_read_py2_hdf_file_in_py3(self): + def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 @@ -5135,8 +5134,8 @@ def test_read_py2_hdf_file_in_py3(self): ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) with ensure_clean_store( - tm.get_data_path( - 'legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), + datapath('io', 'data', 'legacy_hdf', + 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), mode='r') as store: result = store['p'] assert_frame_equal(result, expected) @@ -5533,14 +5532,14 @@ def test_store_timezone(self): assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self): + def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) with ensure_clean_store( - tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), mode='r') as store: result = store['df'] assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3ab74d37a2bc..f8f742c5980ac 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -22,7 +22,6 @@ import pytest import sqlite3 import csv -import os import warnings import numpy as np @@ -184,9 +183,11 @@ class MixInBase(object): def teardown_method(self, method): - for tbl in self._get_all_tables(): - self.drop_table(tbl) - self._close_conn() + # if setup fails, there may not be a connection to close. 
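+        # The hasattr guard below keeps a failure in setup from cascading
+        # into a second, unrelated AttributeError during teardown.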
+ if hasattr(self, 'conn'): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() class MySQLMixIn(MixInBase): @@ -253,9 +254,9 @@ def _get_exec(self): else: return self.conn.cursor() - def _load_iris_data(self): + def _load_iris_data(self, datapath): import io - iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + iris_csv_file = datapath('io', 'data', 'iris.csv') self.drop_table('iris') self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) @@ -503,9 +504,10 @@ class _TestSQLApi(PandasSQLTest): flavor = 'sqlite' mode = None - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_iris_view() self._load_test1_data() self._load_test2_data() @@ -1025,8 +1027,9 @@ class _EngineToConnMixin(object): A mixin that causes setup_connect to create a conn rather than an engine. """ - def setup_method(self, method): - super(_EngineToConnMixin, self).setup_method(method) + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + super(_EngineToConnMixin, self).setup_method(datapath) engine = self.conn conn = engine.connect() self.__tx = conn.begin() @@ -1034,12 +1037,14 @@ def setup_method(self, method): self.__engine = engine self.conn = conn - def teardown_method(self, method): + yield + self.__tx.rollback() self.conn.close() self.conn = self.__engine self.pandasSQL = sql.SQLDatabase(self.__engine) - super(_EngineToConnMixin, self).teardown_method(method) + # XXX: + # super(_EngineToConnMixin, self).teardown_method(method) @pytest.mark.single @@ -1136,7 +1141,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ flavor = None - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1149,10 +1154,11 @@ def setup_class(cls): msg = "{0} - can't connect to {1} server".format(cls, cls.flavor) pytest.skip(msg) - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.setup_connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_raw_sql() self._load_test1_data() @@ -1920,11 +1926,12 @@ class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): def connect(cls): return sqlite3.connect(':memory:') - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_iris_data() + self._load_iris_data(datapath) self._load_test1_data() @@ -2135,8 +2142,9 @@ def _skip_if_no_pymysql(): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - def setup_method(self, method): - self.method = method + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function self.conn = sqlite3.connect(':memory:') def test_basic(self): @@ -2215,8 +2223,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): create_sql = """ CREATE TABLE test ( @@ -2236,7 +2243,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): pass @@ -2341,7 +2348,7 @@ def 
clean_up(test_table_to_drop): "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn): - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): _skip_if_no_pymysql() @@ -2370,7 +2377,8 @@ def setup_class(cls): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): _skip_if_no_pymysql() import pymysql try: @@ -2396,7 +2404,7 @@ def setup_method(self, method): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - self.method = method + self.method = request.function def test_basic(self): _skip_if_no_pymysql() @@ -2501,8 +2509,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): _skip_if_no_pymysql() drop_sql = "DROP TABLE IF EXISTS test" create_sql = """ @@ -2525,7 +2532,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): _skip_if_no_pymysql() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index f3a465da4e87f..cff63516f4086 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -25,8 +25,8 @@ @pytest.fixture -def dirpath(): - return tm.get_data_path() +def dirpath(datapath): + return datapath("io", "data") @pytest.fixture @@ -39,8 +39,9 @@ def parsed_114(dirpath): class TestStata(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f65791329f2f1..09687dd97bd43 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -74,11 +74,6 @@ def setup_method(self, method): else: self.default_figsize = (8.0, 6.0) self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' - # common test data - from pandas import read_csv - base = os.path.join(os.path.dirname(curpath()), os.pardir) - path = os.path.join(base, 'tests', 'data', 'iris.csv') - self.iris = read_csv(path) n = 100 with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 2c2d371921d2f..a45b17ec98261 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -46,10 +46,9 @@ def test_boxplot_deprecated(self): by='indic') @pytest.mark.slow - def test_radviz_deprecated(self): - df = self.iris + def test_radviz_deprecated(self, iris): with tm.assert_produces_warning(FutureWarning): - plotting.radviz(frame=df, class_column='Name') + plotting.radviz(frame=iris, class_column='Name') @pytest.mark.slow def test_plot_params(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c82c939584dc7..0473610ea2f8f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -100,11 +100,11 @@ def 
test_scatter_matrix_axis(self): axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow - def test_andrews_curves(self): + def test_andrews_curves(self, iris): from pandas.plotting import andrews_curves from matplotlib import cm - df = self.iris + df = iris _check_plot_works(andrews_curves, frame=df, class_column='Name') @@ -165,11 +165,11 @@ def test_andrews_curves(self): andrews_curves(data=df, class_column='Name') @pytest.mark.slow - def test_parallel_coordinates(self): + def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates from matplotlib import cm - df = self.iris + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column='Name') @@ -234,11 +234,11 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] @pytest.mark.slow - def test_radviz(self): + def test_radviz(self, iris): from pandas.plotting import radviz from matplotlib import cm - df = self.iris + df = iris _check_plot_works(radviz, frame=df, class_column='Name') rgba = ('#556270', '#4ECDC4', '#C7F464') @@ -272,8 +272,8 @@ def test_radviz(self): self._check_colors(handles, facecolors=colors) @pytest.mark.slow - def test_subplot_titles(self): - df = self.iris.drop('Name', axis=1).head() + def test_subplot_titles(self, iris): + df = iris.drop('Name', axis=1).head() # Use the column names as the subplot titles title = list(df.columns) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cebbcc41c3e17..59b53cd23010e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,4 +1,3 @@ -import os import pytest import pytz @@ -13,8 +12,8 @@ class TestAsOfMerge(object): - def read_data(self, name, dedupe=False): - path = os.path.join(tm.get_data_path(), name) + def read_data(self, datapath, name, dedupe=False): + path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') @@ -23,15 +22,17 @@ def read_data(self, name, dedupe=False): x.time = to_datetime(x.time) return x - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): - self.trades = self.read_data('trades.csv') - self.quotes = self.read_data('quotes.csv', dedupe=True) - self.asof = self.read_data('asof.csv') - self.tolerance = self.read_data('tolerance.csv') - self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.trades = self.read_data(datapath, 'trades.csv') + self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) + self.asof = self.read_data(datapath, 'asof.csv') + self.tolerance = self.read_data(datapath, 'tolerance.csv') + self.allow_exact_matches = self.read_data(datapath, + 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( - 'allow_exact_matches_and_tolerance.csv') + datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ @@ -423,11 +424,11 @@ def test_multiby_indexed(self): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) - def test_basic2(self): + def test_basic2(self, datapath): - expected = self.read_data('asof2.csv') - trades = self.read_data('trades2.csv') - quotes = self.read_data('quotes2.csv', dedupe=True) + expected = self.read_data(datapath, 'asof2.csv') + trades = self.read_data(datapath, 'trades2.csv') + quotes = self.read_data(datapath, 
'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', @@ -467,14 +468,14 @@ def test_valid_join_keys(self): merge_asof(trades, quotes, by='ticker') - def test_with_duplicates(self): + def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') - expected = self.read_data('asof.csv') + expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 5ea27f9e34e1c..807fb2530603a 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -282,10 +282,10 @@ def test_round_frac(self): result = tmod._round_frac(0.000123456, precision=2) assert result == 0.00012 - def test_qcut_binning_issues(self): + def test_qcut_binning_issues(self, datapath): # #1978, 1979 - path = os.path.join(tm.get_data_path(), 'cut_data.csv') - arr = np.loadtxt(path) + cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv')) + arr = np.loadtxt(cut_file) result = qcut(arr, 20) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 0c08d813a7f1b..00701ca2be946 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,3 @@ -import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta @@ -455,14 +454,15 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - def test_pickle_v0_15_2(self): + def test_pickle_v0_15_2(self, datapath): offsets = {'DateOffset': DateOffset(years=1), 'MonthBegin': MonthBegin(1), 'Day': Day(1), 'YearBegin': YearBegin(1), 'Week': Week(1)} - pickle_path = os.path.join(tm.get_data_path(), - 'dateoffset_0_15_2.pickle') + + pickle_path = datapath('tseries', 'offsets', 'data', + 'dateoffset_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -1854,12 +1854,10 @@ def _check_roundtrip(obj): _check_roundtrip(self.offset2) _check_roundtrip(self.offset * 2) - def test_pickle_compat_0_14_1(self): + def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - - pth = tm.get_data_path() - - cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index ab7c4fb528452..4d34987e14f75 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pandas as pd import pytest import numpy as np @@ -841,3 +842,15 @@ def test_locale(self): # GH9744 locales = tm.get_locales() assert len(locales) >= 1 + + +def test_datapath_missing(datapath, request): + if not request.config.getoption("--strict-data-files"): + pytest.skip("Need to set '--strict-data-files'") + + with pytest.raises(ValueError): + datapath('not_a_file') + + result = datapath('data', 'iris.csv') + expected = os.path.join('pandas', 'tests', 'data', 'iris.csv') + assert result == expected diff --git a/pandas/util/_test_decorators.py 
b/pandas/util/_test_decorators.py index 27c24e3a68079..c6ab24403d58d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,7 +23,6 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ - import pytest import locale from distutils.version import LooseVersion diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6384eca9849f6..b7edbff00a4b9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -6,7 +6,6 @@ import sys import tempfile import warnings -import inspect import os import subprocess import locale @@ -751,15 +750,6 @@ def ensure_clean(filename=None, return_filelike=False): print("Exception on removing file: {error}".format(error=e)) -def get_data_path(f=''): - """Return the path of a data file, these are relative to the current test - directory. - """ - # get our callers file - _, filename, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[1] - base_dir = os.path.abspath(os.path.dirname(filename)) - return os.path.join(base_dir, 'data', f) - # ----------------------------------------------------------------------------- # Comparators diff --git a/setup.cfg b/setup.cfg index 6d9657737a8bd..9ec967c25e225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,4 +32,5 @@ markers = slow: mark a test as slow network: mark a test as network high_memory: mark a test as a high-memory only -doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL +addopts = --strict-data-files +doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL \ No newline at end of file diff --git a/setup.py b/setup.py index c5831eb097767..5d6bbbcf7b862 100755 --- a/setup.py +++ b/setup.py @@ -734,11 +734,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], - 'pandas.tests.io': ['data/legacy_hdf/*.h5', - 'data/legacy_pickle/*/*.pickle', - 'data/legacy_msgpack/*/*.msgpack', - 'data/html_encoding/*.html']}, + package_data={'': ['templates/*', '_libs/*.dll']}, ext_modules=extensions, maintainer_email=EMAIL, description=DESCRIPTION, From db51f0a57030fd71d26df00c2e3dd63b7fd542b9 Mon Sep 17 00:00:00 2001 From: david-liu-brattle-1 <36486871+david-liu-brattle-1@users.noreply.github.com> Date: Tue, 26 Jun 2018 18:19:41 -0400 Subject: [PATCH 27/55] Cleanup clipboard tests (#21163) (cherry picked from commit 9d38e0ef5842fafcc4e391abc6aba486684e6dc7) --- pandas/tests/io/test_clipboard.py | 196 ++++++++++++++++++++---------- 1 file changed, 129 insertions(+), 67 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 98c0effabec84..80fddd50fc9a8 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -9,10 +9,11 @@ from pandas import DataFrame from pandas import read_clipboard from pandas import get_option +from pandas.compat import PY2 from pandas.util import testing as tm from pandas.util.testing import makeCustomDataframe as mkdf from pandas.io.clipboard.exceptions import PyperclipException -from pandas.io.clipboard import clipboard_set +from pandas.io.clipboard import clipboard_set, clipboard_get try: @@ -22,73 +23,134 @@ _DEPS_INSTALLED = 0 +def build_kwargs(sep, excel): + kwargs = {} + if excel != 'default': + kwargs['excel'] = excel + if sep != 'default': + kwargs['sep'] = sep + return kwargs + + +@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii', + 'colwidth', 
'mixed', 'float', 'int']) +def df(request): + data_type = request.param + + if data_type == 'delims': + return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], + 'b': ['hi\'j', 'k\'\'lm']}) + elif data_type == 'utf8': + return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], + 'b': ['øπ∆˚¬', 'œ∑´®']}) + elif data_type == 'string': + return mkdf(5, 3, c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'long': + max_rows = get_option('display.max_rows') + return mkdf(max_rows + 1, 3, + data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'nonascii': + return pd.DataFrame({'en': 'in English'.split(), + 'es': 'en español'.split()}) + elif data_type == 'colwidth': + _cw = get_option('display.max_colwidth') + 1 + return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'mixed': + return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, + 'b': np.arange(1, 6), + 'c': list('abcde')}) + elif data_type == 'float': + return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'int': + return mkdf(5, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + else: + raise ValueError + + @pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(object): - - @classmethod - def setup_class(cls): - cls.data = {} - cls.data['string'] = mkdf(5, 3, c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['float'] = mkdf(5, 3, - data_gen_f=lambda r, c: float(r) + 0.01, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01, - 'b': np.arange(1, 6), - 'c': list('abcde')}) - - # Test columns exceeding "max_colwidth" (GH8305) - _cw = get_option('display.max_colwidth') + 1 - cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test GH-5346 - max_rows = get_option('display.max_rows') - cls.data['longdf'] = mkdf(max_rows + 1, 3, - data_gen_f=lambda *args: randint(2), - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) - # Test for non-ascii text: GH9263 - cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), - 'es': 'en español'.split()}) - # unicode round trip test for GH 13747, GH 12529 - cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], - 'b': ['øπ∆˚¬', 'œ∑´®']}) - cls.data_types = list(cls.data.keys()) - - @classmethod - def teardown_class(cls): - del cls.data_types, cls.data - - def check_round_trip_frame(self, data_type, excel=None, sep=None, + def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): - data = self.data[data_type] data.to_clipboard(excel=excel, sep=sep, encoding=encoding) - if sep is not None: - result = read_clipboard(sep=sep, index_col=0, encoding=encoding) - else: - result = read_clipboard(encoding=encoding) + result = read_clipboard(sep=sep or '\t', index_col=0, + encoding=encoding) tm.assert_frame_equal(data, result, check_dtype=False) - def test_round_trip_frame_sep(self): 
- for dt in self.data_types: - self.check_round_trip_frame(dt, sep=',') - self.check_round_trip_frame(dt, sep=r'\s+') - self.check_round_trip_frame(dt, sep='|') - - def test_round_trip_frame_string(self): - for dt in self.data_types: - self.check_round_trip_frame(dt, excel=False) - - def test_round_trip_frame(self): - for dt in self.data_types: - self.check_round_trip_frame(dt) + # Test that default arguments copy as tab delimited + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + def test_round_trip_frame(self, df): + self.check_round_trip_frame(df) + + # Test that explicit delimiters are respected + @pytest.mark.parametrize('sep', ['\t', ',', '|']) + def test_round_trip_frame_sep(self, df, sep): + self.check_round_trip_frame(df, sep=sep) + + # Test white space separator + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly in default c engine. Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + def test_round_trip_frame_string(self, df): + df.to_clipboard(excel=False, sep=None) + result = read_clipboard() + assert df.to_string() == result.to_string() + assert df.shape == result.shape + + # Two character separator is not supported in to_clipboard + # Test that multi-character separators are not silently passed + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_excel_sep_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=True, sep=r'\t') + + # Separator is ignored when excel=False and should produce a warning + @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") + def test_copy_delim_warning(self, df): + with tm.assert_produces_warning(): + df.to_clipboard(excel=False, sep='\t') + + # Tests that the default behavior of to_clipboard is tab + # delimited and excel="True" + @pytest.mark.xfail(reason="to_clipboard defaults to space delim. Issue in " + "#21104, Fixed in #21111") + @pytest.mark.parametrize('sep', ['\t', None, 'default']) + @pytest.mark.parametrize('excel', [True, None, 'default']) + def test_clipboard_copy_tabs_default(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + if PY2: + # to_clipboard copies unicode, to_csv produces bytes. This is + # expected behavior + assert clipboard_get().encode('utf-8') == df.to_csv(sep='\t') + else: + assert clipboard_get() == df.to_csv(sep='\t') + + # Tests reading of white space separated tables + @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " + "aren't handled correctly. in default c engine. 
Fixed " + "in #21111 by defaulting to python engine for " + "whitespace separator") + @pytest.mark.parametrize('sep', [None, 'default']) + @pytest.mark.parametrize('excel', [False]) + def test_clipboard_copy_strings(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + result = read_clipboard(sep=r'\s+') + assert result.to_string() == df.to_string() + assert df.shape == result.shape def test_read_clipboard_infer_excel(self): # gh-19010: avoid warnings @@ -124,15 +186,15 @@ def test_read_clipboard_infer_excel(self): tm.assert_frame_equal(res, exp) - def test_invalid_encoding(self): + def test_invalid_encoding(self, df): # test case for testing invalid encoding - data = self.data['string'] with pytest.raises(ValueError): - data.to_clipboard(encoding='ascii') + df.to_clipboard(encoding='ascii') with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - def test_round_trip_valid_encodings(self): - for enc in ['UTF-8', 'utf-8', 'utf8']: - for dt in self.data_types: - self.check_round_trip_frame(dt, encoding=enc) + @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' + 'Issue in #21104, Fixed in #21111') + @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + def test_round_trip_valid_encodings(self, enc, df): + self.check_round_trip_frame(df, encoding=enc) From d9ada974d0f73c72953fcece56e084dc277bc4c7 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 27 Jun 2018 03:57:55 -0600 Subject: [PATCH 28/55] DOC: Fix versionadded directive typos in IntervalIndex (#21649) (cherry picked from commit b35cb1c127aae894c2a1ee5ab2f16987b91e9000) --- pandas/core/indexes/interval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index eb9d7efc06c27..23a655b9a51ee 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -160,7 +160,7 @@ class IntervalIndex(IntervalMixin, Index): dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Attributes ---------- @@ -438,7 +438,7 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -568,7 +568,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -619,7 +619,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -671,7 +671,7 @@ def to_tuples(self, na_tuple=True): Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA value itself if False, ``nan``. - ..versionadded:: 0.23.0 + .. 
versionadded:: 0.23.0 Examples -------- From 0a42f18687a1e586b09bfaa18b0ddc85e20d760a Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 29 Jun 2018 01:26:38 +0100 Subject: [PATCH 29/55] Fix Timestamp rounding (#21507) (cherry picked from commit 76ef7c459e752f72abc62e030fd1cea0117c1dca) --- doc/source/whatsnew/v0.23.2.txt | 2 +- pandas/_libs/tslibs/timestamps.pyx | 34 +++++++++++++------ .../indexes/datetimes/test_scalar_compat.py | 19 +++++++++++ .../tests/scalar/timestamp/test_unary_ops.py | 20 ++++++++++- 4 files changed, 62 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index b3da4d1c4e288..9d96e807dfd3e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,7 +54,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) -- +- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) .. _whatsnew_0232.performance: diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ba5ebdab82ddc..123ccebf83a56 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -59,42 +59,51 @@ cdef inline object create_timestamp_from_ts(int64_t value, def round_ns(values, rounder, freq): + """ Applies rounding function at given frequency Parameters ---------- - values : int, :obj:`ndarray` - rounder : function + values : :obj:`ndarray` + rounder : function, eg. 'ceil', 'floor', 'round' freq : str, obj Returns ------- - int or :obj:`ndarray` + :obj:`ndarray` """ + from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos + + # GH21262 If the Timestamp is multiple of the freq str + # don't apply any rounding + mask = values % unit == 0 + if mask.all(): + return values + r = values.copy() + if unit < 1000: # for nano rounding, work with the last 6 digits separately # due to float precision buff = 1000000 - r = (buff * (values // buff) + unit * - (rounder((values % buff) * (1 / float(unit)))).astype('i8')) + r[~mask] = (buff * (values[~mask] // buff) + + unit * (rounder((values[~mask] % buff) * + (1 / float(unit)))).astype('i8')) else: if unit % 1000 != 0: msg = 'Precision will be lost using frequency: {}' warnings.warn(msg.format(freq)) - # GH19206 # to deal with round-off when unit is large if unit >= 1e9: divisor = 10 ** int(np.log10(unit / 1e7)) else: divisor = 10 - - r = (unit * rounder((values * (divisor / float(unit))) / divisor) - .astype('i8')) - + r[~mask] = (unit * rounder((values[~mask] * + (divisor / float(unit))) / divisor) + .astype('i8')) return r @@ -649,7 +658,10 @@ class Timestamp(_Timestamp): else: value = self.value - r = round_ns(value, rounder, freq) + value = np.array([value], dtype=np.int64) + + # Will only ever contain 1 element for timestamp + r = round_ns(value, rounder, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 9180bb0af3af3..801dcb91b124e 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -134,6 +134,21 @@ def test_round(self, tz): ts = '2016-10-17 
12:00:00.001501031'
         DatetimeIndex([ts]).round('1010ns')

+    def test_no_rounding_occurs(self, tz):
+        # GH 21262
+        rng = date_range(start='2016-01-01', periods=5,
+                         freq='2Min', tz=tz)
+
+        expected_rng = DatetimeIndex([
+            Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'),
+            Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'),
+            Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'),
+            Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'),
+            Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'),
+        ])
+
+        tm.assert_index_equal(rng.round(freq='2T'), expected_rng)
+
     @pytest.mark.parametrize('test_input, rounder, freq, expected', [
         (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']),
         (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']),
@@ -143,6 +158,10 @@ def test_round(self, tz):
             ['1823-01-01 00:00:01.000000020']),
         (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']),
         (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']),
+        (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']),
+        (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']),
+        (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']),
+        (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']),
         (('NaT', '1823-01-01 00:00:01'), 'floor', '1s',
          ('NaT', '1823-01-01 00:00:01')),
         (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s',
diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py
index aecddab8477fc..dbe31ccb11114 100644
--- a/pandas/tests/scalar/timestamp/test_unary_ops.py
+++ b/pandas/tests/scalar/timestamp/test_unary_ops.py
@@ -118,6 +118,25 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected):
         expected = Timestamp(expected)
         assert result == expected

+    @pytest.mark.parametrize('test_input, freq, expected', [
+        ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'),
+        ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'),
+        ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'),
+        ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'),
+        ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'),
+        ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'),
+    ])
+    @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round'])
+    def test_round_minute_freq(self, test_input, freq, expected, rounder):
+        # Ensure timestamps that shouldn't round don't!
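# ----------------------------------------------------------------------
# Illustrative sketch (editorial, not part of the patch), assuming a
# pandas build that includes the GH 21262 fix: a Timestamp that is
# already an exact multiple of the rounding frequency must come back
# unchanged from floor/ceil/round instead of jumping to the next
# multiple.
import pandas as pd

ts = pd.Timestamp('2018-01-01 00:02:00')
# 00:02:00 is an exact multiple of 2 minutes, so nothing should move.
assert ts.floor('2T') == ts
assert ts.ceil('2T') == ts
assert ts.round('2T') == ts
# ----------------------------------------------------------------------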
+ # GH#21262 + + dt = Timestamp(test_input) + expected = Timestamp(expected) + func = getattr(dt, rounder) + result = func(freq) + assert result == expected + def test_ceil(self): dt = Timestamp('20130101 09:10:11') result = dt.ceil('D') @@ -257,7 +276,6 @@ def test_timestamp(self): if PY3: # datetime.timestamp() converts in the local timezone with tm.set_timezone('UTC'): - # should agree with datetime.timestamp method dt = ts.to_pydatetime() assert dt.timestamp() == ts.timestamp() From 2c00914e9addaa57d6b9f3308f25b5755e4dcc1a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 Jun 2018 02:38:39 +0200 Subject: [PATCH 30/55] API/REGR: (re-)allow neg/pos unary operation on object dtype (#21590) (cherry picked from commit 8cb6be0eced3bd3742efd0c03b2d903e3513cb11) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/generic.py | 7 +++++-- pandas/tests/frame/test_operators.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 9d96e807dfd3e..07ce99d4f19aa 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -54,6 +54,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) .. _whatsnew_0232.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 50a5c10a6865f..02462218e8b02 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_dict_like, is_re_compilable, is_period_arraylike, + is_object_dtype, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable @@ -1117,7 +1118,8 @@ def __neg__(self): values = com._values_from_object(self) if is_bool_dtype(values): arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.neg(values) else: raise TypeError("Unary negative expects numeric dtype, not {}" @@ -1128,7 +1130,8 @@ def __pos__(self): values = com._values_from_object(self) if (is_bool_dtype(values) or is_period_arraylike(values)): arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.pos(values) else: raise TypeError("Unary plus expects numeric dtype, not {}" diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 5df50f3d7835b..fdf50805ad818 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -3,6 +3,7 @@ from __future__ import print_function from collections import deque from datetime import datetime +from decimal import Decimal import operator import pytest @@ -282,6 +283,17 @@ def test_neg_numeric(self, df, expected): assert_frame_equal(-df, expected) assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df, expected', [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + 
([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]), + ]) + def test_neg_object(self, df, expected): + # GH 21380 + df = pd.DataFrame({'a': df}) + expected = pd.DataFrame({'a': expected}) + assert_frame_equal(-df, expected) + assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), @@ -307,6 +319,15 @@ def test_pos_numeric(self, df): @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': np.array([-1, 2], dtype=object)}), + pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}), + ]) + def test_pos_object(self, df): + # GH 21380 + assert_frame_equal(+df, df) + assert_series_equal(+df['a'], df['a']) + + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), ]) def test_pos_raises(self, df): From dddc81b7fb6d938ad96f40b6953e6db729c96da4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 29 Jun 2018 02:39:45 +0200 Subject: [PATCH 31/55] API: re-allow duplicate index level names (#21423) (cherry picked from commit 66b517c2f51ed20d4c6823272d5c2a0f47f96d84) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/core/indexes/multi.py | 19 +++++------- pandas/core/reshape/reshape.py | 12 ++++++++ pandas/tests/frame/test_alter_axes.py | 37 +++++++++++++++++++----- pandas/tests/frame/test_reshape.py | 10 +++++++ pandas/tests/groupby/test_categorical.py | 8 ++--- pandas/tests/indexes/test_multi.py | 25 +++++++++------- pandas/tests/io/test_pytables.py | 6 ++++ pandas/tests/reshape/test_pivot.py | 10 +++++-- 9 files changed, 90 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 07ce99d4f19aa..ab9c3bc3857d6 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -53,6 +53,7 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) +- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). 
- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 80bf73cfe7dd3..33db32cfe1166 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -672,30 +672,18 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) - used = {} else: level = [self._get_level_number(l) for l in level] - used = {self.levels[l].name: l - for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): if name is not None: - # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError('{}.name must be a hashable type' .format(self.__class__.__name__)) - - if name in used: - raise ValueError( - 'Duplicated level name: "{}", assigned to ' - 'level {}, is already used for level ' - '{}.'.format(name, l, used[name])) - self.levels[l].rename(name, inplace=True) - used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") @@ -2935,6 +2923,13 @@ def isin(self, values, level=None): else: return np.lib.arraysetops.in1d(labs, sought_labels) + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return sum(name == n for n in self.names) > 1 + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2757e0797a410..3d9e84954a63b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index.remove_unused_levels() + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The index " + "names are not unique.".format(level=level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -528,6 +534,12 @@ def factorize(index): N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The column " + "names are not unique.".format(level=level)) + raise ValueError(msg) + # Will also convert negative level numbers and check if out of bounds. 
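# ----------------------------------------------------------------------
# Illustrative sketch (editorial, not part of the patch), assuming a
# pandas build that includes this change: duplicate MultiIndex level
# names are allowed again (GH 19029), so the _reference_duplicate_name()
# checks added in this patch turn an ambiguous unstack/stack by a
# duplicated name into a clear ValueError instead of silently picking a
# level.
import pandas as pd

idx = pd.MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                names=['c1', 'c1'])
df = pd.DataFrame([1, 2], index=idx)
try:
    df.unstack('c1')  # two levels are named 'c1', so this is ambiguous
except ValueError as exc:
    print(exc)  # "Ambiguous reference to c1. The index names are not unique."
# ----------------------------------------------------------------------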
level_num = frame.columns._get_level_number(level) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 164d6746edec0..21961906c39bb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,19 +130,27 @@ def test_set_index2(self): result = df.set_index(df.C) assert result.index.name == 'C' - @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + @pytest.mark.parametrize( + 'level', ['a', pd.Series(range(0, 8, 2), name='a')]) def test_set_index_duplicate_names(self, level): - # GH18872 + # GH18872 - GH19029 df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) # Pass an existing level name: df.index.name = 'a' - pytest.raises(ValueError, df.set_index, level, append=True) - pytest.raises(ValueError, df.set_index, [level], append=True) - - # Pass twice the same level name: - df.index.name = 'c' - pytest.raises(ValueError, df.set_index, [level, level]) + expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)], + names=['a', 'a']) + result = df.set_index(level, append=True) + tm.assert_index_equal(result.index, expected) + result = df.set_index([level], append=True) + tm.assert_index_equal(result.index, expected) + + # Pass twice the same level name (only works with passing actual data) + if isinstance(level, pd.Series): + result = df.set_index([level, level]) + expected = pd.MultiIndex.from_tuples( + [(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a']) + tm.assert_index_equal(result.index, expected) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], @@ -617,6 +625,19 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d05321abefca6..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,16 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack('c1') + + with pytest.raises(ValueError): + df.T.stack('c1') + def test_unstack_unused_levels(self): # GH 17845: unused labels in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0fec6a8f96a24..cb76195eacf40 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -555,15 +555,11 @@ def test_as_index(): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper - s = Series(['a', 'b', 'b'], name='cat2') + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], 
name='cat') result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): - df.groupby(['cat', s.rename('cat')], observed=True).sum() - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 3ede83b5969ce..40e64d99ac440 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -655,22 +655,27 @@ def test_constructor_nonhashable_names(self): # With .set_names() tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed) - @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'], - ['1', 'a', '1']]) + @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], + [1, 'a', 1]]) def test_duplicate_level_names(self, names): - # GH18872 - pytest.raises(ValueError, pd.MultiIndex.from_product, - [[0, 1]] * 3, names=names) + # GH18872, GH19029 + mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names) + assert mi.names == names # With .rename() mi = pd.MultiIndex.from_product([[0, 1]] * 3) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names) + mi = mi.rename(names) + assert mi.names == names # With .rename(., level=) - mi.rename(names[0], level=1, inplace=True) - tm.assert_raises_regex(ValueError, "Duplicated level name:", - mi.rename, names[:2], level=[0, 2]) + mi.rename(names[1], level=1, inplace=True) + mi = mi.rename([names[0], names[2]], level=[0, 2]) + assert mi.names == names + + def test_duplicate_level_names_access_raises(self): + self.index.names = ['foo', 'foo'] + tm.assert_raises_regex(KeyError, 'Level foo not found', + self.index._get_level_number, 'foo') def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 9cbb62f72f0a0..7dafc9603f96d 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1842,6 +1842,12 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + pytest.raises(ValueError, store.append, 'df', df) + # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 3ec60d50f2792..b71954163f9e1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1729,9 +1729,15 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # GH 13279 s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name='foo') + expected = pd.DataFrame(np.eye(3, dtype=np.int64), + index=expected_index, + columns=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) From 06d76e0c6dc0008510e7381cb774f183b8e8271b Mon Sep 17 00:00:00 2001 From: david-liu-brattle-1 <36486871+david-liu-brattle-1@users.noreply.github.com> Date: Fri, 29 Jun 2018 08:22:15 -0400 Subject: [PATCH 32/55] BUG: 
to_clipboard fails to format output for Excel (#21111) (cherry picked from commit dc45fbafef172e357cb5decdeab22de67160f5b7) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/clipboards.py | 32 +++++++++++++++++++++++++------ pandas/tests/io/test_clipboard.py | 16 ---------------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index ab9c3bc3857d6..608db7487c1e4 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -57,6 +57,7 @@ Fixed Regressions - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) +- Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) .. _whatsnew_0232.performance: diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index dcc221ce978b3..b3f40b3a2429c 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -1,6 +1,7 @@ """ io on the clipboard """ from pandas import compat, get_option, option_context, DataFrame -from pandas.compat import StringIO, PY2 +from pandas.compat import StringIO, PY2, PY3 +import warnings def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover @@ -32,7 +33,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # try to decode (if needed on PY3) # Strange. linux py33 doesn't complain, win py33 does - if compat.PY3: + if PY3: try: text = compat.bytes_to_str( text, encoding=(kwargs.get('encoding') or @@ -55,11 +56,27 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover counts = {x.lstrip().count('\t') for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - sep = r'\t' + sep = '\t' + # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get('delim_whitespace') is None: sep = r'\s+' + # Regex separator currently only works with python engine. + # Default to python if separator is multi-character (regex) + if len(sep) > 1 and kwargs.get('engine') is None: + kwargs['engine'] = 'python' + elif len(sep) > 1 and kwargs.get('engine') == 'c': + warnings.warn('read_clipboard with regex separator does not work' + ' properly with c engine') + + # In PY2, the c table reader first encodes text with UTF-8 but Python + # table reader uses the format of the passed string. 
For consistency, + # encode strings for python engine so that output from python and c + # engines produce consistent results + if kwargs.get('engine') == 'python' and PY2: + text = text.encode('utf-8') + return read_table(StringIO(text), sep=sep, **kwargs) @@ -99,7 +116,7 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover if excel: try: if sep is None: - sep = r'\t' + sep = '\t' buf = StringIO() # clipboard_set (pyperclip) expects unicode obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) @@ -108,8 +125,11 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover text = text.decode('utf-8') clipboard_set(text) return - except: - pass + except TypeError: + warnings.warn('to_clipboard in excel mode requires a single ' + 'character separator.') + elif sep is not None: + warnings.warn('to_clipboard with excel=False ignores the sep argument') if isinstance(obj, DataFrame): # str(df) has various unhelpful defaults, like truncation diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 80fddd50fc9a8..a6b331685e72a 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -88,8 +88,6 @@ def check_round_trip_frame(self, data, excel=None, sep=None, tm.assert_frame_equal(data, result, check_dtype=False) # Test that default arguments copy as tab delimited - @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' - 'Issue in #21104, Fixed in #21111') def test_round_trip_frame(self, df): self.check_round_trip_frame(df) @@ -99,10 +97,6 @@ def test_round_trip_frame_sep(self, df, sep): self.check_round_trip_frame(df, sep=sep) # Test white space separator - @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " - "aren't handled correctly in default c engine. Fixed " - "in #21111 by defaulting to python engine for " - "whitespace separator") def test_round_trip_frame_string(self, df): df.to_clipboard(excel=False, sep=None) result = read_clipboard() @@ -111,21 +105,17 @@ def test_round_trip_frame_string(self, df): # Two character separator is not supported in to_clipboard # Test that multi-character separators are not silently passed - @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") def test_excel_sep_warning(self, df): with tm.assert_produces_warning(): df.to_clipboard(excel=True, sep=r'\t') # Separator is ignored when excel=False and should produce a warning - @pytest.mark.xfail(reason="Not yet implemented. Fixed in #21111") def test_copy_delim_warning(self, df): with tm.assert_produces_warning(): df.to_clipboard(excel=False, sep='\t') # Tests that the default behavior of to_clipboard is tab # delimited and excel="True" - @pytest.mark.xfail(reason="to_clipboard defaults to space delim. Issue in " - "#21104, Fixed in #21111") @pytest.mark.parametrize('sep', ['\t', None, 'default']) @pytest.mark.parametrize('excel', [True, None, 'default']) def test_clipboard_copy_tabs_default(self, sep, excel, df): @@ -139,10 +129,6 @@ def test_clipboard_copy_tabs_default(self, sep, excel, df): assert clipboard_get() == df.to_csv(sep='\t') # Tests reading of white space separated tables - @pytest.mark.xfail(reason="Fails on 'delims' df because quote escapes " - "aren't handled correctly. in default c engine. 
Fixed " - "in #21111 by defaulting to python engine for " - "whitespace separator") @pytest.mark.parametrize('sep', [None, 'default']) @pytest.mark.parametrize('excel', [False]) def test_clipboard_copy_strings(self, sep, excel, df): @@ -193,8 +179,6 @@ def test_invalid_encoding(self, df): with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - @pytest.mark.xfail(reason='to_clipboard defaults to space delim. ' - 'Issue in #21104, Fixed in #21111') @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) def test_round_trip_valid_encodings(self, enc, df): self.check_round_trip_frame(df, encoding=enc) From 2fccdedda2c4bb0e5b9edce8269cdecc973b191d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Jul 2018 17:26:43 +0200 Subject: [PATCH 33/55] BUG: fix reindexing MultiIndex with categorical datetime-like level (#21657) (cherry picked from commit 1cc547185b92073a3465ea105055d7791e9e6c48) --- doc/source/whatsnew/v0.23.2.txt | 2 ++ pandas/core/indexes/multi.py | 26 +++++++++---------- .../tests/frame/test_axis_select_reindex.py | 15 ++++++++++- pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++ pandas/tests/indexes/test_multi.py | 12 +++++++-- 5 files changed, 58 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 608db7487c1e4..bef90506477ed 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -55,6 +55,8 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` + with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`). - Fixed regression in unary negative operations with object dtype (:issue:`21380`) - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) - Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 33db32cfe1166..9a4aa15f4cc25 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -11,6 +11,8 @@ from pandas.compat.numpy import function as nv from pandas import compat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( _ensure_int64, _ensure_platform_int, @@ -808,20 +810,16 @@ def values(self): return self._tuples values = [] - for lev, lab in zip(self.levels, self.labels): - # Need to box timestamps, etc. - box = hasattr(lev, '_box_values') - # Try to minimize boxing. 
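# ----------------------------------------------------------------------
# Illustrative sketch (editorial, not part of the patch), assuming a
# pandas build that includes this change: the rewritten loop in this
# hunk goes through _get_level_values() one level at a time, so
# MultiIndex tuple construction, and with it reindex/groupby, works for
# categorical datetime-like levels again (GH 21390).
import pandas as pd

midx = pd.MultiIndex.from_product(
    [pd.Categorical(['a', 'b']),
     pd.Categorical(pd.date_range('2012-01-01', periods=2, freq='H'))])
df = pd.DataFrame({'a': range(4)}, index=midx)
# Reindexing a frame on such an index raised before the fix.
print(df.reindex(midx))
# ----------------------------------------------------------------------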
- if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._ndarray_values, - lab)) - elif box: - taken = algos.take_1d(lev._box_values(lev._ndarray_values), - lab, - fill_value=lev._na_value) - else: - taken = algos.take_1d(np.asarray(lev._values), lab) - values.append(taken) + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals.get_values() + if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype)) + or hasattr(vals, '_box_values')): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) self._tuples = lib.fast_zip(values) return self._tuples diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 0e0d6598f5101..004fb4eb0c128 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -10,7 +10,7 @@ import numpy as np from pandas.compat import lrange, lzip, u -from pandas import (compat, DataFrame, Series, Index, MultiIndex, +from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical, date_range, isna) import pandas as pd @@ -1129,6 +1129,19 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) + def test_reindex_multi_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = pd.MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + df = pd.DataFrame({'a': range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = pd.DataFrame( + {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + assert_frame_equal(result, expected) + data = [[1, 2, 3], [1, 2, 3]] @pytest.mark.parametrize('actual', [ diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cb76195eacf40..d021396a7acb3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -850,3 +850,23 @@ def test_empty_prod(): result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) + + +def test_groupby_multiindex_categorical_datetime(): + # https://github.com/pandas-dev/pandas/issues/21390 + + df = pd.DataFrame({ + 'key1': pd.Categorical(list('abcbabcba')), + 'key2': pd.Categorical( + list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), + 'values': np.arange(9), + }) + result = df.groupby(['key1', 'key2']).mean() + + idx = pd.MultiIndex.from_product( + [pd.Categorical(['a', 'b', 'c']), + pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + names=['key1', 'key2']) + expected = pd.DataFrame( + {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 40e64d99ac440..a7e90207c9ad7 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -12,8 +12,8 @@ import pandas as pd -from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, - compat, date_range, period_range) +from pandas import (CategoricalIndex, Categorical, DataFrame, Index, + MultiIndex, compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from 
pandas.core.dtypes.dtypes import CategoricalDtype @@ -1595,6 +1595,14 @@ def test_get_indexer_nearest(self): with pytest.raises(NotImplementedError): midx.get_indexer(['a'], method='pad', tolerance=2) + def test_get_indexer_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + result = midx.get_indexer(midx) + tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + def test_hash_collisions(self): # non-smoke test that we don't get hash collisions From a74ee5496900e80bdc653899555fc701ce344bf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Jul 2018 10:28:19 -0500 Subject: [PATCH 34/55] BUG: Fix MI repr with long names (#21655) (cherry picked from commit ad76ffcca0d92c3885c279c80701c2f4a3f3f177) --- doc/source/whatsnew/v0.23.2.txt | 1 + pandas/io/formats/format.py | 10 +++++-- pandas/tests/io/formats/test_format.py | 38 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index bef90506477ed..61d1b83ea8f2e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -55,6 +55,7 @@ Fixed Regressions - Fixed regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`) - Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). - Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`) +- Fixed printing of DataFrames with hierarchical columns with long names (:issue:`21180`) - Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`). - Fixed regression in unary negative operations with object dtype (:issue:`21380`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 12201f62946ac..c46f4b5ad9c18 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -636,10 +636,14 @@ def to_string(self): mid = int(round(n_cols / 2.)) mid_ix = col_lens.index[mid] col_len = col_lens[mid_ix] - adj_dif -= (col_len + 1) # adjoin adds one + # adjoin adds one + adj_dif -= (col_len + 1) col_lens = col_lens.drop(mid_ix) n_cols = len(col_lens) - max_cols_adj = n_cols - self.index # subtract index column + # subtract index column + max_cols_adj = n_cols - self.index + # GH-21180. Ensure that we print at least two. 
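# ----------------------------------------------------------------------
# Illustrative sketch (editorial, not part of the patch), assuming a
# pandas build that includes this change: GH 21180, where column names
# wider than roughly half the terminal could make the truncation loop
# above drop every printable column. Clamping to at least two columns
# keeps frames like this one representable.
import pandas as pd

columns = pd.MultiIndex.from_tuples([
    ('This is a long title with > 37 chars.', 'cat'),
    ('This is a loooooonger title with > 43 chars.', 'dog'),
])
df = pd.DataFrame(1, index=range(5), columns=columns)
# Even in a narrow terminal, both column groups now appear in the repr.
print(repr(df))
# ----------------------------------------------------------------------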
+ max_cols_adj = max(max_cols_adj, 2) self.max_cols_adj = max_cols_adj # Call again _chk_truncate to cut frame appropriately @@ -778,7 +782,7 @@ def space_format(x, y): str_columns = list(zip(*[[space_format(x, y) for y in x] for x in fmt_columns])) - if self.sparsify: + if self.sparsify and len(str_columns): str_columns = _sparsify(str_columns) str_columns = [list(x) for x in zip(*str_columns)] diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 63b7cb3459069..191e3f37f1c37 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -305,6 +305,44 @@ def test_repr_non_interactive(self): assert not has_truncated_repr(df) assert not has_expanded_repr(df) + def test_repr_truncates_terminal_size(self): + # https://github.com/pandas-dev/pandas/issues/21180 + # TODO: use mock fixutre. + # This is being backported, so doing it directly here. + try: + from unittest import mock + except ImportError: + mock = pytest.importorskip("mock") + + terminal_size = (118, 96) + p1 = mock.patch('pandas.io.formats.console.get_terminal_size', + return_value=terminal_size) + p2 = mock.patch('pandas.io.formats.format.get_terminal_size', + return_value=terminal_size) + + index = range(5) + columns = pd.MultiIndex.from_tuples([ + ('This is a long title with > 37 chars.', 'cat'), + ('This is a loooooonger title with > 43 chars.', 'dog'), + ]) + df = pd.DataFrame(1, index=index, columns=columns) + + with p1, p2: + result = repr(df) + + h1, h2 = result.split('\n')[:2] + assert 'long' in h1 + assert 'loooooonger' in h1 + assert 'cat' in h2 + assert 'dog' in h2 + + # regular columns + df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]}) + with p1, p2: + result = repr(df2) + + assert df2.columns[0] in result.split('\n')[0] + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: From 1d3766c3fd303672f29be4a71919c37443450ad8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 Jul 2018 21:05:12 +0200 Subject: [PATCH 35/55] DOC: clean-up 0.23.2 whatsnew file (#21750) (cherry picked from commit 2f0773f49a64d23774d66c30988c80541fd7bb6f) --- doc/source/whatsnew/v0.23.2.txt | 40 ++------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 61d1b83ea8f2e..2d7808363648b 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -62,19 +62,6 @@ Fixed Regressions - Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`) - Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes with space delimited instead of tab delimited (:issue:`21104`) -.. 
_whatsnew_0232.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- -- - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- -- Build Changes ------------- @@ -86,55 +73,32 @@ Build Changes Bug Fixes ~~~~~~~~~ -**Groupby/Resample/Rolling** - -- -- - -**Timedelta** - -- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) - **Conversion** - Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`) - Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) - **Indexing** - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) - Bug in :meth:`DataFrame.drop` behaviour is not consistent for unique and non-unique indexes (:issue:`21494`) - Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). -- **I/O** - Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) - Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) -- - -**Plotting** - -- -- - -**Reshaping** - -- -- **Categorical** - Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) -- **Timezones** - Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) - Bug in comparing :class:`DataFrame`s with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) -**Other** +**Timedelta** -- +- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) From de4455663215d2a8767fbc14e29f1e5e320603d5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 5 Jul 2018 15:49:24 -0500 Subject: [PATCH 36/55] RLS: release notes for 0.23.2 (#21752) (cherry picked from commit bd8ba3680eae9c19221ef7200928bcef68508f4a) --- doc/source/release.rst | 34 +++++++++++++++++++++++++++++++++ doc/source/whatsnew.rst | 2 ++ doc/source/whatsnew/v0.23.2.txt | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 2f7eedfbe9a45..08200d4d276cc 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,40 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: https://pypi.org/project/pandas * Documentation: http://pandas.pydata.org +pandas 0.23.2 +------------- + +**Release date**: July 5, 2018 + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. + +See the :ref:`full whatsnew ` for a list of all the changes. + +Thanks +~~~~~~ + +A total of 17 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. 
+ +* David Krych +* Jacopo Rota + +* Jeff Reback +* Jeremy Schendel +* Joris Van den Bossche +* Kalyan Gokhale +* Matthew Roeschke +* Michael Odintsov + +* Ming Li +* Pietro Battiston +* Tom Augspurger +* Uddeshya Singh +* Vu Le + +* alimcmaster1 + +* david-liu-brattle-1 + +* gfyoung +* jbrockmendel + pandas 0.23.1 ------------- diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index eb9211d0ceb02..0972cc9432f8e 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.23.2.txt + .. include:: whatsnew/v0.23.1.txt .. include:: whatsnew/v0.23.0.txt diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 2d7808363648b..bd86576ad8586 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -64,7 +64,7 @@ Fixed Regressions Build Changes -------------- +~~~~~~~~~~~~~ - The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) From 9b0f560a73d11b2fa72c48d7fd16126b5137f349 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Jul 2018 17:04:24 -0500 Subject: [PATCH 37/55] RLS: 0.23.2 From e2f65df75efbfbb914f22605d139f73967211905 Mon Sep 17 00:00:00 2001 From: "meeseeksdev[bot]" Date: Fri, 6 Jul 2018 11:32:05 -0500 Subject: [PATCH 38/55] Backport PR #21771: Whatsnew note for v0.23.3 (#21772) --- doc/source/whatsnew/v0.23.3.txt | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 doc/source/whatsnew/v0.23.3.txt diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt new file mode 100644 index 0000000000000..d308cf7a3cfac --- /dev/null +++ b/doc/source/whatsnew/v0.23.3.txt @@ -0,0 +1,55 @@ +.. _whatsnew_0233: + +v0.23.3 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + + +.. contents:: What's new in v0.23.3 + :local: + :backlinks: none + +.. _whatsnew_0233.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0233.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Conversion** + +- +- + +**Indexing** + +- +- + +**I/O** + +- +- + +**Categorical** + +- +- + +**Timezones** + +- +- + +**Timedelta** + +- +- From d2b7b2b2913d5da18f8df476a51b7f2f521ed99d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 09:31:26 -0500 Subject: [PATCH 39/55] 0.23.3 fixup (#21788) * Move 0.23.3 to 0.23.4 * 0.23.3 whatsnew (cherry picked from commit a3f8f14b24032151ba57c36f0a70192e13bfd116) --- doc/source/whatsnew/v0.23.3.txt | 56 +++--------------------------- doc/source/whatsnew/v0.23.4.txt | 60 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 52 deletions(-) create mode 100644 doc/source/whatsnew/v0.23.4.txt diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt index d308cf7a3cfac..b8adce27d2523 100644 --- a/doc/source/whatsnew/v0.23.3.txt +++ b/doc/source/whatsnew/v0.23.3.txt @@ -1,55 +1,7 @@ .. _whatsnew_0233: -v0.23.3 -------- +v0.23.3 (July 7, 2018) +---------------------- -This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes -and bug fixes. We recommend that all users upgrade to this version. - - -.. contents:: What's new in v0.23.3 - :local: - :backlinks: none - -.. 
_whatsnew_0233.fixed_regressions: - -Fixed Regressions -~~~~~~~~~~~~~~~~~ - -- -- - -.. _whatsnew_0233.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -**Conversion** - -- -- - -**Indexing** - -- -- - -**I/O** - -- -- - -**Categorical** - -- -- - -**Timezones** - -- -- - -**Timedelta** - -- -- +This release fixes a build issue with the sdist for Python 3.7 (:issue:`21785`) +There are no other changes. diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt new file mode 100644 index 0000000000000..a88c22e3d01f7 --- /dev/null +++ b/doc/source/whatsnew/v0.23.4.txt @@ -0,0 +1,60 @@ +.. _whatsnew_0234: + +v0.23.4 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + + +.. contents:: What's new in v0.23.4 + :local: + :backlinks: none + +.. _whatsnew_0234.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. _whatsnew_0234.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) +- + +**Conversion** + +- +- + +**Indexing** + +- +- + +**I/O** + +- +- + +**Categorical** + +- +- + +**Timezones** + +- +- + +**Timedelta** + +- +- From a24750fbff99971ef3a31b610e74c9a0945f2aa0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 09:53:25 -0500 Subject: [PATCH 40/55] DOC: Updated whatsnew.rst --- doc/source/whatsnew.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 0972cc9432f8e..afd274332b3df 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.23.3.txt + .. include:: whatsnew/v0.23.2.txt .. 
include:: whatsnew/v0.23.1.txt From edb71fda022c6a155717e7a25679040ee0476639 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 7 Jul 2018 10:09:56 -0500 Subject: [PATCH 41/55] RLS: 0.23.3 From b7a2cd4a4c6ea235005aecbc2911034c6064afd3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 7 Jul 2018 13:57:42 -0500 Subject: [PATCH 42/55] Removed Need for OHLC As First Element if Used in .agg (#21769) (#21794) --- pandas/core/groupby/groupby.py | 6 ++---- pandas/tests/groupby/test_groupby.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..9d227ef37595f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3557,13 +3557,11 @@ def _aggregate_multiple_funcs(self, arg, _level): obj._selection = name results[name] = obj.aggregate(func) - if isinstance(list(compat.itervalues(results))[0], - DataFrame): - + if any(isinstance(x, DataFrame) for x in compat.itervalues(results)): # let higher level handle if _level: return results - return list(compat.itervalues(results))[0] + return DataFrame(results, columns=columns) def _wrap_output(self, output, index, names=None): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e05f9de5ea7f4..66577d738dd28 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1674,3 +1674,22 @@ def test_tuple_correct_keyerror(): [3, 4]])) with tm.assert_raises_regex(KeyError, "(7, 8)"): df.groupby((7, 8)).mean() + + +def test_groupby_agg_ohlc_non_first(): + # GH 21716 + df = pd.DataFrame([[1], [1]], columns=['foo'], + index=pd.date_range('2018-01-01', periods=2, freq='D')) + + expected = pd.DataFrame([ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1] + ], columns=pd.MultiIndex.from_tuples(( + ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'), + ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'), + ('foo', 'sum', 'foo'))), index=pd.date_range( + '2018-01-01', periods=2, freq='D')) + + result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) + + tm.assert_frame_equal(result, expected) From 5609eff083baeacbfc80ce9c3a086c7530a7f2b4 Mon Sep 17 00:00:00 2001 From: "meeseeksdev[bot]" Date: Wed, 18 Jul 2018 21:41:02 -0400 Subject: [PATCH 43/55] Backport PR #21921: BUG:Clip with a list-like threshold with a nan is broken (GH19992) (#21967) --- doc/source/whatsnew/v0.23.4.txt | 4 ++++ pandas/core/generic.py | 6 ++++-- pandas/tests/frame/test_analytics.py | 18 ++++++++++++++---- pandas/tests/series/test_analytics.py | 8 ++++++-- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index a88c22e3d01f7..5e19ab491647d 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -58,3 +58,7 @@ Bug Fixes - - + +**Missing** + +- Bug in :func:`Series.clip` and :func:`DataFrame.clip` cannot accept list-like threshold containing ``NaN`` (:issue:`19992`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 02462218e8b02..facc709877285 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6433,9 +6433,11 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, # GH 17276 # numpy doesn't like NaN as a clip value # so ignore - if np.any(pd.isnull(lower)): + # GH 19992 + # numpy doesn't drop a list-like bound containing NaN + if not is_list_like(lower) and np.any(pd.isnull(lower)): lower = None - if np.any(pd.isnull(upper)): + if 
not is_list_like(upper) and np.any(pd.isnull(upper)): upper = None # GH 2747 (arguments were reversed) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 437d3a9d24730..415ae982673ee 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2195,13 +2195,23 @@ def test_clip_with_na_args(self): """Should process np.nan argument as None """ # GH # 17276 tm.assert_frame_equal(self.frame.clip(np.nan), self.frame) - tm.assert_frame_equal(self.frame.clip(upper=[1, 2, np.nan]), - self.frame) - tm.assert_frame_equal(self.frame.clip(lower=[1, np.nan, 3]), - self.frame) tm.assert_frame_equal(self.frame.clip(upper=np.nan, lower=np.nan), self.frame) + # GH #19992 + df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6], + 'col_2': [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan], + 'col_2': [7, 8, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6], + 'col_2': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + # Matrix-like def test_dot(self): a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1e6ea96a5de51..bcf209521f913 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1140,11 +1140,15 @@ def test_clip_with_na_args(self): s = Series([1, 2, 3]) assert_series_equal(s.clip(np.nan), Series([1, 2, 3])) - assert_series_equal(s.clip(upper=[1, 1, np.nan]), Series([1, 2, 3])) - assert_series_equal(s.clip(lower=[1, np.nan, 1]), Series([1, 2, 3])) assert_series_equal(s.clip(upper=np.nan, lower=np.nan), Series([1, 2, 3])) + # GH #19992 + assert_series_equal(s.clip(lower=[0, 4, np.nan]), + Series([1, 4, np.nan])) + assert_series_equal(s.clip(upper=[1, np.nan, 1]), + Series([1, np.nan, 1])) + def test_clip_against_series(self): # GH #6966 From 6a0a95058659cec7515b0233d7795417dfb074fe Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 20 Jul 2018 05:28:01 -0700 Subject: [PATCH 44/55] Backport PR #21966: Fix memory leak in roll_quantile (#21973) --- doc/source/whatsnew/v0.23.4.txt | 1 + pandas/_libs/window.pyx | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index 5e19ab491647d..a30fbc75f11f8 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -27,6 +27,7 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) +- Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) - **Conversion** diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 5121d293efcb6..a77433e5d1115 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1482,6 +1482,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, else: output[i] = NaN + skiplist_destroy(skiplist) + return output From 14e1985f7a34b311cfb57c6f4f1bfe407e64bc75 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Thu, 26 Jul 2018 12:32:29 -0500 Subject: [PATCH 45/55] BUG: rolling with MSVC 2017 build (#21813) 
* Appveyor 3.7 * ci package list * change image type * try hack fix * lint * use isnan on problem function * use numpy compat isnan * use right isnan * work around OSX math undefs * cleanup const * fix reversion * ... (cherry picked from commit 7a2fbce899aad302891ff9a95aeb1bd55efe533a) --- appveyor.yml | 2 ++ doc/source/whatsnew/v0.23.4.txt | 2 +- pandas/_libs/src/headers/cmath | 1 + pandas/_libs/window.pyx | 21 +++++++++++---------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index f70fc829ec971..c6199c1493f22 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,12 +20,14 @@ environment: matrix: - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index a30fbc75f11f8..7890d199564f6 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ -- +- Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) - .. _whatsnew_0234.bug_fixes: diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index d8e2239406cae..2bccf9bb13d77 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -6,6 +6,7 @@ #if defined(_MSC_VER) && (_MSC_VER < 1800) #include namespace std { + __inline int isnan(double x) { return _isnan(x); } __inline int signbit(double num) { return _copysign(1.0, num) < 0; } } #else diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index a77433e5d1115..6954094b46e69 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -14,6 +14,7 @@ cnp.import_array() cdef extern from "../src/headers/cmath" namespace "std": + bint isnan(double) nogil int signbit(double) nogil double sqrt(double x) nogil @@ -654,16 +655,16 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, double *ssqdm_x) nogil: """ add a value from the var calc """ cdef double delta - - # Not NaN - if val == val: - nobs[0] = nobs[0] + 1 - - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] - mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] + # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug + if isnan(val): + return + + nobs[0] = nobs[0] + 1 + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, From 398582616c434330283d82fd029ace7dbd3c6993 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Tue, 17 Jul 2018 14:01:51 +0200 Subject: [PATCH 46/55] DOC add Python2.7 warning to recent whatsnew; include 23.3 (#21944) (cherry picked from commit 4802002ab0564ae384e425c074fde688a228a43f) --- doc/source/whatsnew/v0.23.1.txt | 5 +++++ doc/source/whatsnew/v0.23.2.txt | 4 ++++ 
doc/source/whatsnew/v0.23.4.txt | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index a52ba22cf36d2..9f8635743ea6a 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -6,6 +6,11 @@ v0.23.1 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + .. contents:: What's new in v0.23.1 :local: :backlinks: none diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index bd86576ad8586..77ad860fc4e8e 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -11,6 +11,10 @@ and bug fixes. We recommend that all users upgrade to this version. Pandas 0.23.2 is first pandas release that's compatible with Python 3.7 (:issue:`20552`) +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. .. contents:: What's new in v0.23.2 :local: diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index 7890d199564f6..c17f4ffdd6b8e 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -6,6 +6,10 @@ v0.23.4 This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. .. contents:: What's new in v0.23.4 :local: From 12cfef9f80732279687df4ca701967c0ead0a1cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 2 Aug 2018 15:26:40 -0500 Subject: [PATCH 47/55] 0.23.4 whatsnew (#22177) (cherry picked from commit e4381b6e7c3cf1c6f424d01e3dc2613710d79b0d) --- doc/source/whatsnew/v0.23.4.txt | 36 ++------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt index c17f4ffdd6b8e..9a3ad3f61ee49 100644 --- a/doc/source/whatsnew/v0.23.4.txt +++ b/doc/source/whatsnew/v0.23.4.txt @@ -1,7 +1,7 @@ .. _whatsnew_0234: -v0.23.4 -------- +v0.23.4 (August 3, 2018) +------------------------ This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes and bug fixes. We recommend that all users upgrade to this version. @@ -21,7 +21,6 @@ Fixed Regressions ~~~~~~~~~~~~~~~~~ - Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) -- .. 
_whatsnew_0234.bug_fixes: @@ -32,37 +31,6 @@ Bug Fixes - Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) - Bug in ``roll_quantile`` caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) -- - -**Conversion** - -- -- - -**Indexing** - -- -- - -**I/O** - -- -- - -**Categorical** - -- -- - -**Timezones** - -- -- - -**Timedelta** - -- -- **Missing** From b9bacc95c013db0c5cb23a6ddc5496c39668a7c4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 28 Jul 2018 09:16:07 -0400 Subject: [PATCH 48/55] TST: skip pytables test with not-updated pytables conda package (#22099) (cherry picked from commit 017e910a90cbb29c0f844f4d6aa966ebb5cd680a) --- pandas/tests/io/test_pytables.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 7dafc9603f96d..3c6b52074763e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -14,7 +14,7 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, - isna, compat, concat, Timestamp) + isna, compat, concat, Timestamp, _np_version_under1p15) import pandas.util.testing as tm import pandas.util._test_decorators as td @@ -2140,6 +2140,10 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... pytest.raises(TypeError, store.append, 'df_unimplemented', df) + @pytest.mark.skipif( + not _np_version_under1p15, + reason=("pytables conda build package needs build " + "with numpy 1.15: gh-22098")) def test_calendar_roundtrip_issue(self): # 8591 From 0409521665bd436a10aea7e06336066bf07ff057 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Aug 2018 12:19:26 -0500 Subject: [PATCH 49/55] RLS: 0.23.4 From c420e75851361025c8f20c5d00c44c7feef56d5a Mon Sep 17 00:00:00 2001 From: William Ayd Date: Tue, 7 Aug 2018 09:23:03 -0700 Subject: [PATCH 50/55] Added whatsnew for v0.23.5 (#22233) --- doc/source/whatsnew/v0.23.5.txt | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 doc/source/whatsnew/v0.23.5.txt diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt new file mode 100644 index 0000000000000..ee0ee4259f86d --- /dev/null +++ b/doc/source/whatsnew/v0.23.5.txt @@ -0,0 +1,39 @@ +.. _whatsnew_0235: + +v0.23.5 (TBD 0, 2018) +--------------------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.5 + :local: + :backlinks: none + +.. _whatsnew_0235.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. 
_whatsnew_0235.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- +- + +**Missing** + +- +- From faa199298eaeb1173571da47eaaecaf3b455c7d3 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Aug 2018 08:45:12 -0600 Subject: [PATCH 51/55] Backport PR #22169: BUG: Fix using "inf"/"-inf" in na_values for csv with int index column (#22259) --- doc/source/whatsnew/v0.23.5.txt | 4 ++++ pandas/core/algorithms.py | 4 ++-- pandas/tests/io/parser/na_values.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index ee0ee4259f86d..6a36adb915b3c 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -37,3 +37,7 @@ Bug Fixes - - + +**I/O** + +- Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index bcde32696c1ff..9d8d208d2d5c1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -95,7 +95,7 @@ def _ensure_data(values, dtype=None): values = _ensure_float64(values) return values, 'float64', 'float64' - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here return _ensure_object(values), 'object', 'object' @@ -429,7 +429,7 @@ def isin(comps, values): values = values.astype('int64', copy=False) comps = comps.astype('int64', copy=False) f = lambda x, y: htable.ismember_int64(x, y) - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): values = values.astype(object) comps = comps.astype(object) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..cc224efd533b7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,14 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, expected) + + def test_inf_na_values_with_int_index(self): + # see gh-17128 + data = "idx,col1,col2\n1,3,4\n2,inf,-inf" + + # Don't fail with OverflowError with infs and integer index column + out = self.read_csv(StringIO(data), index_col=[0], + na_values=['inf', '-inf']) + expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]}, + index=Index([1, 2], name="idx")) + tm.assert_frame_equal(out, expected) From 11c0523f8fffe33131890d6bd2c71f8edacea5c4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Aug 2018 08:45:29 -0600 Subject: [PATCH 52/55] Backport PR #22253: Resampling with NaT in TimedeltaIndex raises MemoryError (#22258) --- doc/source/whatsnew/v0.23.5.txt | 2 +- pandas/core/resample.py | 3 +-- pandas/tests/test_resample.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 6a36adb915b3c..304ab12752ad4 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -30,7 +30,7 @@ Bug Fixes **Groupby/Resample/Rolling** -- +- Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). 
- **Missing** diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0707cc756682e..e6b9f88c52cd7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1383,8 +1383,7 @@ def _get_time_delta_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax[0] - end = ax[-1] + start, end = ax.min(), ax.max() labels = binner = TimedeltaIndex(start=start, end=end, freq=self.freq, diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c1257cce9a9a4..bcc50a25623a1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2870,6 +2870,16 @@ def test_asfreq_bug(self): freq='1T')) assert_frame_equal(result, expected) + def test_resample_with_nat(self): + # GH 13223 + index = pd.to_timedelta(['0s', pd.NaT, '2s']) + result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean() + expected = DataFrame({'value': [2.5, np.nan, 5.0]}, + index=timedelta_range('0 day', + periods=3, + freq='1S')) + assert_frame_equal(result, expected) + class TestResamplerGrouper(object): From 932de54ac027b9cc8147642ea4448a63fdda33b2 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 20 Aug 2018 04:04:28 -0700 Subject: [PATCH 53/55] Backport PR #22424: CI: add missing tzlocal dependency (rpy2, doc build) (#22425) --- ci/travis-36-doc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index c22dddbe0ba3f..8705b82412e7c 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -36,6 +36,7 @@ dependencies: - sphinx - sqlalchemy - statsmodels + - tzlocal - xarray - xlrd - xlsxwriter From 183e92f1309a15e34c890e6c18dd5c7c53f61210 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 11 Sep 2018 09:40:58 -0700 Subject: [PATCH 54/55] CI / BLD: Various CI Backports (#22637) * CI: Bump NumPy to 1.9.3 Backport of gh-22499. * BLD: Fix openpyxl to 2.5.5 Backport of gh-22601. * CI: Resolve timeout issue on Travis Backported from gh-22429. * CI: Migrate to CircleCI 2.0 Backport of gh-21814. * Upgrade Cython to >=0.28.2 Backported from gh-21688. * TST: Patch locale handling Backported from gh-21739. Backport of gh-22213. 
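Note on the locale handling change in this backport: it guards both the locale.setlocale() call and the follow-up locale.getlocale() read, because some platforms accept a locale string but then raise ValueError when asked to report it back (gh-22129). A minimal standalone sketch of that guard pattern — illustrative only; temp_locale and locale_is_usable are made-up names here, the real helpers are set_locale and can_set_locale in pandas.util.testing as patched below:

    import locale
    from contextlib import contextmanager

    @contextmanager
    def temp_locale(new_locale, lc_var=locale.LC_ALL):
        # Remember the current setting so it can be restored on exit.
        current = locale.setlocale(lc_var)
        try:
            locale.setlocale(lc_var, new_locale)
            yield
        finally:
            locale.setlocale(lc_var, current)

    def locale_is_usable(new_locale):
        # A locale only counts as usable if it can be both set and read
        # back: setlocale() may raise locale.Error, and getlocale() may
        # raise ValueError for locales the platform half-supports.
        try:
            with temp_locale(new_locale):
                locale.getlocale()
        except (ValueError, locale.Error):
            return False
        return True

    print(locale_is_usable(''))                     # default locale -> True
    print(locale_is_usable('no_such_locale.UTF-8')) # -> False
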
--- .circleci/config.yml | 147 ++++++++++++++++++ ci/appveyor-27.yaml | 2 +- ci/appveyor-36.yaml | 2 +- ci/circle-27-compat.yaml | 6 +- ci/circle-35-ascii.yaml | 2 +- ci/circle-36-locale.yaml | 2 +- ci/circle-36-locale_slow.yaml | 2 +- ci/install_circle.sh | 19 +-- ci/install_db_circle.sh | 8 - ci/requirements-optional-conda.txt | 2 +- ci/requirements-optional-pip.txt | 4 +- ci/run_circle.sh | 2 +- ci/travis-27-locale.yaml | 2 +- ci/travis-27.yaml | 1 + ci/travis-35-osx.yaml | 2 +- ci/travis-36-doc.yaml | 2 +- ci/travis-36-slow.yaml | 2 +- ci/travis-36.yaml | 2 +- circle.yml | 38 ----- pandas/tests/indexes/datetimes/test_misc.py | 19 ++- pandas/tests/io/json/test_compression.py | 2 + pandas/tests/io/json/test_pandas.py | 2 + pandas/tests/io/parser/test_network.py | 2 + pandas/tests/io/test_excel.py | 1 + .../tests/scalar/timestamp/test_timestamp.py | 20 ++- pandas/tests/series/test_datetime_values.py | 20 ++- pandas/tests/util/test_util.py | 22 ++- pandas/util/testing.py | 32 ++-- 28 files changed, 272 insertions(+), 95 deletions(-) create mode 100644 .circleci/config.yml delete mode 100755 ci/install_db_circle.sh delete mode 100644 circle.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000..e947f30d285cd --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,147 @@ +version: 2 +jobs: + + # -------------------------------------------------------------------------- + # 0. py27_compat + # -------------------------------------------------------------------------- + py27_compat: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + environment: + JOB: "2.7_COMPAT" + ENV_FILE: "ci/circle-27-compat.yaml" + LOCALE_OVERRIDE: "it_IT.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 1. py36_locale + # -------------------------------------------------------------------------- + py36_locale: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE" + ENV_FILE: "ci/circle-36-locale.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 2. 
py36_locale_slow + # -------------------------------------------------------------------------- + py36_locale_slow: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE_SLOW" + ENV_FILE: "ci/circle-36-locale_slow.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --only-slow --skip-network + + # -------------------------------------------------------------------------- + # 3. py35_ascii + # -------------------------------------------------------------------------- + py35_ascii: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.5_ASCII" + ENV_FILE: "ci/circle-35-ascii.yaml" + LOCALE_OVERRIDE: "C" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + +workflows: + version: 2 + build_and_test: + jobs: + - py27_compat + - py36_locale + - py36_locale_slow + - py35_ascii diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml index 84107c605b14f..e47ebf75344fa 100644 --- a/ci/appveyor-27.yaml +++ b/ci/appveyor-27.yaml @@ -12,7 +12,7 @@ dependencies: - matplotlib - numexpr - numpy=1.10* - - openpyxl + - openpyxl=2.5.5 - pytables==3.2.2 - python=2.7.* - pytz diff --git a/ci/appveyor-36.yaml b/ci/appveyor-36.yaml index 5e370de39958a..d007f04ca0720 100644 --- a/ci/appveyor-36.yaml +++ b/ci/appveyor-36.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pyarrow - pytables - python-dateutil diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml index 81a48d4edf11c..e037877819b14 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/circle-27-compat.yaml @@ -4,11 +4,11 @@ channels: - conda-forge dependencies: - bottleneck=1.0.0 - - cython=0.24 + - cython=0.28.2 - jinja2=2.8 - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - - numpy=1.9.2 - - openpyxl + - numpy=1.9.3 + - openpyxl=2.5.5 - psycopg2 - pytables=3.2.2 - python-dateutil=2.5.0 diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml index 602c414b49bb2..745678791458d 100644 --- a/ci/circle-35-ascii.yaml +++ b/ci/circle-35-ascii.yaml @@ -2,7 +2,7 @@ name: pandas channels: - defaults dependencies: - - cython + - cython>=0.28.2 - nomkl - numpy - python-dateutil diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml index cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale.yaml +++ b/ci/circle-36-locale.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml index 
cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/circle-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/install_circle.sh b/ci/install_circle.sh index 5ffff84c88488..f8bcf6bcffc99 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -6,14 +6,7 @@ echo "[home_dir: $home_dir]" echo "[ls -ltr]" ls -ltr -echo "[Using clean Miniconda install]" -rm -rf "$MINICONDA_DIR" - -# install miniconda -wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 -bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 - -export PATH="$MINICONDA_DIR/bin:$PATH" +apt-get update -y && apt-get install -y build-essential postgresql-client-9.6 echo "[update conda]" conda config --set ssl_verify false || exit 1 @@ -48,9 +41,17 @@ source $ENVS_FILE # edit the locale override if needed if [ -n "$LOCALE_OVERRIDE" ]; then + + apt-get update && apt-get -y install locales locales-all + + export LANG=$LOCALE_OVERRIDE + export LC_ALL=$LOCALE_OVERRIDE + + python -c "import locale; locale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")" || exit 1; + echo "[Adding locale to the first line of pandas/__init__.py]" rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")\n" sed -i "$sedc" pandas/__init__.py echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh deleted file mode 100755 index a00f74f009f54..0000000000000 --- a/ci/install_db_circle.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "installing dbs" -mysql -e 'create database pandas_nosetest;' -psql -c 'create database pandas_nosetest;' -U postgres - -echo "done" -exit 0 diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e8cfcdf80f2e8..ca60c772392e7 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -11,7 +11,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql pytables diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 877c52fa0b4fd..a6009c270c2a6 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -13,7 +13,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql tables @@ -26,4 +26,4 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 435985bd42148..fc2a8b849a354 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -6,4 +6,4 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index 1312c1296d46a..eacae4630edeb 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.24 - lxml - matplotlib=1.4.3 - - numpy=1.9.2 + - numpy=1.9.3 - openpyxl=2.4.0 - python-dateutil - python-blosc diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 22b993a2da886..26a520a16a4cc 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -27,6 
+27,7 @@ dependencies: - PyCrypto - pymysql=0.6.3 - pytables + - blosc=1.14.3 - python-blosc - python-dateutil=2.5.0 - python=2.7* diff --git a/ci/travis-35-osx.yaml b/ci/travis-35-osx.yaml index e74abac4c9775..5722d91781999 100644 --- a/ci/travis-35-osx.yaml +++ b/ci/travis-35-osx.yaml @@ -12,7 +12,7 @@ dependencies: - nomkl - numexpr - numpy=1.10.4 - - openpyxl + - openpyxl=2.5.5 - pytables - python=3.5* - pytz diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index 8705b82412e7c..05ff26020ac7d 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -21,7 +21,7 @@ dependencies: - notebook - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pandoc - pyqt - pytables diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml index 6c475dc48723c..ae6353216cc2d 100644 --- a/ci/travis-36-slow.yaml +++ b/ci/travis-36-slow.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - patsy - psycopg2 - pymysql diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index 006276ba1a65f..83f963b9d9b6d 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -17,7 +17,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pyarrow - pymysql diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 66415defba6fe..0000000000000 --- a/circle.yml +++ /dev/null @@ -1,38 +0,0 @@ -machine: - environment: - # these are globally set - MINICONDA_DIR: /home/ubuntu/miniconda3 - - -database: - override: - - ./ci/install_db_circle.sh - - -checkout: - post: - # since circleci does a shallow fetch - # we need to populate our tags - - git fetch --depth=1000 - - -dependencies: - override: - - > - case $CIRCLE_NODE_INDEX in - 0) - sudo apt-get install language-pack-it && ./ci/install_circle.sh JOB="2.7_COMPAT" ENV_FILE="ci/circle-27-compat.yaml" LOCALE_OVERRIDE="it_IT.UTF-8" ;; - 1) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE" ENV_FILE="ci/circle-36-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 2) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE_SLOW" ENV_FILE="ci/circle-36-locale_slow.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 3) - ./ci/install_circle.sh JOB="3.5_ASCII" ENV_FILE="ci/circle-35-ascii.yaml" LOCALE_OVERRIDE="C" ;; - esac - - ./ci/show_circle.sh - - -test: - override: - - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: - parallel: true diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 056924f2c6663..743cbc107cce5 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,5 +1,6 @@ import locale import calendar +import unicodedata import pytest @@ -7,7 +8,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (Index, DatetimeIndex, datetime, offsets, - date_range, Timestamp) + date_range, Timestamp, compat) class TestTimeSeries(object): @@ -284,10 +285,24 @@ def test_datetime_name_accessors(self, time_locale): dti = DatetimeIndex(freq='M', start='2012', end='2013') result = dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes + # https://github.com/pandas-dev/pandas/issues/22342 + if not compat.PY2: + 
result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_index_equal(result, expected) + for date, expected in zip(dti, expected_months): result = date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", expected) + + assert result == expected dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.month_name(locale=time_locale)[-1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 05ceace20f5a4..1b9cbc57865d2 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -2,6 +2,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.util.testing import assert_frame_equal, assert_raises_regex @@ -31,6 +32,7 @@ def test_read_zipped_json(datapath): assert_frame_equal(uncompressed_df, compressed_df) +@td.skip_if_not_us_locale def test_with_s3_url(compression): boto3 = pytest.importorskip('boto3') pytest.importorskip('s3fs') diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index bcbac4400c953..b5a2be87de1c4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -15,6 +15,7 @@ assert_series_equal, network, ensure_clean, assert_index_equal) import pandas.util.testing as tm +import pandas.util._test_decorators as td _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -1040,6 +1041,7 @@ def test_read_inline_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) + @td.skip_if_not_us_locale def test_read_s3_jsonl(self, s3_resource): # GH17200 diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index e2243b8087a5b..72d2c5fd8d18f 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -55,10 +55,12 @@ def tips_df(datapath): @pytest.mark.usefixtures("s3_resource") +@td.skip_if_not_us_locale() class TestS3(object): def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') + # more of an integration test due to the not-public contents portion # can probably mock this though.
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4e2b2af0ebfe7..20f403e71fd36 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -576,6 +576,7 @@ def test_read_from_http_url(self, ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_no('s3fs') + @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext): boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 4689c7bea626f..e829506e95b53 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -5,6 +5,7 @@ import dateutil import calendar import locale +import unicodedata import numpy as np from dateutil.tz import tzutc @@ -20,7 +21,7 @@ from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz from pandas.errors import OutOfBoundsDatetime -from pandas.compat import long, PY3 +from pandas.compat import long, PY3, PY2 from pandas.compat.numpy import np_datetime64_compat from pandas import Timestamp, Period, Timedelta, NaT @@ -116,8 +117,21 @@ def test_names(self, data, time_locale): expected_day = calendar.day_name[0].capitalize() expected_month = calendar.month_name[8].capitalize() - assert data.day_name(time_locale) == expected_day - assert data.month_name(time_locale) == expected_month + result_day = data.day_name(time_locale) + result_month = data.month_name(time_locale) + + # Work around https://github.com/pandas-dev/pandas/issues/22342 + # different normalizations + + if not PY2: + expected_day = unicodedata.normalize("NFD", expected_day) + expected_month = unicodedata.normalize("NFD", expected_month) + + result_day = unicodedata.normalize("NFD", result_day,) + result_month = unicodedata.normalize("NFD", result_month) + + assert result_day == expected_day + assert result_month == expected_month # Test NaT nan_ts = Timestamp(NaT) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 47798d0ddd7f5..5e924ac5c8894 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -3,6 +3,7 @@ import locale import calendar +import unicodedata import pytest from datetime import datetime, date @@ -13,7 +14,8 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range, - PeriodIndex, DatetimeIndex, TimedeltaIndex) + PeriodIndex, DatetimeIndex, TimedeltaIndex, + compat) import pandas.core.common as com from pandas.util.testing import assert_series_equal @@ -309,10 +311,24 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): s = Series(DatetimeIndex(freq='M', start='2012', end='2013')) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) + + # work around https://github.com/pandas-dev/pandas/issues/22342 + if not compat.PY2: + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_series_equal(result, expected) + for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = 
unicodedata.normalize("NFD", expected) + + assert result == expected + s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 145be7f85b193..c049dfc874940 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -433,6 +433,26 @@ def teardown_class(cls): del cls.locales del cls.current_locale + def test_can_set_locale_valid_set(self): + # Setting the default locale should return True + assert tm.can_set_locale('') is True + + def test_can_set_locale_invalid_set(self): + # Setting an invalid locale should return False + assert tm.can_set_locale('non-existent_locale') is False + + def test_can_set_locale_invalid_get(self, monkeypatch): + # In some cases, an invalid locale can be set, + # but a subsequent getlocale() raises a ValueError + # See GH 22129 + + def mockgetlocale(): + raise ValueError() + + with monkeypatch.context() as m: + m.setattr(locale, 'getlocale', mockgetlocale) + assert tm.can_set_locale('') is False + def test_get_locales(self): # all systems should have at least a single locale assert len(tm.get_locales()) > 0 @@ -466,7 +486,7 @@ def test_set_locale(self): enc = codecs.lookup(enc).name new_locale = lang, enc - if not tm._can_set_locale(new_locale): + if not tm.can_set_locale(new_locale): with pytest.raises(locale.Error): with tm.set_locale(new_locale): pass diff --git a/pandas/util/testing.py b/pandas/util/testing.py index b7edbff00a4b9..bb79c25126fab 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -478,6 +478,8 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): A string of the form <language_country>.<encoding>. For example to set the current locale to US English with a UTF8 encoding, you would pass "en_US.UTF-8". + lc_var : int, default `locale.LC_ALL` + The category of the locale being set.
Returns ------- - isvalid : bool + is_valid : bool Whether the passed locale can be set """ try: - with set_locale(lc): + with set_locale(lc, lc_var=lc_var): pass - except locale.Error: # horrible name for a Exception subclass + except (ValueError, + locale.Error): # horrible name for a Exception subclass return False else: return True @@ -546,7 +548,7 @@ def _valid_locales(locales, normalize): else: normalizer = lambda x: x.strip() - return list(filter(_can_set_locale, map(normalizer, locales))) + return list(filter(can_set_locale, map(normalizer, locales))) # ----------------------------------------------------------------------------- # Stdout / stderr decorators From af7b0ba461a5b81733afdc7fc816a869b798093d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 11 Sep 2018 14:45:25 -0700 Subject: [PATCH 55/55] BUG: NaN should have pct rank of NaN (#22634) Backport of gh-22600. --- doc/source/whatsnew/v0.23.5.txt | 3 +++ pandas/_libs/groupby_helper.pxi.in | 7 ++++++- pandas/tests/groupby/test_rank.py | 19 ++++++++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 304ab12752ad4..f69e38e7fdd50 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -20,6 +20,9 @@ and bug fixes. We recommend that all users upgrade to this version. Fixed Regressions ~~~~~~~~~~~~~~~~~ +- Calling :meth:`DataFrameGroupBy.rank` and :meth:`SeriesGroupBy.rank` with empty groups + and ``pct=True`` was raising a ``ZeroDivisionError`` due to `c1068d9 + `_ (:issue:`22519`) - - diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b3e9b7c9e69ee..d7885e112a7e0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -587,7 +587,12 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if pct: for i in range(N): - out[i, 0] = out[i, 0] / grp_sizes[i, 0] + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. + if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + else: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 203c3c73bec94..d978e144e5013 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -1,7 +1,7 @@ import pytest import numpy as np import pandas as pd -from pandas import DataFrame, concat +from pandas import DataFrame, Series, concat from pandas.util import testing as tm @@ -252,3 +252,20 @@ def test_rank_object_raises(ties_method, ascending, na_option, df.groupby('key').rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({ + "A": [0, 1, 0], + "B": [1., np.nan, 2.] + }) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected)
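
Note on the groupby_helper change above: the final percentage pass now skips NaN ranks instead of dividing them by a group size that can be zero for an all-NaN group (the ZeroDivisionError in gh-22519). A plain-Python sketch of that last pass — an illustration of the logic only, not the actual Cython kernel; pct_from_ranks is a made-up name:

    import numpy as np

    def pct_from_ranks(ranks, grp_sizes):
        # ranks: within-group ranks, NaN where the input value was NaN
        # grp_sizes: per-row count of non-NaN values in that row's group
        out = np.empty(len(ranks), dtype="float64")
        for i, (rank, size) in enumerate(zip(ranks, grp_sizes)):
            if np.isnan(rank):
                # NaN values are excluded from percentage rankings, so
                # they get NaN rather than rank / size, which would
                # divide by zero when the whole group is NaN.
                out[i] = np.nan
            else:
                out[i] = rank / size
        return out

    # Mirrors test_rank_empty_group: B = [1.0, NaN, 2.0] grouped by
    # A = [0, 1, 0] gives ranks [1.0, NaN, 2.0] and group sizes [2, 0, 2].
    print(pct_from_ranks(np.array([1.0, np.nan, 2.0]), np.array([2, 0, 2])))
    # -> [0.5 nan 1. ]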