Skip to content

Commit f390f9b

Browse files
committed
BUG: fix bugs in tests, more resampling
1 parent 93c9fb3 commit f390f9b

File tree

6 files changed: +148 additions, -59 deletions

pandas/core/groupby.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -796,7 +796,7 @@ def get_grouper(self, obj):
796796
class BinGrouper(Grouper):
797797

798798
def __init__(self, bins, binlabels, filter_empty=False):
799-
self.bins = bins
799+
self.bins = com._ensure_int32(bins)
800800
self.binlabels = _ensure_index(binlabels)
801801
self._filter_empty_groups = filter_empty
802802

@@ -1048,6 +1048,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
10481048
if isinstance(key, CustomGrouper):
10491049
gpr = key.get_grouper(obj)
10501050
return gpr, []
1051+
elif isinstance(key, Grouper):
1052+
return key, []
10511053

10521054
if not isinstance(key, (tuple, list)):
10531055
keys = [key]

pandas/core/internals.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,12 @@ def interpolate(self, method='pad', axis=0, inplace=False, limit=None):
232232

233233
return make_block(values, self.items, self.ref_items)
234234

235+
def take(self, indexer, axis=1, fill_value=np.nan):
236+
assert(axis >= 1)
237+
new_values = com.take_fast(self.values, indexer, None,
238+
None, axis=axis,
239+
fill_value=fill_value)
240+
return make_block(new_values, self.items, self.ref_items)
235241

236242
#-------------------------------------------------------------------------------
237243
# Is this even possible?

pandas/src/groupby.pyx

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,9 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner,
607607
bins[bc] = j
608608
bc += 1
609609

610+
# if len(bins) > 0 and bins[-1] == lenidx:
611+
# bins = bins[:-1]
612+
610613
return bins
611614

612615
# add passing bin edges, instead of labels
@@ -628,7 +631,10 @@ def group_add_bin(ndarray[float64_t, ndim=2] out,
628631
nobs = np.zeros_like(out)
629632
sumx = np.zeros_like(out)
630633

631-
ngroups = len(bins) + 1
634+
if bins[len(bins) - 1] == len(values):
635+
ngroups = len(bins)
636+
else:
637+
ngroups = len(bins) + 1
632638
N, K = (<object> values).shape
633639

634640
b = 0
@@ -660,7 +666,7 @@ def group_add_bin(ndarray[float64_t, ndim=2] out,
660666

661667
for i in range(ngroups):
662668
for j in range(K):
663-
if nobs[i] == 0:
669+
if nobs[i, j] == 0:
664670
out[i, j] = nan
665671
else:
666672
out[i, j] = sumx[i, j]
@@ -682,7 +688,10 @@ def group_prod_bin(ndarray[float64_t, ndim=2] out,
682688
nobs = np.zeros_like(out)
683689
prodx = np.ones_like(out)
684690

685-
ngroups = len(bins) + 1
691+
if bins[len(bins) - 1] == len(values):
692+
ngroups = len(bins)
693+
else:
694+
ngroups = len(bins) + 1
686695
N, K = (<object> values).shape
687696

688697
b = 0
@@ -714,7 +723,7 @@ def group_prod_bin(ndarray[float64_t, ndim=2] out,
714723

715724
for i in range(ngroups):
716725
for j in range(K):
717-
if nobs[i] == 0:
726+
if nobs[i, j] == 0:
718727
out[i, j] = nan
719728
else:
720729
out[i, j] = prodx[i, j]
@@ -738,8 +747,11 @@ def group_min_bin(ndarray[float64_t, ndim=2] out,
738747
minx = np.empty_like(out)
739748
minx.fill(np.inf)
740749

750+
if bins[len(bins) - 1] == len(values):
751+
ngroups = len(bins)
752+
else:
753+
ngroups = len(bins) + 1
741754

742-
ngroups = len(bins) + 1
743755
N, K = (<object> values).shape
744756

745757
b = 0
@@ -773,7 +785,7 @@ def group_min_bin(ndarray[float64_t, ndim=2] out,
773785

774786
for i in range(ngroups):
775787
for j in range(K):
776-
if nobs[i] == 0:
788+
if nobs[i, j] == 0:
777789
out[i, j] = nan
778790
else:
779791
out[i, j] = minx[i, j]
@@ -796,7 +808,11 @@ def group_max_bin(ndarray[float64_t, ndim=2] out,
796808
maxx = np.empty_like(out)
797809
maxx.fill(-np.inf)
798810

799-
ngroups = len(bins) + 1
811+
if bins[len(bins) - 1] == len(values):
812+
ngroups = len(bins)
813+
else:
814+
ngroups = len(bins) + 1
815+
800816
N, K = (<object> values).shape
801817

802818
b = 0
@@ -830,7 +846,7 @@ def group_max_bin(ndarray[float64_t, ndim=2] out,
830846

831847
for i in range(ngroups):
832848
for j in range(K):
833-
if nobs[i] == 0:
849+
if nobs[i, j] == 0:
834850
out[i, j] = nan
835851
else:
836852
out[i, j] = maxx[i, j]
@@ -851,7 +867,11 @@ def group_ohlc(ndarray[float64_t, ndim=2] out,
851867
float64_t vopen, vhigh, vlow, vclose, NA
852868
bint got_first = 0
853869

854-
ngroups = len(bins) + 1
870+
if bins[len(bins) - 1] == len(values):
871+
ngroups = len(bins)
872+
else:
873+
ngroups = len(bins) + 1
874+
855875
N, K = (<object> values).shape
856876

857877
if out.shape[1] != 4:
@@ -922,7 +942,10 @@ def group_mean_bin(ndarray[float64_t, ndim=2] out,
922942
sumx = np.zeros_like(out)
923943

924944
N, K = (<object> values).shape
925-
ngroups = len(bins) + 1
945+
if bins[len(bins) - 1] == len(values):
946+
ngroups = len(bins)
947+
else:
948+
ngroups = len(bins) + 1
926949

927950
b = 0
928951
if K > 1:
@@ -953,8 +976,8 @@ def group_mean_bin(ndarray[float64_t, ndim=2] out,
953976

954977
for i in range(ngroups):
955978
for j in range(K):
956-
count = nobs[i]
957-
if nobs[i] == 0:
979+
count = nobs[i, j]
980+
if nobs[i, j] == 0:
958981
out[i, j] = nan
959982
else:
960983
out[i, j] = sumx[i, j] / count
@@ -975,7 +998,11 @@ def group_var_bin(ndarray[float64_t, ndim=2] out,
975998
sumx = np.zeros_like(out)
976999
sumxx = np.zeros_like(out)
9771000

978-
ngroups = len(bins) + 1
1001+
if bins[len(bins) - 1] == len(values):
1002+
ngroups = len(bins)
1003+
else:
1004+
ngroups = len(bins) + 1
1005+
9791006
N, K = (<object> values).shape
9801007

9811008
b = 0
@@ -1010,7 +1037,7 @@ def group_var_bin(ndarray[float64_t, ndim=2] out,
10101037

10111038
for i in range(ngroups):
10121039
for j in range(K):
1013-
ct = nobs[i]
1040+
ct = nobs[i, j]
10141041
if ct < 2:
10151042
out[i, j] = nan
10161043
else:

pandas/tests/test_tseries.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -289,14 +289,15 @@ def test_generate_bins():
289289
bins = func(values, binner, closed='left')
290290
assert((bins == np.array([2, 5, 6])).all())
291291

292-
bins = func(values, binner, closed='left')
293-
assert((bins == np.array([2, 5, 6])).all())
294-
295292
bins = func(values, binner, closed='right')
296293
assert((bins == np.array([3, 6, 6])).all())
297294

295+
for func in [lib.generate_bins_dt64, generate_bins_generic]:
296+
values = np.array([1,2,3,4,5,6], dtype=np.int64)
297+
binner = np.array([0,3,6], dtype=np.int64)
298+
298299
bins = func(values, binner, closed='right')
299-
assert((bins == np.array([3, 6, 6])).all())
300+
assert((bins == np.array([3, 6])).all())
300301

301302
class TestBinGroupers(unittest.TestCase):
302303

@@ -338,22 +339,22 @@ def _check_versions(self, irr_func, bin_func, np_func):
338339

339340
assert_almost_equal(out, exp)
340341

341-
# duplicate bins
342342
bins = np.array([3, 9, 10], dtype=np.int32)
343-
out = np.zeros((4, 1), np.float64)
343+
out = np.zeros((3, 1), np.float64)
344344
counts = np.zeros(len(out), dtype=np.int32)
345345
bin_func(out, counts, obj, bins)
346346
exp = np.array([np_func(obj[:3]), np_func(obj[3:9]),
347-
np_func(obj[9:]), np.nan],
347+
np_func(obj[9:])],
348348
dtype=np.float64)
349349
assert_almost_equal(out.squeeze(), exp)
350350

351+
# duplicate bins
351352
bins = np.array([3, 6, 10, 10], dtype=np.int32)
352-
out = np.zeros((5, 1), np.float64)
353+
out = np.zeros((4, 1), np.float64)
353354
counts = np.zeros(len(out), dtype=np.int32)
354355
bin_func(out, counts, obj, bins)
355356
exp = np.array([np_func(obj[:3]), np_func(obj[3:6]),
356-
np_func(obj[6:10]), np.nan, np.nan],
357+
np_func(obj[6:10]), np.nan],
357358
dtype=np.float64)
358359
assert_almost_equal(out.squeeze(), exp)
359360

pandas/tseries/resample.py

Lines changed: 54 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -73,34 +73,33 @@ def _get_time_grouper(self, obj):
7373

7474
if self.kind is None or self.kind == 'timestamp':
7575
binner, bins, binlabels = self._get_time_bins(axis)
76-
grouper = BinGrouper(bins, binlabels)
7776
else:
78-
index = binner = PeriodIndex(start=axis[0], end=axis[-1],
79-
freq=self.freq)
80-
81-
end_stamps = (index + 1).asfreq('D', 's').to_timestamp()
82-
bins = axis.searchsorted(end_stamps, side='left')
83-
84-
grouper = BinGrouper(bins, index)
77+
binner, bins, binlabels = self._get_time_period_bins(axis)
8578

79+
grouper = BinGrouper(bins, binlabels)
8680
return binner, grouper
8781

8882
def _get_time_bins(self, axis):
8983
return _make_time_bins(axis, self.freq, begin=self.begin,
90-
end=self.end, nperiods=self.nperiods,
91-
closed=self.closed, label=self.label)
84+
end=self.end, closed=self.closed,
85+
label=self.label)
86+
87+
def _get_time_period_bins(self, axis):
88+
return _make_period_bins(axis, self.freq, begin=self.begin,
89+
end=self.end, closed=self.closed,
90+
label=self.label)
9291

9392
def _resample_timestamps(self, obj):
94-
axis = obj._get_axis(self.axis)
93+
axlabels = obj._get_axis(self.axis)
9594

9695
binner, grouper = self._get_time_grouper(obj)
9796

9897
# downsamples
99-
if len(grouper.binlabels) < len(axis):
100-
grouped = obj.groupby(grouper, axis=axis)
98+
if len(grouper.binlabels) < len(axlabels):
99+
grouped = obj.groupby(grouper, axis=self.axis)
101100
result = grouped.agg(self.how)
102101
else:
103-
assert(axis == 0)
102+
assert(self.axis == 0)
104103
# upsampling
105104

106105
# this is sort of a hack
@@ -115,12 +114,12 @@ def _resample_timestamps(self, obj):
115114
return result
116115

117116
def _resample_periods(self, obj):
118-
axis = obj._get_axis(self.axis)
117+
axlabels = obj._get_axis(self.axis)
119118

120119
# Start vs. end of period
121-
memb = axis.asfreq(self.freq, how=self.convention)
120+
memb = axlabels.asfreq(self.freq, how=self.convention)
122121

123-
if is_subperiod(self.axis.freq, self.freq):
122+
if is_subperiod(axlabels.freq, self.freq):
124123
# Downsampling
125124
if len(memb) > 1:
126125
rng = np.arange(memb.values[0], memb.values[-1])
@@ -131,9 +130,9 @@ def _resample_periods(self, obj):
131130
index = period_range(memb[0], memb[-1], freq=self.freq)
132131
grouper = BinGrouper(bins, index)
133132

134-
grouped = obj.groupby(grouper, axis=axis)
133+
grouped = obj.groupby(grouper, axis=self.axis)
135134
return grouped.agg(self.how)
136-
elif is_superperiod(self.axis.freq, self.freq):
135+
elif is_superperiod(axlabels.freq, self.freq):
137136
# Generate full range
138137
new_index = period_range(memb[0], memb[-1], freq=self.freq)
139138

@@ -144,7 +143,7 @@ def _resample_periods(self, obj):
144143
return _take_new_index(obj, indexer, new_index, axis=self.axis)
145144
else:
146145
raise ValueError('Frequency %s cannot be resampled to %s'
147-
% (self.axis.freq, self.freq))
146+
% (axlabels.freq, self.freq))
148147

149148

150149
def _take_new_index(obj, indexer, new_index, axis=0):
@@ -168,31 +167,47 @@ def _take_new_index(obj, indexer, new_index, axis=0):
168167
raise NotImplementedError
169168

170169

171-
def _make_period_bins(axis, freq):
172-
index = PeriodIndex(start=axis[0], end=axis[-1], freq=freq)
173-
end_stamps = (index + 1).asfreq('D', 's').to_timestamp()
170+
def _make_period_bins(axis, freq, begin=None, end=None,
171+
closed='right', label='right'):
172+
assert(isinstance(axis, DatetimeIndex))
173+
174+
if len(axis) == 0:
175+
# TODO: Should we be a bit more careful here?
176+
return [], [], []
177+
178+
first, last = _get_range_edges(axis, begin, end, freq, closed=closed)
179+
binlabels = binner = PeriodIndex(start=first, end=last, freq=freq)
180+
181+
# a little hack
182+
trimmed = False
183+
if len(binner) > 2 and binner[-2] == axis[-1]:
184+
binner = binner[:-1]
185+
trimmed = True
186+
187+
end_stamps = (binlabels + 1).asfreq('D', 's').to_timestamp()
174188
bins = axis.searchsorted(end_stamps, side='left')
175189

176-
return index, bins, index
190+
if label == 'right':
191+
bins = bins[1:]
192+
labels = binner[1:]
193+
elif not trimmed:
194+
labels = binner[:-1]
195+
else:
196+
labels = binner
197+
198+
return binner, bins, labels
177199

178200

179-
def _make_time_bins(axis, freq, begin=None, end=None, nperiods=None,
201+
def _make_time_bins(axis, freq, begin=None, end=None,
180202
closed='right', label='right'):
181203
assert(isinstance(axis, DatetimeIndex))
182204

183205
if len(axis) == 0:
184206
# TODO: Should we be a bit more careful here?
185207
return [], [], []
186208

187-
if isinstance(freq, basestring):
188-
freq = to_offset(freq)
189-
190-
if not isinstance(freq, DateOffset):
191-
raise ValueError("Rule not a recognized offset")
192-
193209
first, last = _get_range_edges(axis, begin, end, freq, closed=closed)
194-
binner = DatetimeIndex(freq=freq, start=first, end=last,
195-
periods=nperiods)
210+
binner = DatetimeIndex(freq=freq, start=first, end=last)
196211

197212
# a little hack
198213
trimmed = False
@@ -213,6 +228,12 @@ def _make_time_bins(axis, freq, begin=None, end=None, nperiods=None,
213228
return binner, bins, labels
214229

215230
def _get_range_edges(axis, begin, end, offset, closed='left'):
231+
if isinstance(offset, basestring):
232+
offset = to_offset(offset)
233+
234+
if not isinstance(offset, DateOffset):
235+
raise ValueError("Rule not a recognized offset")
236+
216237
if begin is None:
217238
if closed == 'left':
218239
first = Timestamp(offset.rollback(axis[0]))

0 commit comments

Comments (0)