Skip to content

Commit d859d15

Browse files
author
y-p
committed
ENH: Add helpers to pd.common: pprint_thing/_encoded(),console_encode()
1 parent cbeff93 commit d859d15

File tree

2 files changed

+92
-10
lines changed

2 files changed

+92
-10
lines changed

pandas/core/common.py

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -867,16 +867,6 @@ def load(path):
867867
finally:
868868
f.close()
869869

870-
def console_encode(value):
871-
if py3compat.PY3 or not isinstance(value, unicode):
872-
return value
873-
874-
try:
875-
import sys
876-
return value.encode(sys.stdin.encoding or 'utf-8', 'replace')
877-
except (AttributeError, TypeError):
878-
return value.encode('ascii', 'replace')
879-
880870
class UTF8Recoder:
881871
"""
882872
Iterator that reads an encoded stream and reencodes the input to UTF-8
@@ -968,3 +958,94 @@ def _concat_compat(to_concat, axis=0):
968958
return new_values.view(_NS_DTYPE)
969959
else:
970960
return np.concatenate(to_concat, axis=axis)
961+
962+
# Unicode consolidation
963+
# ---------------------
964+
#
965+
# pprinting utility functions for generating Unicode text or bytes(3.x)/str(2.x)
966+
# representations of objects.
967+
# Try to use these as much as possible rather than rolling your own.
968+
#
969+
# When to use
970+
# -----------
971+
#
972+
# 1) If you're writing code internal to pandas (no I/O directly involved),
973+
# use pprint_thing().
974+
#
975+
# It will always return unicode text which can be handled by other
976+
# parts of the package without breakage.
977+
#
978+
# 2) If you need to send something to the console, use console_encode().
979+
#
980+
# console_encode() should (hopefully) choose the right encoding for you
981+
# based on the encoding set in fmt.print_config.encoding.
982+
#
983+
# 3) if you need to write something out to file, use pprint_thing_encoded(encoding).
984+
#
985+
# If no encoding is specified, it defaults to utf-8. Since encoding pure ascii with
986+
# utf-8 is a no-op you can safely use the default utf-8 if you're working with
987+
# straight ascii.
988+
989+
def _pprint_seq(seq, _nest_lvl=0):
    """
    internal. Render an iterable as unicode text. You should normally call
    pprint_thing() instead of using this helper directly.

    Objects that expose ``__setitem__`` are wrapped in square brackets,
    everything else in parentheses. Every element is formatted through
    pprint_thing() one nesting level deeper and the pieces are joined
    with ", ".
    """
    # Decide on the bracket style before touching the elements.
    has_setitem = hasattr(seq, '__setitem__')
    pieces = [pprint_thing(item, _nest_lvl + 1) for item in seq]
    body = ", ".join(pieces)
    if has_setitem:
        return u"[%s]" % body
    return u"(%s)" % body
996+
997+
def pprint_thing(thing, _nest_lvl=0):
    """
    Sanctioned way of turning an arbitrary object into unicode text.

    Unlike a plain unicode(object) call, nested sequences that contain
    unicode strings are rendered element by element.

    Parameters
    ----------
    thing : anything to be formatted
    _nest_lvl : internal use only. pprint_thing() and _pprint_seq() call
        each other recursively; this argument carries the current nesting
        level so that recursion stops once print_config.pprint_nest_depth
        is reached.

    Returns
    -------
    result - unicode object on py2, str on py3. Always Unicode.

    """
    from pandas.core.format import print_config

    # None is rendered as the empty string.
    if thing is None:
        return unicode('')

    # Sequences are expanded only while below the configured nesting depth;
    # deeper ones fall through to the plain conversion below.
    if _is_sequence(thing) and _nest_lvl < print_config.pprint_nest_depth:
        return unicode(_pprint_seq(thing, _nest_lvl))

    # Inside the package everything handed in should already be unicode or
    # have a unicode-capable __str__. To ease the transition, utf-8 encoded
    # byte strings are accepted as well; any other encoding cannot be
    # detected here, so undecodable bytes are replaced.
    try:
        text = unicode(thing)  # preferred path, try it first
    except UnicodeDecodeError:
        # either valid utf-8 or we replace the offending bytes
        text = str(thing).decode('utf-8', "replace")

    return unicode(text)  # always unicode
1037+
1038+
def pprint_thing_encoded(object, encoding='utf-8', errors='replace'):
    """
    Return the pprint_thing() text of `object`, encoded.

    Parameters
    ----------
    object : anything accepted by pprint_thing()
    encoding : codec name handed to .encode(), 'utf-8' by default
    errors : error handler handed to .encode(), 'replace' by default

    Returns
    -------
    The result of calling .encode(encoding, errors) on the unicode
    representation of `object`.
    """
    return pprint_thing(object).encode(encoding, errors)
1041+
1042+
def console_encode(object):
    """
    This is the sanctioned way to prepare something for sending
    *to the console*. It delegates to pprint_thing() (through
    pprint_thing_encoded()) to get a unicode representation of the object
    and encodes it with the global encoding set in print_config.encoding.
    Use this everywhere you output to the console.

    Parameters
    ----------
    object : anything accepted by pprint_thing()

    Returns
    -------
    The representation of `object` encoded with print_config.encoding
    (undecodable characters are replaced, per pprint_thing_encoded's
    default error handler).
    """
    # The docstring must be the first statement of the body; placed after
    # the import it is a discarded expression and __doc__ stays None.
    # NOTE(review): the import is kept function-local as in the original,
    # presumably to avoid a circular import with pandas.core.format - confirm.
    from pandas.core.format import print_config
    return pprint_thing_encoded(object, print_config.encoding)

pandas/core/format.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,7 @@ def __init__(self):
10981098
self.notebook_repr_html = True
10991099
self.date_dayfirst = False
11001100
self.date_yearfirst = False
1101+
self.pprint_nest_depth = 3
11011102
self.multi_sparse = True
11021103
self.encoding = self.detect_encoding()
11031104

0 commit comments

Comments
 (0)