Skip to content

DEPR: Deprecate str.split return_type #10085

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
@@ -221,6 +221,28 @@ enhancements are performed to make string operation easier.
idx.str.startswith('a')
s[s.index.str.startswith('a')]


- ``split`` now takes an ``expand`` keyword to specify whether to expand dimensionality. ``return_type`` is deprecated. (:issue:`9847`)

.. ipython:: python

s = Series(['a,b', 'a,c', 'b,c'])

# return Series
s.str.split(',')

# return DataFrame
s.str.split(',', expand=True)

idx = Index(['a,b', 'a,c', 'b,c'])

# return Index
idx.str.split(',')

# return MultiIndex
idx.str.split(',', expand=True)


- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)

.. _whatsnew_0161.api:
@@ -249,6 +271,13 @@ API changes

- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)

.. _whatsnew_0161.deprecations:

Deprecations
^^^^^^^^^^^^

- ``Series.str.split``'s ``return_type`` keyword is deprecated in favor of ``expand`` (:issue:`9847`)

.. _whatsnew_0161.performance:

Performance Improvements
54 changes: 23 additions & 31 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
from pandas.compat import zip
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
import pandas.compat as compat
from pandas.util.decorators import Appender
from pandas.util.decorators import Appender, deprecate_kwarg
import re
import pandas.lib as lib
import warnings
@@ -696,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
return _na_map(f, arr)


def str_split(arr, pat=None, n=None, return_type='series'):
def str_split(arr, pat=None, n=None):
"""
Split each string (a la re.split) in the Series/Index by given
pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -705,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
----------
pat : string, default None
String or regular expression to split on. If None, splits on whitespace
n : int, default None (all)
return_type : {'series', 'index', 'frame'}, default 'series'
If frame, returns a DataFrame (elements are strings)
If series or index, returns the same type as the original object
(elements are lists of strings).
Notes
-----
Both 0 and -1 will be interpreted as return all splits
n : int, default -1 (all)
None, 0 and -1 will be interpreted as return all splits
expand : bool, default False
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index.
return_type : deprecated, use `expand`
Returns
-------
split : Series/Index of objects or DataFrame
split : Series/Index or DataFrame/MultiIndex of objects
"""
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.index import Index

if return_type not in ('series', 'index', 'frame'):
raise ValueError("return_type must be {'series', 'index', 'frame'}")
if return_type == 'frame' and isinstance(arr, Index):
raise ValueError("return_type='frame' is not supported for string "
"methods on Index")
if pat is None:
if n is None or n == 0:
n = -1
@@ -742,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
n = 0
regex = re.compile(pat)
f = lambda x: regex.split(x, maxsplit=n)
if return_type == 'frame':
res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
else:
res = _na_map(f, arr)
res = _na_map(f, arr)
return res


@@ -1083,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
return DataFrame(result, index=self.series.index)

def _wrap_result_expand(self, result, expand=False):
from pandas.core.index import Index
if not isinstance(expand, bool):
raise ValueError("expand must be True or False")

from pandas.core.index import Index, MultiIndex
if not hasattr(result, 'ndim'):
return result

@@ -1096,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):

if expand:
result = list(result)
return Index(result, name=name)
return MultiIndex.from_tuples(result, names=name)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this partially closes #10008 yes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Updated #10008 for current status. The current impl can return MultiIndex for partition (#9773) already, but it doesn't work for split (thus changed).

else:
return Index(result, name=name)
else:
index = self.series.index
if expand:
@@ -1114,10 +1104,12 @@ def cat(self, others=None, sep=None, na_rep=None):
result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
return self._wrap_result(result)

@deprecate_kwarg('return_type', 'expand',
mapping={'series': False, 'frame': True})
@copy(str_split)
def split(self, pat=None, n=-1, return_type='series'):
result = str_split(self.series, pat, n=n, return_type=return_type)
return self._wrap_result(result)
def split(self, pat=None, n=-1, expand=False):
result = str_split(self.series, pat, n=n)
return self._wrap_result_expand(result, expand=expand)

_shared_docs['str_partition'] = ("""
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1131,7 +1123,7 @@ def split(self, pat=None, n=-1, return_type='series'):
String to split on.
expand : bool, default True
* If True, return DataFrame/MultiIndex expanding dimensionality.
* If False, return Series/Index
* If False, return Series/Index.
Returns
-------
11 changes: 6 additions & 5 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
@@ -1280,11 +1280,12 @@ def test_str_attribute(self):
idx = Index(['a b c', 'd e', 'f'])
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
tm.assert_index_equal(idx.str.split(), expected)
tm.assert_index_equal(idx.str.split(return_type='series'), expected)
# return_type 'index' is an alias for 'series'
tm.assert_index_equal(idx.str.split(return_type='index'), expected)
with self.assertRaisesRegexp(ValueError, 'not supported'):
idx.str.split(return_type='frame')
tm.assert_index_equal(idx.str.split(expand=False), expected)

expected = MultiIndex.from_tuples([('a', 'b', 'c'),
('d', 'e', np.nan),
('f', np.nan, np.nan)])
tm.assert_index_equal(idx.str.split(expand=True), expected)

# test boolean case, should return np.array instead of boolean Index
idx = Index(['a1', 'a2', 'b1', 'b2'])
71 changes: 67 additions & 4 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
@@ -1206,14 +1206,19 @@ def test_split(self):
result = values.str.split('__')
tm.assert_series_equal(result, exp)

result = values.str.split('__', expand=False)
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['a_b_c', NA, 'd_e_f', True, datetime.today(),
None, 1, 2.])

rs = Series(mixed).str.split('_')
rs = mixed.str.split('_')
xp = Series([['a', 'b', 'c'], NA, ['d', 'e', 'f'], NA, NA,
NA, NA, NA])
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

rs = mixed.str.split('_', expand=False)
tm.assert_isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

@@ -1226,6 +1231,9 @@ def test_split(self):
[u('f'), u('g'), u('h')]])
tm.assert_series_equal(result, exp)

result = values.str.split('_', expand=False)
tm.assert_series_equal(result, exp)

def test_split_noargs(self):
# #1859
s = Series(['Wes McKinney', 'Travis Oliphant'])
@@ -1259,7 +1267,10 @@ def test_split_no_pat_with_nonzero_n(self):

def test_split_to_dataframe(self):
s = Series(['nosplit', 'alsonosplit'])
result = s.str.split('_', return_type='frame')

with tm.assert_produces_warning():
result = s.str.split('_', return_type='frame')

exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
tm.assert_frame_equal(result, exp)

@@ -1282,9 +1293,61 @@ def test_split_to_dataframe(self):
index=['preserve', 'me'])
tm.assert_frame_equal(result, exp)

with tm.assertRaisesRegexp(ValueError, "return_type must be"):
with tm.assertRaisesRegexp(ValueError, "expand must be"):
s.str.split('_', return_type="some_invalid_type")

def test_split_to_dataframe_expand(self):
    """Exercise ``Series.str.split('_', expand=True)``.

    Four inputs are compared against hand-built DataFrames: strings with
    no delimiter, rows with equal piece counts, rows with unequal piece
    counts (shorter rows are expected to be NA-filled) and a Series with
    a custom index (expected to be carried over to the frame).  Finally,
    a non-boolean value passed through the ``return_type`` keyword is
    expected to raise ``ValueError`` matching "expand must be".
    """
    # delimiter absent -> single column
    no_delim = Series(['nosplit', 'alsonosplit'])
    expected = DataFrame({0: Series(['nosplit', 'alsonosplit'])})
    tm.assert_frame_equal(no_delim.str.split('_', expand=True), expected)

    # every row yields the same number of pieces
    even = Series(['some_equal_splits', 'with_no_nans'])
    expected = DataFrame({0: ['some', 'with'],
                          1: ['equal', 'no'],
                          2: ['splits', 'nans']})
    tm.assert_frame_equal(even.str.split('_', expand=True), expected)

    # rows yield different numbers of pieces
    ragged = Series(['some_unequal_splits', 'one_of_these_things_is_not'])
    expected = DataFrame({0: ['some', 'one'],
                          1: ['unequal', 'of'],
                          2: ['splits', 'these'],
                          3: [NA, 'things'],
                          4: [NA, 'is'],
                          5: [NA, 'not']})
    tm.assert_frame_equal(ragged.str.split('_', expand=True), expected)

    # custom index on the input
    labelled = Series(['some_splits', 'with_index'],
                      index=['preserve', 'me'])
    expected = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']},
                         index=['preserve', 'me'])
    tm.assert_frame_equal(labelled.str.split('_', expand=True), expected)

    # invalid value supplied via the old keyword name
    with tm.assertRaisesRegexp(ValueError, "expand must be"):
        labelled.str.split('_', return_type="some_invalid_type")

def test_split_to_multiindex_expand(self):
    """Exercise ``Index.str.split('_', expand=True)``.

    Three inputs are checked against expected indexes together with the
    resulting ``nlevels``: strings with no delimiter (1 level), rows
    with equal piece counts (3 levels) and rows with unequal piece
    counts (6 levels, shorter rows NA-filled in the expected value).
    Finally, a non-boolean value passed through the ``return_type``
    keyword is expected to raise ``ValueError`` matching
    "expand must be".
    """
    # delimiter absent -> one level
    flat = Index(['nosplit', 'alsonosplit'])
    got = flat.str.split('_', expand=True)
    want = Index([np.array(['nosplit']), np.array(['alsonosplit'])])
    tm.assert_index_equal(got, want)
    self.assertEqual(got.nlevels, 1)

    # every entry yields three pieces
    even = Index(['some_equal_splits', 'with_no_nans'])
    got = even.str.split('_', expand=True)
    want = MultiIndex.from_tuples([('some', 'equal', 'splits'),
                                   ('with', 'no', 'nans')])
    tm.assert_index_equal(got, want)
    self.assertEqual(got.nlevels, 3)

    # entries yield different numbers of pieces
    ragged = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
    got = ragged.str.split('_', expand=True)
    want = MultiIndex.from_tuples(
        [('some', 'unequal', 'splits', NA, NA, NA),
         ('one', 'of', 'these', 'things', 'is', 'not')])
    tm.assert_index_equal(got, want)
    self.assertEqual(got.nlevels, 6)

    # invalid value supplied via the old keyword name
    with tm.assertRaisesRegexp(ValueError, "expand must be"):
        ragged.str.split('_', return_type="some_invalid_type")

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])