Skip to content

Doc for GH 8946 #8952

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 25 additions & 12 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
@@ -328,13 +328,23 @@ old categories must be included in the new categories and no new categories are
Comparisons
-----------

Comparing `Categoricals` with other objects is possible in two cases:
Comparing categorical data with other objects is possible in three cases:

* comparing a categorical Series to another categorical Series, when `categories` and `ordered` are
the same or
* comparing a categorical Series to a scalar.
* comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array,
...) of the same length as the categorical data or
* all comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to
another categorical Series, when ``ordered==True`` and the `categories` are the same or
* all comparisons of categorical data to a scalar.

All other comparisons will raise a TypeError.
All other comparisons, especially "non-equality" comparisons of two categoricals with different
categories or a categorical with any list-like object, will raise a TypeError.

.. note::

Any "non-equality" comparisons of categorical data with a `Series`, `np.array`, `list` or
categorical data with different categories or ordering will raise a `TypeError` because a custom
categories ordering could be interpreted in two ways: one taking the ordering into account
and one without.

.. ipython:: python
@@ -353,6 +363,13 @@ Comparing to a categorical with the same categories and ordering or to a scalar
cat > cat_base
cat > 2
Equality comparisons work with any list-like object of the same length and with scalars:

.. ipython:: python
cat == cat_base2
cat == 2
This doesn't work because the categories are not the same:

.. ipython:: python
@@ -362,13 +379,9 @@ This doesn't work because the categories are not the same:
except TypeError as e:
print("TypeError: " + str(e))
.. note::

Comparisons with `Series`, `np.array` or a `Categorical` with different categories or ordering
will raise a `TypeError` because a custom categories ordering could be interpreted in two ways:
one taking the ordering into account and one without. If you want to compare a categorical
series with such a type, you need to be explicit and convert the categorical data back to the
original values:
If you want to do a "non-equality" comparison of a categorical series with a list-like object
which is not categorical data, you need to be explicit and convert the categorical data back to
the original values:

.. ipython:: python
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
@@ -59,6 +59,8 @@ API changes
p = pd.Panel(np.random.rand(2, 5, 4) > 0.1)
p.all()

- Allow equality comparisons of Series with a categorical dtype and object dtype; previously these would raise ``TypeError`` (:issue:`8938`)

.. _whatsnew_0152.enhancements:

Enhancements
6 changes: 6 additions & 0 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
@@ -64,6 +64,12 @@ def f(self, other):
else:
return np.repeat(False, len(self))
else:

# allow categorical vs object dtype array comparisons for equality
# these are only positional comparisons
if op in ['__eq__','__ne__']:
return getattr(np.array(self),op)(np.array(other))

msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
"compare values, use 'np.asarray(cat) <op> other'."
raise TypeError(msg.format(op=op,typ=type(other)))
49 changes: 26 additions & 23 deletions pandas/core/ops.py
Original file line number Diff line number Diff line change
@@ -541,10 +541,13 @@ def _comp_method_SERIES(op, name, str_rep, masker=False):
"""
def na_op(x, y):

if com.is_categorical_dtype(x) != (not np.isscalar(y) and com.is_categorical_dtype(y)):
msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \
"compare values, use 'series <op> np.asarray(cat)'."
raise TypeError(msg.format(op=op,typ=type(y)))
# dispatch to the categorical if we have a categorical
# in either operand
if com.is_categorical_dtype(x):
return op(x,y)
elif com.is_categorical_dtype(y) and not lib.isscalar(y):
return op(y,x)

if x.dtype == np.object_:
if isinstance(y, list):
y = lib.list_to_object_array(y)
@@ -586,33 +589,33 @@ def wrapper(self, other):
msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
"If you want to compare values, use 'series <op> np.asarray(other)'."
raise TypeError(msg.format(op=op,typ=self.dtype))
else:

mask = isnull(self)

values = self.get_values()
other = _index.convert_scalar(values,_values_from_object(other))
mask = isnull(self)

if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
values = values.view('i8')
values = self.get_values()
other = _index.convert_scalar(values,_values_from_object(other))

# scalars
res = na_op(values, other)
if np.isscalar(res):
raise TypeError('Could not compare %s type with Series'
% type(other))
if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
values = values.view('i8')

# always return a full value series here
res = _values_from_object(res)
# scalars
res = na_op(values, other)
if np.isscalar(res):
raise TypeError('Could not compare %s type with Series'
% type(other))

res = pd.Series(res, index=self.index, name=self.name,
dtype='bool')
# always return a full value series here
res = _values_from_object(res)

# mask out the invalids
if mask.any():
res[mask] = masker
res = pd.Series(res, index=self.index, name=self.name,
dtype='bool')

# mask out the invalids
if mask.any():
res[mask] = masker

return res
return res
return wrapper


54 changes: 53 additions & 1 deletion pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
@@ -2211,11 +2211,63 @@ def f():
tm.assert_series_equal(res, exp)

# And test NaN handling...
cat = pd.Series(pd.Categorical(["a","b","c", np.nan]))
cat = Series(Categorical(["a","b","c", np.nan]))
exp = Series([True, True, True, False])
res = (cat == cat)
tm.assert_series_equal(res, exp)

def test_cat_equality(self):

# GH 8938
# allow equality comparisons
# Fixtures used below:
#   a - categorical Series holding 'a', 'b', 'c'
#   b - object Series with the same values as `a`
#   c - object Series whose last value ('cc') differs from `a`
#   d - object Series with the last two values of `a` swapped
#   e - Categorical with the same values as `a`
#   f - Categorical with the same values as `d`
a = Series(list('abc'),dtype="category")
b = Series(list('abc'),dtype="object")
c = Series(['a','b','cc'],dtype="object")
d = Series(list('acb'),dtype="object")
e = Categorical(list('abc'))
f = Categorical(list('acb'))

# vs scalar
# only the first element of `a` equals 'a', so the comparison must not be
# all-True, and `!=` must be the element-wise negation of `==`
self.assertFalse((a=='a').all())
self.assertTrue(((a!='a') == ~(a=='a')).all())

# the same checks with the scalar on the left-hand side (reflected operands)
self.assertFalse(('a'==a).all())
self.assertTrue((a=='a')[0])
self.assertTrue(('a'==a)[0])
self.assertFalse(('a'!=a)[0])

# vs list-like
# a categorical Series compared with itself is equal everywhere
self.assertTrue((a==a).all())
self.assertFalse((a!=a).all())

# equal values in a plain list or an object Series compare equal, in either
# operand order, and `!=` stays the negation of `==`
self.assertTrue((a==list(a)).all())
self.assertTrue((a==b).all())
self.assertTrue((b==a).all())
self.assertTrue(((~(a==b))==(a!=b)).all())
self.assertTrue(((~(b==a))==(b!=a)).all())

# a differing value (`c`) or the same values in another order (`d`) must not
# compare all-equal
self.assertFalse((a==c).all())
self.assertFalse((c==a).all())
self.assertFalse((a==d).all())
self.assertFalse((d==a).all())

# vs a cat-like
# same expectations when the other operand is a Categorical rather than a Series
self.assertTrue((a==e).all())
self.assertTrue((e==a).all())
self.assertFalse((a==f).all())
self.assertFalse((f==a).all())

# `!=` must be the negation of `==` for Categorical operands as well
self.assertTrue(((~(a==e)==(a!=e)).all()))
self.assertTrue(((~(e==a)==(e!=a)).all()))
self.assertTrue(((~(a==f)==(a!=f)).all()))
self.assertTrue(((~(f==a)==(f!=a)).all()))

# non-equality is not comparable
# ordering comparisons between the categorical and the object Series are
# expected to raise TypeError in both operand orders
self.assertRaises(TypeError, lambda: a < b)
self.assertRaises(TypeError, lambda: b < a)
self.assertRaises(TypeError, lambda: a > b)
self.assertRaises(TypeError, lambda: b > a)


def test_concat(self):
cat = pd.Categorical(["a","b"], categories=["a","b"])
vals = [1,2]