Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 30 additions & 24 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
intended for public consumption
"""
from textwrap import dedent
from typing import Dict
from typing import Dict, Optional, Tuple, Union
from warnings import catch_warnings, simplefilter, warn

import numpy as np
@@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
Returns
-------
labels : ndarray
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(labels)`` will have the same values as `values`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
@@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
With ``sort=True``, the `uniques` will be sorted, and `labels` will be
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is maintained.
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)
Missing values are indicated in `labels` with `na_sentinel`
Missing values are indicated in `codes` with `na_sentinel`
(``-1`` by default). Note that missing values are never
included in `uniques`.
>>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
@@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
will differ. For Categoricals, a `Categorical` is returned.
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
[a, c]
@@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
returned.
>>> cat = pd.Series(['a', 'a', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')
@@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
@@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
)
@Appender(_shared_docs["factorize"])
@deprecate_kwarg(old_arg_name="order", new_arg_name=None)
def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
def factorize(
values,
sort: bool = False,
order=None,
na_sentinel: int = -1,
size_hint: Optional[int] = None,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
# 2.) factorizing labels and uniques
# 3.) Maybe boxing the output in an Index
# 2.) factorizing codes and uniques
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should remain labels

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're factorizing the values into codes and uniques, so should be codes?

Maybe the sentence should actually be worded as "2) factorizing values into codes and uniques" (more explicit)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

codes is correct; we are changing on purpose to conform with other usages in the codebase

# 3.) Maybe boxing the uniques in an Index
#
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
@@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

if is_extension_array_dtype(values):
values = extract_array(values)
labels, uniques = values.factorize(na_sentinel=na_sentinel)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
values, dtype = _ensure_data(values)
@@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
else:
na_value = None

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
)

if sort and len(uniques) > 0:
uniques, labels = safe_sort(
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
uniques, codes = safe_sort(
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
)

uniques = _reconstruct_data(uniques, dtype, original)
@@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

uniques = Index(uniques)

return labels, uniques
return codes, uniques


def value_counts(
8 changes: 4 additions & 4 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
@@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
Parameters
----------
na_sentinel : int, default -1
Value to use in the `labels` array to indicate missing values.
Value to use in the `codes` array to indicate missing values.
Returns
-------
labels : ndarray
codes : ndarray
An integer NumPy array that's an indexer into the original
ExtensionArray.
uniques : ExtensionArray
@@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
# Complete control over factorization.
arr, na_value = self._values_for_factorize()

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
arr, na_sentinel=na_sentinel, na_value=na_value
)

uniques = self._from_factorized(uniques, self)
return labels, uniques
return codes, uniques

_extension_array_shared_docs[
"repeat"
6 changes: 3 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
@@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1):
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
# The sparsity on this is backwards from what Sparse would want. Want
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of labels, why bother
# Given that we have to return a dense array of codes, why bother
# implementing an efficient factorize?
labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
uniques = SparseArray(uniques, dtype=self.dtype)
return labels, uniques
return codes, uniques

def value_counts(self, dropna=True):
"""
2 changes: 1 addition & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
@@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False):
sort=textwrap.dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
18 changes: 9 additions & 9 deletions pandas/tests/arrays/categorical/test_algos.py
Original file line number Diff line number Diff line change
@@ -11,23 +11,23 @@ def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
labels, uniques = pd.factorize(cat)
expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
codes, uniques = pd.factorize(cat)
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


@@ -36,13 +36,13 @@ def test_factorized_sort_ordered():
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)

labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


20 changes: 10 additions & 10 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
@@ -113,29 +113,29 @@ def test_unique(self, data, box, method):

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize(self, data_for_grouping, na_sentinel):
labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_labels = np.array(
codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_codes = np.array(
[0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
)
expected_uniques = data_for_grouping.take([0, 4, 7])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)

tm.assert_numpy_array_equal(l1, l2)
self.assert_extension_array_equal(u1, u2)
tm.assert_numpy_array_equal(codes_1, codes_2)
self.assert_extension_array_equal(uniques_1, uniques_2)

def test_factorize_empty(self, data):
labels, uniques = pd.factorize(data[:0])
expected_labels = np.array([], dtype=np.intp)
codes, uniques = pd.factorize(data[:0])
expected_codes = np.array([], dtype=np.intp)
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

def test_fillna_copy_frame(self, data_missing):
140 changes: 70 additions & 70 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
@@ -32,56 +32,56 @@
class TestFactorize:
def test_basic(self):

labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

labels, uniques = algos.factorize(
codes, uniques = algos.factorize(
["a", "b", "b", "a", "a", "c", "c", "c"], sort=True
)
exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array(["a", "b", "c"], dtype=object)
tm.assert_numpy_array_equal(uniques, exp)

labels, uniques = algos.factorize(list(reversed(range(5))))
codes, uniques = algos.factorize(list(reversed(range(5))))
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
tm.assert_numpy_array_equal(uniques, exp)

labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
codes, uniques = algos.factorize(list(reversed(range(5))), sort=True)

exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
tm.assert_numpy_array_equal(uniques, exp)

labels, uniques = algos.factorize(list(reversed(np.arange(5.0))))
codes, uniques = algos.factorize(list(reversed(np.arange(5.0))))
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64)
tm.assert_numpy_array_equal(uniques, exp)

labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True)
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)
tm.assert_numpy_array_equal(uniques, exp)

def test_mixed(self):

# doc example reshaping.rst
x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
labels, uniques = algos.factorize(x)
codes, uniques = algos.factorize(x)

exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = Index(["A", "B", 3.14, np.inf])
tm.assert_index_equal(uniques, exp)

labels, uniques = algos.factorize(x, sort=True)
codes, uniques = algos.factorize(x, sort=True)
exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = Index([3.14, np.inf, "A", "B"])
tm.assert_index_equal(uniques, exp)

@@ -91,16 +91,16 @@ def test_datelike(self):
v1 = Timestamp("20130101 09:00:00.00004")
v2 = Timestamp("20130101")
x = Series([v1, v1, v1, v2, v2, v1])
labels, uniques = algos.factorize(x)
codes, uniques = algos.factorize(x)

exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = DatetimeIndex([v1, v2])
tm.assert_index_equal(uniques, exp)

labels, uniques = algos.factorize(x, sort=True)
codes, uniques = algos.factorize(x, sort=True)
exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
exp = DatetimeIndex([v2, v1])
tm.assert_index_equal(uniques, exp)

@@ -110,28 +110,28 @@ def test_datelike(self):
x = Series([v1, v1, v1, v2, v2, v1])

# periods are not 'sorted' as they are converted back into an index
labels, uniques = algos.factorize(x)
codes, uniques = algos.factorize(x)
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))

labels, uniques = algos.factorize(x, sort=True)
codes, uniques = algos.factorize(x, sort=True)
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))

# GH 5986
v1 = pd.to_timedelta("1 day 1 min")
v2 = pd.to_timedelta("1 day")
x = Series([v1, v2, v1, v1, v2, v2, v1])
labels, uniques = algos.factorize(x)
codes, uniques = algos.factorize(x)
exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))

labels, uniques = algos.factorize(x, sort=True)
codes, uniques = algos.factorize(x, sort=True)
exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
tm.assert_numpy_array_equal(codes, exp)
tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))

def test_factorize_nan(self):
@@ -158,7 +158,7 @@ def test_factorize_nan(self):
tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)

@pytest.mark.parametrize(
"data,expected_label,expected_level",
"data, expected_codes, expected_uniques",
[
(
[(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
@@ -173,14 +173,14 @@ def test_factorize_nan(self):
([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
],
)
def test_factorize_tuple_list(self, data, expected_label, expected_level):
def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
# GH9454
result = pd.factorize(data)
codes, uniques = pd.factorize(data)

tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp))
tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))

expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object)
tm.assert_numpy_array_equal(result[1], expected_level_array)
expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
tm.assert_numpy_array_equal(uniques, expected_uniques_array)

def test_complex_sorting(self):
# gh 12666 - check no segfault
@@ -197,52 +197,52 @@ def test_complex_sorting(self):
def test_float64_factorize(self, writable):
data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
data.setflags(write=writable)
exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_uint64_factorize(self, writable):
data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64)
data.setflags(write=writable)
exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64)
expected_codes = np.array([0, 1, 0], dtype=np.intp)
expected_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_int64_factorize(self, writable):
data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64)
data.setflags(write=writable)
exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64)
expected_codes = np.array([0, 1, 0], dtype=np.intp)
expected_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_string_factorize(self, writable):
data = np.array(["a", "c", "a", "b", "c"], dtype=object)
data.setflags(write=writable)
exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp)
exp_uniques = np.array(["a", "c", "b"], dtype=object)
expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
expected_uniques = np.array(["a", "c", "b"], dtype=object)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_object_factorize(self, writable):
data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object)
data.setflags(write=writable)
exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
exp_uniques = np.array(["a", "c", "b"], dtype=object)
expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
expected_uniques = np.array(["a", "c", "b"], dtype=object)

labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
codes, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

def test_deprecate_order(self):
# gh 19727 - check warning is raised for deprecated keyword, order.
@@ -263,11 +263,11 @@ def test_deprecate_order(self):
)
def test_parametrized_factorize_na_value_default(self, data):
# arrays that include the NA default for that type, but it isn't used.
l, u = algos.factorize(data)
codes, uniques = algos.factorize(data)
expected_uniques = data[[0, 1]]
expected_labels = np.array([0, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(l, expected_labels)
tm.assert_numpy_array_equal(u, expected_uniques)
expected_codes = np.array([0, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize(
"data, na_value",
@@ -282,11 +282,11 @@ def test_parametrized_factorize_na_value_default(self, data):
],
)
def test_parametrized_factorize_na_value(self, data, na_value):
l, u = algos._factorize_array(data, na_value=na_value)
codes, uniques = algos._factorize_array(data, na_value=na_value)
expected_uniques = data[[1, 3]]
expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(l, expected_labels)
tm.assert_numpy_array_equal(u, expected_uniques)
expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("na_sentinel", [-1, -10, 100])
@@ -305,14 +305,14 @@ def test_parametrized_factorize_na_value(self, data, na_value):
ids=["numpy_array", "extension_array"],
)
def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
if sort:
expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
expected_uniques = algos.safe_sort(uniques)
else:
expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
expected_uniques = uniques
tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
if isinstance(data, np.ndarray):
tm.assert_numpy_array_equal(uniques, expected_uniques)
else:
12 changes: 6 additions & 6 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
@@ -707,9 +707,9 @@ def test_factorize(self):
else:
exp_arr = np.array(range(len(o)), dtype=np.intp)
exp_uniques = o
labels, uniques = o.factorize()
codes, uniques = o.factorize()

tm.assert_numpy_array_equal(labels, exp_arr)
tm.assert_numpy_array_equal(codes, exp_arr)
if isinstance(o, Series):
tm.assert_index_equal(uniques, Index(orig), check_names=False)
else:
@@ -736,9 +736,9 @@ def test_factorize_repeated(self):
exp_arr = np.array(
[5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp
)
labels, uniques = n.factorize(sort=True)
codes, uniques = n.factorize(sort=True)

tm.assert_numpy_array_equal(labels, exp_arr)
tm.assert_numpy_array_equal(codes, exp_arr)
if isinstance(o, Series):
tm.assert_index_equal(
uniques, Index(orig).sort_values(), check_names=False
@@ -747,8 +747,8 @@ def test_factorize_repeated(self):
tm.assert_index_equal(uniques, o, check_names=False)

exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp)
labels, uniques = n.factorize(sort=False)
tm.assert_numpy_array_equal(labels, exp_arr)
codes, uniques = n.factorize(sort=False)
tm.assert_numpy_array_equal(codes, exp_arr)

if isinstance(o, Series):
expected = Index(o.iloc[5:10].append(o.iloc[:5]))