Skip to content

Commit c321b3d

Browse files
itholic authored and zhengruifeng committed
[SPARK-43875][PS][TESTS] Enabling Categorical tests for Pandas 2.0.0 and above
### What changes were proposed in this pull request?

This PR proposes to enable Categorical tests for pandas 2.0.0 and above. See https://pandas.pydata.org/docs/whatsnew/v2.0.0.html for more details.

### Why are the changes needed?

To match the behavior of pandas 2.0.0 and above.

### Does this PR introduce _any_ user-facing change?

No, this is test-only.

### How was this patch tested?

Enabling & updating the existing UTs.

Closes #42530 from itholic/pandas_categorical_test.

Authored-by: itholic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 48faaa8 commit c321b3d

File tree

2 files changed

+8
-35
lines changed

2 files changed

+8
-35
lines changed

python/pyspark/pandas/tests/indexes/test_category.py

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,6 @@ def test_categorical_index(self):
7575
):
7676
ps.CategoricalIndex([1, 2, 3]).all()
7777

78-
@unittest.skipIf(
79-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
80-
"TODO(SPARK-43568): Enable CategoricalIndexTests.test_categories_setter for pandas 2.0.0.",
81-
)
8278
def test_categories_setter(self):
8379
pdf = pd.DataFrame(
8480
{
@@ -92,20 +88,10 @@ def test_categories_setter(self):
9288
pidx = pdf.index
9389
psidx = psdf.index
9490

95-
pidx.categories = ["z", "y", "x"]
96-
psidx.categories = ["z", "y", "x"]
97-
# Pandas deprecated all the in-place category-setting behaviors, dtypes also not be
98-
# refreshed in categories.setter since Pandas 1.4+, we should also consider to clean up
99-
# this test when in-place category-setting removed:
100-
# https://github.com/pandas-dev/pandas/issues/46820
101-
if LooseVersion("1.4") >= LooseVersion(pd.__version__) >= LooseVersion("1.1"):
102-
self.assert_eq(pidx, psidx)
103-
self.assert_eq(pdf, psdf)
104-
else:
105-
pidx = pidx.set_categories(pidx.categories)
106-
pdf.index = pidx
107-
self.assert_eq(pidx, psidx)
108-
self.assert_eq(pdf, psdf)
91+
pidx = pidx.rename_categories(["z", "y", "x"])
92+
psidx = psidx.rename_categories(["z", "y", "x"])
93+
self.assert_eq(pidx, psidx)
94+
self.assert_eq(pdf, psdf)
10995

11096
with self.assertRaises(ValueError):
11197
psidx.categories = [1, 2, 3, 4]
@@ -122,10 +108,6 @@ def test_add_categories(self):
122108
self.assertRaises(ValueError, lambda: psidx.add_categories(3))
123109
self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4]))
124110

125-
@unittest.skipIf(
126-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
127-
"TODO(SPARK-43633): Enable CategoricalIndexTests.test_remove_categories for pandas 2.0.0.",
128-
)
129111
def test_remove_categories(self):
130112
pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
131113
psidx = ps.from_pandas(pidx)

python/pyspark/pandas/tests/test_categorical.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,6 @@ def test_astype(self):
198198

199199
self.assert_eq(pscser.astype(str), pcser.astype(str))
200200

201-
@unittest.skipIf(
202-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
203-
"TODO(SPARK-43564): Enable CategoricalTests.test_factorize for pandas 2.0.0.",
204-
)
205201
def test_factorize(self):
206202
pser = pd.Series(["a", "b", "c", None], dtype=CategoricalDtype(["c", "a", "d", "b"]))
207203
psser = ps.from_pandas(pser)
@@ -212,8 +208,8 @@ def test_factorize(self):
212208
self.assert_eq(kcodes.tolist(), pcodes.tolist())
213209
self.assert_eq(kuniques, puniques)
214210

215-
pcodes, puniques = pser.factorize(na_sentinel=-2)
216-
kcodes, kuniques = psser.factorize(na_sentinel=-2)
211+
pcodes, puniques = pser.factorize(use_na_sentinel=-2)
212+
kcodes, kuniques = psser.factorize(use_na_sentinel=-2)
217213

218214
self.assert_eq(kcodes.tolist(), pcodes.tolist())
219215
self.assert_eq(kuniques, puniques)
@@ -345,11 +341,6 @@ def test_groupby_apply(self):
345341
# psdf.groupby("a").apply(len).sort_index(), pdf.groupby("a").apply(len).sort_index(),
346342
# )
347343

348-
@unittest.skipIf(
349-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
350-
"TODO(SPARK-43813): Enable CategoricalTests.test_groupby_apply_without_shortcut "
351-
"for pandas 2.0.0.",
352-
)
353344
def test_groupby_apply_without_shortcut(self):
354345
with ps.option_context("compute.shortcut_limit", 0):
355346
self.test_groupby_apply()
@@ -360,8 +351,8 @@ def identity(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
360351
return df
361352

362353
self.assert_eq(
363-
psdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
364-
pdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
354+
psdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
355+
pdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
365356
)
366357

367358
def test_groupby_transform(self):

0 commit comments

Comments
 (0)