[SPARK-43875][PS][TESTS] Enabling Categorical tests for Pandas 2.0.0 and above

itholic · zhengruifeng · commit c321b3dd66f6 · 2023-08-18T08:39:05.000+08:00
### What changes were proposed in this pull request? This PR proposes to enable Categorical tests for pandas 2.0.0 and above. See https://pandas.pydata.org/docs/whatsnew/v2.0.0.html for more detail. ### Why are the changes needed? To match the behavior of pandas 2.0.0 and above. ### Does this PR introduce _any_ user-facing change? No, this is test-only. ### How was this patch tested? By enabling and updating the existing unit tests (UTs). Closes #42530 from itholic/pandas_categorical_test. Authored-by: itholic <haejoon.lee@databricks.com> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py
@@ -75,10 +75,6 @@ def test_categorical_index(self):
         ):
             ps.CategoricalIndex([1, 2, 3]).all()
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43568): Enable CategoricalIndexTests.test_categories_setter for pandas 2.0.0.",
-    )
     def test_categories_setter(self):
         pdf = pd.DataFrame(
             {
@@ -92,20 +88,10 @@ def test_categories_setter(self):
         pidx = pdf.index
         psidx = psdf.index
 
-        pidx.categories = ["z", "y", "x"]
-        psidx.categories = ["z", "y", "x"]
-        # Pandas deprecated all the in-place category-setting behaviors, dtypes also not be
-        # refreshed in categories.setter since Pandas 1.4+, we should also consider to clean up
-        # this test when in-place category-setting removed:
-        # https://github.com/pandas-dev/pandas/issues/46820
-        if LooseVersion("1.4") >= LooseVersion(pd.__version__) >= LooseVersion("1.1"):
-            self.assert_eq(pidx, psidx)
-            self.assert_eq(pdf, psdf)
-        else:
-            pidx = pidx.set_categories(pidx.categories)
-            pdf.index = pidx
-            self.assert_eq(pidx, psidx)
-            self.assert_eq(pdf, psdf)
+        pidx = pidx.rename_categories(["z", "y", "x"])
+        psidx = psidx.rename_categories(["z", "y", "x"])
+        self.assert_eq(pidx, psidx)
+        self.assert_eq(pdf, psdf)
 
         with self.assertRaises(ValueError):
             psidx.categories = [1, 2, 3, 4]
@@ -122,10 +108,6 @@ def test_add_categories(self):
         self.assertRaises(ValueError, lambda: psidx.add_categories(3))
         self.assertRaises(ValueError, lambda: psidx.add_categories([4, 4]))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43633): Enable CategoricalIndexTests.test_remove_categories for pandas 2.0.0.",
-    )
     def test_remove_categories(self):
         pidx = pd.CategoricalIndex([1, 2, 3], categories=[3, 2, 1])
         psidx = ps.from_pandas(pidx)
diff --git a/python/pyspark/pandas/tests/test_categorical.py b/python/pyspark/pandas/tests/test_categorical.py
@@ -198,10 +198,6 @@ def test_astype(self):
 
         self.assert_eq(pscser.astype(str), pcser.astype(str))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43564): Enable CategoricalTests.test_factorize for pandas 2.0.0.",
-    )
     def test_factorize(self):
         pser = pd.Series(["a", "b", "c", None], dtype=CategoricalDtype(["c", "a", "d", "b"]))
         psser = ps.from_pandas(pser)
@@ -212,8 +208,8 @@ def test_factorize(self):
         self.assert_eq(kcodes.tolist(), pcodes.tolist())
         self.assert_eq(kuniques, puniques)
 
-        pcodes, puniques = pser.factorize(na_sentinel=-2)
-        kcodes, kuniques = psser.factorize(na_sentinel=-2)
+        pcodes, puniques = pser.factorize(use_na_sentinel=-2)
+        kcodes, kuniques = psser.factorize(use_na_sentinel=-2)
 
         self.assert_eq(kcodes.tolist(), pcodes.tolist())
         self.assert_eq(kuniques, puniques)
@@ -345,11 +341,6 @@ def test_groupby_apply(self):
         #     psdf.groupby("a").apply(len).sort_index(), pdf.groupby("a").apply(len).sort_index(),
         # )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43813): Enable CategoricalTests.test_groupby_apply_without_shortcut "
-        "for pandas 2.0.0.",
-    )
     def test_groupby_apply_without_shortcut(self):
         with ps.option_context("compute.shortcut_limit", 0):
             self.test_groupby_apply()
@@ -360,8 +351,8 @@ def identity(df) -> ps.DataFrame[zip(psdf.columns, psdf.dtypes)]:
             return df
 
         self.assert_eq(
-            psdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
-            pdf.groupby("a").apply(identity).sort_values(["a", "b"]).reset_index(drop=True),
+            psdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
+            pdf.groupby("a").apply(identity).sort_values(["b"]).reset_index(drop=True),
         )
 
     def test_groupby_transform(self):