[python-package] adapt to scikit-learn 1.6 testing changes, pin more packages in R 3.6 CI jobs (#6718)

jameslamb · web-flow · commit 4531ff548d43 · 2024-11-14T20:35:16.000-06:00
diff --git a/.ci/install-old-r-packages.R b/.ci/install-old-r-packages.R
@@ -0,0 +1,79 @@
+# [description]
+#
+#    Installs a pinned set of packages that worked together
+#    as of the last R 3.6 release.
+#
+
+.install_packages <- function(packages) {  # 'packages': paths relative to CRAN's src/contrib/Archive
+    install.packages(  # nolint: undesirable_function
+        pkgs = paste(  # nolint: paste
+            "https://cran.r-project.org/src/contrib/Archive"
+            , packages
+            , sep = "/"
+        )
+        , dependencies = FALSE  # dependencies are not resolved here; earlier batches must provide them
+        , lib = Sys.getenv("R_LIBS")  # install into the library named by the R_LIBS environment variable
+        , repos = NULL  # 'pkgs' are direct URLs to source tarballs, not package names
+    )
+}
+
+# when confronted with a bunch of URLs like this, install.packages() sometimes
+# struggles to determine install order... so install packages in batches here,
+# starting from the root of the dependency graph and working up
+
+# there was only a single release of {praise}, so there is no contrib/Archive URL for it
+install.packages(  # nolint: undesirable_function
+    pkgs = "https://cran.r-project.org/src/contrib/praise_1.0.0.tar.gz"
+    , dependencies = FALSE
+    , lib = Sys.getenv("R_LIBS")  # same library location that .install_packages() uses
+    , repos = NULL  # 'pkgs' is a direct URL to a source tarball, not a package name
+)
+
+.install_packages(c(  # batch 1 of 6 (batches run root-first, so their order matters)
+    "brio/brio_1.1.4.tar.gz"              # nolint: non_portable_path
+    , "cli/cli_3.6.2.tar.gz"              # nolint: non_portable_path
+    , "crayon/crayon_1.5.2.tar.gz"        # nolint: non_portable_path
+    , "digest/digest_0.6.36.tar.gz"       # nolint: non_portable_path
+    , "evaluate/evaluate_0.23.tar.gz"     # nolint: non_portable_path
+    , "fansi/fansi_1.0.5.tar.gz"          # nolint: non_portable_path
+    , "fs/fs_1.6.4.tar.gz"                # nolint: non_portable_path
+    , "glue/glue_1.7.0.tar.gz"            # nolint: non_portable_path
+    , "jsonlite/jsonlite_1.8.8.tar.gz"    # nolint: non_portable_path
+    , "lattice/lattice_0.20-41.tar.gz"    # nolint: non_portable_path
+    , "magrittr/magrittr_2.0.2.tar.gz"    # nolint: non_portable_path
+    , "pkgconfig/pkgconfig_2.0.2.tar.gz"  # nolint: non_portable_path
+    , "ps/ps_1.8.0.tar.gz"                # nolint: non_portable_path
+    , "R6/R6_2.5.0.tar.gz"                # nolint: non_portable_path
+    , "rlang/rlang_1.1.3.tar.gz"          # nolint: non_portable_path
+    , "rprojroot/rprojroot_2.0.3.tar.gz"  # nolint: non_portable_path
+    , "utf8/utf8_1.2.3.tar.gz"            # nolint: non_portable_path
+    , "withr/withr_3.0.1.tar.gz"          # nolint: non_portable_path
+))
+
+.install_packages(c(  # batch 2 of 6
+    "desc/desc_1.4.2.tar.gz"              # nolint: non_portable_path
+    , "diffobj/diffobj_0.3.4.tar.gz"      # nolint: non_portable_path
+    , "lifecycle/lifecycle_1.0.3.tar.gz"  # nolint: non_portable_path
+    , "processx/processx_3.8.3.tar.gz"    # nolint: non_portable_path
+))
+
+.install_packages(c(  # batch 3 of 6
+    "callr/callr_3.7.5.tar.gz"    # nolint: non_portable_path
+    , "vctrs/vctrs_0.6.4.tar.gz"  # nolint: non_portable_path
+))
+
+.install_packages(c(  # batch 4 of 6
+    "pillar/pillar_1.8.1.tar.gz"    # nolint: non_portable_path
+    , "tibble/tibble_3.2.0.tar.gz"  # nolint: non_portable_path
+))
+
+.install_packages(c(  # batch 5 of 6
+    "pkgbuild/pkgbuild_1.4.4.tar.gz"    # nolint: non_portable_path
+    , "rematch2/rematch2_2.1.1.tar.gz"  # nolint: non_portable_path
+    , "waldo/waldo_0.5.3.tar.gz"        # nolint: non_portable_path
+))
+
+.install_packages(c(  # batch 6 of 6
+    "pkgload/pkgload_1.3.4.tar.gz"      # nolint: non_portable_path
+    , "testthat/testthat_3.2.1.tar.gz"  # nolint: non_portable_path
+))
diff --git a/.ci/test-r-package.sh b/.ci/test-r-package.sh
@@ -108,10 +108,10 @@ if [[ $OS_NAME == "macos" ]]; then
     export R_TIDYCMD=/usr/local/bin/tidy
 fi
 
-# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
+# fix for issue where CRAN was not returning {evaluate}, {lattice}, or {waldo} when using R 3.6
 # "Warning: dependency ‘lattice’ is not available"
 if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
-    Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
+    Rscript --vanilla ./.ci/install-old-r-packages.R
 else
     # {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
     # This should be unnecessary on R >=4.4.0
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
@@ -14,6 +14,14 @@
     from sklearn.utils.multiclass import check_classification_targets
     from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
 
+    # sklearn.utils Tags types can be imported unconditionally once
+    # lightgbm's minimum scikit-learn version is 1.6 or higher
+    try:
+        from sklearn.utils import ClassifierTags as _sklearn_ClassifierTags
+        from sklearn.utils import RegressorTags as _sklearn_RegressorTags
+    except ImportError:
+        _sklearn_ClassifierTags = None
+        _sklearn_RegressorTags = None
     try:
         from sklearn.exceptions import NotFittedError
         from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
@@ -140,6 +148,8 @@ class _LGBMRegressorBase:  # type: ignore
     _LGBMCheckClassificationTargets = None
     _LGBMComputeSampleWeight = None
     _LGBMValidateData = None
+    _sklearn_ClassifierTags = None
+    _sklearn_RegressorTags = None
     _sklearn_version = None
 
 # additional scikit-learn imports only for type hints
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
@@ -40,6 +40,8 @@
     _LGBMModelBase,
     _LGBMRegressorBase,
     _LGBMValidateData,
+    _sklearn_ClassifierTags,
+    _sklearn_RegressorTags,
     _sklearn_version,
     dt_DataTable,
     pd_DataFrame,
@@ -703,7 +705,6 @@ def _update_sklearn_tags_from_dict(
         tags.input_tags.allow_nan = tags_dict["allow_nan"]
         tags.input_tags.sparse = "sparse" in tags_dict["X_types"]
         tags.target_tags.one_d_labels = "1dlabels" in tags_dict["X_types"]
-        tags._xfail_checks = tags_dict["_xfail_checks"]
         return tags
 
     def __sklearn_tags__(self) -> Optional["_sklearn_Tags"]:
@@ -1291,7 +1292,10 @@ def _more_tags(self) -> Dict[str, Any]:
         return tags
 
     def __sklearn_tags__(self) -> "_sklearn_Tags":
-        return LGBMModel.__sklearn_tags__(self)
+        tags = LGBMModel.__sklearn_tags__(self)
+        tags.estimator_type = "regressor"
+        tags.regressor_tags = _sklearn_RegressorTags(multi_label=False)
+        return tags
 
     def fit(  # type: ignore[override]
         self,
@@ -1350,7 +1354,10 @@ def _more_tags(self) -> Dict[str, Any]:
         return tags
 
     def __sklearn_tags__(self) -> "_sklearn_Tags":
-        return LGBMModel.__sklearn_tags__(self)
+        tags = LGBMModel.__sklearn_tags__(self)
+        tags.estimator_type = "classifier"
+        tags.classifier_tags = _sklearn_ClassifierTags(multi_class=True, multi_label=False)
+        return tags
 
     def fit(  # type: ignore[override]
         self,
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
@@ -17,11 +17,18 @@
 from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
 from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
-from sklearn.utils.estimator_checks import parametrize_with_checks
+from sklearn.utils.estimator_checks import parametrize_with_checks as sklearn_parametrize_with_checks
 from sklearn.utils.validation import check_is_fitted
 
 import lightgbm as lgb
-from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
+from lightgbm.compat import (
+    DATATABLE_INSTALLED,
+    PANDAS_INSTALLED,
+    _sklearn_version,
+    dt_DataTable,
+    pd_DataFrame,
+    pd_Series,
+)
 
 from .utils import (
     assert_silent,
@@ -35,6 +42,9 @@
     softmax,
 )
 
+SKLEARN_MAJOR, SKLEARN_MINOR, *_ = _sklearn_version.split(".")  # keep major and minor; discard the rest
+SKLEARN_VERSION_GTE_1_6 = (int(SKLEARN_MAJOR), int(SKLEARN_MINOR)) >= (1, 6)  # compares ints, not strings
+
 decreasing_generator = itertools.count(0, -1)
 estimator_classes = (lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker)
 task_to_model_factory = {
@@ -1432,7 +1442,28 @@ def test_getting_feature_names_in_pd_input(estimator_class):
     np.testing.assert_array_equal(model.feature_names_in_, X.columns)
 
 
-@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
+# Starting with scikit-learn 1.6 (https://github.com/scikit-learn/scikit-learn/pull/30149),
+# the only API for marking estimator tests as expected to fail is to pass a keyword argument
+# to parametrize_with_checks(). That function didn't accept additional arguments in earlier
+# versions.
+#
+# This block defines a patched version of parametrize_with_checks() so lightgbm's tests
+# can be compatible with scikit-learn <1.6 and >=1.6.
+#
+# This should be removed once minimum supported scikit-learn version is at least 1.6.
+if SKLEARN_VERSION_GTE_1_6:
+    parametrize_with_checks = sklearn_parametrize_with_checks  # scikit-learn's own function, unchanged
+else:
+
+    def parametrize_with_checks(estimator, *args, **kwargs):  # extra arguments are accepted but ignored
+        return sklearn_parametrize_with_checks(estimator)  # forward only the first argument
+
+
+def _get_expected_failed_tests(estimator):  # passed as expected_failed_checks= in the decorator below
+    return estimator._more_tags()["_xfail_checks"]  # reuses the estimator's existing "_xfail_checks" tag
+
+
+@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()], expected_failed_checks=_get_expected_failed_tests)
 def test_sklearn_integration(estimator, check):
     estimator.set_params(min_child_samples=1, min_data_in_bin=1)
     check(estimator)
@@ -1457,7 +1488,6 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
         assert sklearn_tags.input_tags.allow_nan is True
         assert sklearn_tags.input_tags.sparse is True
         assert sklearn_tags.target_tags.one_d_labels is True
-        assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]
 
 
 @pytest.mark.parametrize("task", all_tasks)