
Dataset size reduction fixed, updated TargetValidator to match signatures #1250


Merged: 99 commits, Feb 1, 2022
Changes from 5 commits

Commits (99)
9369343
Moved to new splitter, moved to util file
eddiebergman Sep 15, 2021
c2be383
flake8'd
eddiebergman Sep 15, 2021
6e6a607
Fixed errors, added test specifically for CustomStratifiedShuffleSplit
eddiebergman Sep 16, 2021
786e508
flake8'd
eddiebergman Sep 16, 2021
58dc49b
Updated docstring
eddiebergman Sep 16, 2021
4bce38f
Updated types in docstring
eddiebergman Sep 16, 2021
206c3df
reduce_dataset_size_if_too_large supports more types
eddiebergman Oct 5, 2021
d6f018f
flake8'd
eddiebergman Oct 5, 2021
6ed3e2c
flake8'd
eddiebergman Oct 5, 2021
5981fad
Updated docstring
eddiebergman Oct 5, 2021
65c8667
Seperated out the data subsampling into individual functions
eddiebergman Oct 6, 2021
f130424
Improved typing from Automl.fit to reduce_dataset_size_if_too_large
eddiebergman Oct 6, 2021
9b6f613
flak8'd
eddiebergman Oct 7, 2021
a12cf33
subsample tested
eddiebergman Oct 7, 2021
077cb2c
Finished testing and flake8'd
eddiebergman Oct 7, 2021
9af22a7
Cleaned up transform function that was touched
eddiebergman Oct 7, 2021
8057766
^
eddiebergman Oct 7, 2021
e1cce3f
Removed double typing
eddiebergman Oct 7, 2021
c8693a9
Cleaned up typing of convert_if_sparse
eddiebergman Oct 7, 2021
2591cc2
Cleaned up splitters and added size test
eddiebergman Oct 7, 2021
a6cc39f
Cleanup doc in data
eddiebergman Oct 7, 2021
f987c65
rogue line added was removed
eddiebergman Oct 7, 2021
3c4964a
Test fix
eddiebergman Oct 7, 2021
a53b1e5
flake8'd
eddiebergman Oct 7, 2021
5343ab6
Typo fix
eddiebergman Oct 7, 2021
84ee347
Fixed ordering of things
eddiebergman Oct 7, 2021
019a06e
Fixed typing and tests of target_validator fit, transform, inv_transform
eddiebergman Oct 7, 2021
8aea9b9
Updated doc
eddiebergman Oct 8, 2021
065fbe1
Updated Type return
eddiebergman Oct 8, 2021
1abe8f0
Removed elif gaurd
eddiebergman Oct 8, 2021
0288a70
removed extraneuous overload
eddiebergman Oct 8, 2021
d55c687
Updated return type of feature validator
eddiebergman Oct 8, 2021
1e7e2a9
Type fixes for target validator fit
eddiebergman Oct 8, 2021
234ae5e
flake8'd
eddiebergman Oct 8, 2021
04f6d46
Moved to new splitter, moved to util file
eddiebergman Sep 15, 2021
ea82405
flake8'd
eddiebergman Sep 15, 2021
d3cd1cf
Fixed errors, added test specifically for CustomStratifiedShuffleSplit
eddiebergman Sep 16, 2021
f04d65a
flake8'd
eddiebergman Sep 16, 2021
a1038b1
Updated docstring
eddiebergman Sep 16, 2021
de475dd
Updated types in docstring
eddiebergman Sep 16, 2021
9021edc
reduce_dataset_size_if_too_large supports more types
eddiebergman Oct 5, 2021
3b7e49c
flake8'd
eddiebergman Oct 5, 2021
b835f48
flake8'd
eddiebergman Oct 5, 2021
8369a17
Updated docstring
eddiebergman Oct 5, 2021
86f5b65
Seperated out the data subsampling into individual functions
eddiebergman Oct 6, 2021
445b0ba
Improved typing from Automl.fit to reduce_dataset_size_if_too_large
eddiebergman Oct 6, 2021
658a244
flak8'd
eddiebergman Oct 7, 2021
b12a5f5
subsample tested
eddiebergman Oct 7, 2021
8ea2575
Finished testing and flake8'd
eddiebergman Oct 7, 2021
49a37bf
Cleaned up transform function that was touched
eddiebergman Oct 7, 2021
401a049
^
eddiebergman Oct 7, 2021
b46be44
Removed double typing
eddiebergman Oct 7, 2021
24a19cf
Cleaned up typing of convert_if_sparse
eddiebergman Oct 7, 2021
8922143
Cleaned up splitters and added size test
eddiebergman Oct 7, 2021
a071950
Cleanup doc in data
eddiebergman Oct 7, 2021
5c9b012
rogue line added was removed
eddiebergman Oct 7, 2021
cc5dcba
Test fix
eddiebergman Oct 7, 2021
fe15c14
flake8'd
eddiebergman Oct 7, 2021
54c4f2a
Typo fix
eddiebergman Oct 7, 2021
99c02a9
Fixed ordering of things
eddiebergman Oct 7, 2021
0e28bb3
Fixed typing and tests of target_validator fit, transform, inv_transform
eddiebergman Oct 7, 2021
b34e169
Updated doc
eddiebergman Oct 8, 2021
972f65e
Updated Type return
eddiebergman Oct 8, 2021
33ef1fd
Removed elif gaurd
eddiebergman Oct 8, 2021
1136573
removed extraneuous overload
eddiebergman Oct 8, 2021
b1f419b
Updated return type of feature validator
eddiebergman Oct 8, 2021
8585be7
Type fixes for target validator fit
eddiebergman Oct 8, 2021
aac7b26
flake8'd
eddiebergman Oct 8, 2021
e4c3426
Fixed err message str and automl sparse y tests
eddiebergman Oct 8, 2021
5585532
merged
eddiebergman Oct 8, 2021
75a974b
Flak8'd
eddiebergman Oct 8, 2021
5bf53a2
Fix sort indices
eddiebergman Nov 2, 2021
7ae1d87
list type to List
eddiebergman Nov 2, 2021
4ebfdc2
Remove uneeded comment
eddiebergman Nov 2, 2021
1e87a52
Updated comment to make it more clear
eddiebergman Nov 2, 2021
06196d3
Comment update
eddiebergman Nov 2, 2021
c7a47cb
Fixed warning message for reduce_dataset_if_too_large
eddiebergman Nov 2, 2021
c0305f9
Fix test
eddiebergman Nov 2, 2021
c109d93
Added check for error message in tests
eddiebergman Nov 2, 2021
1c2fe7e
Test Updates
eddiebergman Nov 2, 2021
377e260
Fix error msg
eddiebergman Nov 2, 2021
f909edc
reinclude csr y to test
eddiebergman Nov 2, 2021
f170fcc
Reintroduced explicit subsample values test
eddiebergman Nov 2, 2021
b4958e8
flaked
eddiebergman Nov 2, 2021
6861e34
Missed an uncomment
eddiebergman Nov 3, 2021
37f6948
Update the comment for test of splitters
eddiebergman Nov 3, 2021
a20291d
Updated warning message in CustomSplitter
eddiebergman Nov 3, 2021
4b0f8a0
Update comment in test
eddiebergman Nov 4, 2021
536c4c6
Update tests
eddiebergman Nov 4, 2021
f35102d
Removed overloads
eddiebergman Nov 4, 2021
ec0ed55
Narrowed type of subsample
eddiebergman Nov 4, 2021
5439235
Removed overload import
eddiebergman Nov 4, 2021
3d21282
Fix `todense` giving np.matrix, using `toarray`
eddiebergman Nov 5, 2021
e1317b1
Merge branch 'development' into use_new_splitter
eddiebergman Nov 5, 2021
f56356d
Made subsampling a little less aggresive
eddiebergman Nov 14, 2021
42e4397
Changed multiplier back to 10
eddiebergman Nov 15, 2021
9bcb210
Allow argument to specfiy how auto-sklearn handles compressing datase…
eddiebergman Dec 17, 2021
2cd1d48
Merge branch 'development' into use_new_splitter
eddiebergman Dec 17, 2021
a1cc277
Fixed bad merge
eddiebergman Dec 18, 2021
18 changes: 1 addition & 17 deletions autosklearn/data/validation.py
@@ -1,6 +1,6 @@
# -*- encoding: utf-8 -*-
import logging
-from typing import List, Optional, Tuple, Union, overload
+from typing import List, Optional, Tuple, Union

import numpy as np

@@ -152,22 +152,6 @@ def fit(

return self

-@overload
-def transform(
-self,
-X: SUPPORTED_FEAT_TYPES,
-y: None
-) -> Tuple[Union[np.ndarray, pd.DataFrame, spmatrix], None]:
-...
-
-@overload
-def transform(
-self,
-X: SUPPORTED_FEAT_TYPES,
-y: Union[List, pd.Series, pd.DataFrame, np.ndarray]
-) -> Tuple[Union[spmatrix, pd.DataFrame, np.ndarray], np.ndarray]:
-...

def transform(
self,
X: SUPPORTED_FEAT_TYPES,
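For readers unfamiliar with the deleted pattern: `typing.overload` lets a static checker pick a return type from the argument types, while a single runtime implementation backs all of the declared overloads. A minimal standalone sketch of the pattern (names and signatures hypothetical, not auto-sklearn's actual API):

```python
from typing import Optional, Tuple, overload

import numpy as np


@overload
def transform(X: np.ndarray, y: None) -> Tuple[np.ndarray, None]: ...
@overload
def transform(X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: ...


def transform(
    X: np.ndarray, y: Optional[np.ndarray] = None
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    # Single runtime implementation; the overloads above only guide the
    # type checker and can be dropped at the cost of a looser return type.
    return X, y
```

Removing the overloads, as this diff does, trades a little checker precision for simpler code: callers then see the union return type in both cases.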
19 changes: 7 additions & 12 deletions autosklearn/util/data.py
@@ -135,8 +135,8 @@ def predict_RAM_usage(X: np.ndarray, categorical: List[bool]) -> float:


def subsample(
-X: SUPPORTED_FEAT_TYPES,
-y: Union[List, np.ndarray, pd.DataFrame, pd.Series],
+X: Union[np.ndarray, spmatrix],
+y: np.ndarray,
is_classification: bool,
sample_size: Union[float, int],
random_state: Optional[Union[int, np.random.RandomState]] = None,
@@ -154,19 +154,12 @@ def subsample(
Interestingly enough, StratifiedShuffleSplut and descendants don't support
sparse `y` in `split(): _check_array` call. Hence, neither do we.

-NOTE3:
-The core autosklearn library doesn't rely on the full type of X.
-The typing could be reduced to:
-* X: np.ndarray | spmatrix
-* Y: np.ndarray


Parameters
----------
-X: SUPPORTED_FEAT_TYPES
+X: Union[np.ndarray, spmatrix]
The X's to subsample

-Y: List | np.ndarray | pd.DataFrame | Series
+y: np.ndarray
The Y's to subsample

is_classification: bool
@@ -182,7 +175,7 @@

Returns
-------
-(SUPPORTED_FEAT_TYPES, List | np.ndarray | pd.DataFrame | Series)
+(np.ndarray | spmatrix, np.ndarray)
The X and y subsampled according to sample_size
"""
if isinstance(X, List):
@@ -198,6 +191,8 @@
)
left_idxs, _ = next(splitter.split(X=X, y=y))

+# This function supports pandas objects but they won't get here
+# yet as we do not reduce the size of pandas dataframes.
if isinstance(X, pd.DataFrame):
idxs = X.index[left_idxs]
X = X.loc[idxs]
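A simplified sketch of the subsampling approach above, using scikit-learn's stock `ShuffleSplit`/`StratifiedShuffleSplit` (the PR itself routes through a `CustomStratifiedShuffleSplit`; this only illustrates the idea):

```python
import numpy as np
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit


def subsample_sketch(X, y, is_classification, sample_size, random_state=None):
    """Return a subsample of (X, y); stratified when classifying."""
    cls = StratifiedShuffleSplit if is_classification else ShuffleSplit
    splitter = cls(n_splits=1, train_size=sample_size, random_state=random_state)
    # Keep the "train" side of a single split as the subsample.
    keep_idxs, _ = next(splitter.split(X=X, y=y))
    return X[keep_idxs], y[keep_idxs]
```

As NOTE2 in the docstring says, the stratified splitters reject sparse `y` in `split()`, which is one reason the narrowed typing insists on a dense `np.ndarray` target.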
4 changes: 3 additions & 1 deletion test/test_data/test_target_validator.py
@@ -215,10 +215,12 @@ def dtype(arr):
# These next part of the tests rely on some encoding to have taken place
# This happens when `is_classification` and not task_type = multilabel-indicator
#
-# TargetValidator._fit()
+# As state in TargetValidator._fit()
# > Also, encoding multilabel indicator data makes the data multiclass
# Let the user employ a MultiLabelBinarizer if needed
#
+# As a result of this, we don't encode 'multilabel-indicator' labels and
+# there is nothing else to check here
if validator.type_of_target == 'multilabel-indicator':
assert validator.encoder is None

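Since the comment above defers multilabel encoding to the user, here is what that looks like with scikit-learn's `MultiLabelBinarizer` (illustrative data, not from the test suite):

```python
from sklearn.preprocessing import MultiLabelBinarizer

# Turn per-sample label sets into a 0/1 indicator matrix; this is the
# encoding the validator deliberately leaves to the user.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform([{"red", "blue"}, {"blue"}, set()])
# Columns follow mlb.classes_, which is sorted: ['blue', 'red']
```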
33 changes: 19 additions & 14 deletions test/test_util/test_data.py
@@ -166,6 +166,8 @@ def test_reduce_precision_correctly_reduces_precision(X, dtype, x_type):
expected: Dict[type, type] = {
np.float32: np.float32,
np.float64: np.float32,
+np.dtype('float32'): np.float32,
+np.dtype('float64'): np.float32
}
if hasattr(np, 'float96'):
expected[np.float96] = np.float64
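The two added keys cover lookups keyed by a `np.dtype` instance rather than by the scalar type itself; the two compare equal but are distinct objects:

```python
import numpy as np

x = np.zeros(3, dtype="float64")
# `x.dtype` is an np.dtype instance, not the np.float64 scalar type,
# although the two compare equal.
assert x.dtype == np.float64
assert isinstance(x.dtype, np.dtype)
assert x.dtype is not np.float64
```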
@@ -189,8 +191,8 @@ def test_reduce_precision_with_unsupported_dtypes(X, dtype):
with pytest.raises(ValueError) as err:
reduce_precision(X)

-expected = f"X.dtype = {dtype} not equal to any supported {supported_precision_reductions}"
-assert err.value == expected
+expected = f"X.dtype = {X.dtype} not equal to any supported {supported_precision_reductions}"
+assert err.value.args[0] == expected


@parametrize("X", [
@@ -215,15 +217,18 @@ def test_reduce_dataset_reduces_size_and_precision(
random_state = 0
memory_limit = 1 # Force reductions

-X_out, y_out = reduce_dataset_size_if_too_large(
-X=X,
-y=y,
-random_state=random_state,
-memory_limit=memory_limit,
-operations=operations,
-multiplier=multiplier,
-is_classification=is_classification,
-)
+with warnings.catch_warnings():
+warnings.filterwarnings("ignore")
+
+X_out, y_out = reduce_dataset_size_if_too_large(
+X=X,
+y=y,
+random_state=random_state,
+memory_limit=memory_limit,
+operations=operations,
+multiplier=multiplier,
+is_classification=is_classification,
+)

def bytes(arr):
return arr.nbytes if isinstance(arr, np.ndarray) else arr.data.nbytes
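The `catch_warnings` wrapper added above silences the expected reduction warnings only inside its block; a minimal sketch of the pattern:

```python
import warnings


def noisy_reduce():
    # Stand-in for a function that warns when it shrinks a dataset.
    warnings.warn("dataset was reduced", UserWarning)
    return "reduced"


with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  # suppressed only inside this block
    result = noisy_reduce()
# The previous warning filters are restored on exit from the block.
```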
@@ -254,8 +259,8 @@ def test_reduce_dataset_invalid_dtype_for_precision_reduction():
is_classification=False
)

-expected_err = f"Unsupported type `{dtype}` for precision reduction"
-assert err.value == expected_err
+expected_err = f"Unsupported type `{X.dtype}` for precision reduction"
+assert err.value.args[0] == expected_err


def test_reduce_dataset_invalid_operations():
@@ -272,7 +277,7 @@
)

expected_err = f"Unknown operation `{invalid_op}`"
-assert err.value == expected_err
+assert err.value.args[0] == expected_err
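The corrected assertions compare the exception message rather than the `ExceptionInfo` wrapper; the distinction in isolation:

```python
import pytest


def fail():
    raise ValueError("Unknown operation `bad-op`")


with pytest.raises(ValueError) as err:
    fail()

# `err` is an ExceptionInfo wrapper; `err.value` is the raised exception.
# Comparing `err.value` to a string is always False, which is why the
# tests now compare `err.value.args[0]` (equivalently, `str(err.value)`).
assert err.value.args[0] == "Unknown operation `bad-op`"
assert str(err.value) == "Unknown operation `bad-op`"
```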


@pytest.mark.parametrize(
Expand Down