Adding sklearn min. dependencies for all versions #1022

Merged (3 commits) on Feb 18, 2021

Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8]
-        scikit-learn: [0.21.2, 0.22.2, 0.23.1]
+        scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
         exclude:  # no scikit-learn 0.21.2 release for Python 3.8
           - python-version: 3.8
             scikit-learn: 0.21.2
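For reference, the new matrix expands to the cross product of the two version lists minus the excluded pair. A quick sketch of that arithmetic — a hypothetical illustration of GitHub Actions' matrix/exclude semantics, not part of the workflow itself:

from itertools import product

# the version lists from the matrix above
pythons = ["3.6", "3.7", "3.8"]
sklearns = ["0.21.2", "0.22.2", "0.23.1", "0.24"]
# no scikit-learn 0.21.2 release for Python 3.8
excluded = {("3.8", "0.21.2")}

combos = [c for c in product(pythons, sklearns) if c not in excluded]
print(len(combos))  # 11 jobs: 3 x 4 combinations minus the 1 exclusion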
48 changes: 20 additions & 28 deletions examples/30_extended/flows_and_runs_tutorial.py
@@ -8,7 +8,6 @@
 # License: BSD 3-Clause

 import openml
-import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree

 ############################################################################
@@ -54,7 +53,7 @@
 task = openml.tasks.get_task(403)

 # Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
+clf = tree.DecisionTreeClassifier()

 # Run the flow
 run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
 # ############################
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
-task = openml.tasks.get_task(1)
+# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
+# `task <https://test.openml.org/t/96>`_, as it contains both numerical and categorical
+# variables as well as missing values in both.
+task = openml.tasks.get_task(96)

 # OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
 from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
         [
             (
                 "categorical",
-                pipeline.Pipeline(
-                    [
-                        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                        (
-                            "Encoder",
-                            preprocessing.OneHotEncoder(
-                                sparse=False, handle_unknown="ignore"
-                            ),
-                        ),
-                    ]
-                ),
+                preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                 cat,  # returns the categorical feature indices
             ),
-            ("continuous", "passthrough", cont),  # returns the numeric feature indices
+            (
+                "continuous",
+                impute.SimpleImputer(strategy="median"),
+                cont,
+            ),  # returns the numeric feature indices
         ]
     )
 ),
@@ -146,20 +142,14 @@
         [
             (
                 "categorical",
-                pipeline.Pipeline(
-                    [
-                        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                        (
-                            "Encoder",
-                            preprocessing.OneHotEncoder(
-                                sparse=False, handle_unknown="ignore"
-                            ),
-                        ),
-                    ]
-                ),
+                preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                 categorical_feature_indices,
             ),
-            ("continuous", "passthrough", numeric_feature_indices),
+            (
+                "continuous",
+                impute.SimpleImputer(strategy="median"),
+                numeric_feature_indices,
+            ),
         ]
     )
 ),
@@ -182,7 +172,9 @@
 task = openml.tasks.get_task(6)

 # The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
+run = openml.runs.run_model_on_task(
+    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
+)

 # The run may be stored offline, and the flow will be stored along with it:
 run.to_filesystem(directory="myrun")
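Assembled end to end, the updated tutorial pipeline looks roughly like the sketch below. It is a minimal sketch assuming scikit-learn >= 0.24 (whose OneHotEncoder treats missing values as their own category, which is why the categorical imputer could be dropped) and the OpenML test server that hosts task 96; the step names and the final estimator are illustrative, not prescribed by the diff.

import openml
from openml.extensions.sklearn import cat, cont
from sklearn import compose, impute, pipeline, preprocessing, tree

openml.config.start_using_configuration_for_example()  # task 96 lives on the test server

task = openml.tasks.get_task(96)  # credit-a: categorical + numeric features, missing values

pipe = pipeline.Pipeline(
    steps=[
        (
            "preprocess",  # illustrative step name
            compose.ColumnTransformer(
                [
                    # sklearn >= 0.24: OneHotEncoder copes with NaN, no imputer needed
                    (
                        "categorical",
                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                        cat,  # helper returning the categorical feature indices
                    ),
                    # numeric columns still need their missing values imputed
                    ("continuous", impute.SimpleImputer(strategy="median"), cont),
                ]
            ),
        ),
        ("estimator", tree.DecisionTreeClassifier()),  # any classifier would do
    ]
)

run = openml.runs.run_model_on_task(
    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array"
)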
9 changes: 3 additions & 6 deletions examples/30_extended/run_setup_tutorial.py
@@ -59,12 +59,9 @@
 # easy as you want it to be


-cat_imp = make_pipeline(
-    SimpleImputer(strategy="most_frequent"),
-    OneHotEncoder(handle_unknown="ignore", sparse=False),
-    TruncatedSVD(),
-)
-ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
+cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD())
+cont_imp = SimpleImputer(strategy="median")
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
 model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier())])

 # Let's change some hyperparameters. Of course, in any good application we
10 changes: 3 additions & 7 deletions examples/40_paper/2018_neurips_perrone_example.py
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
 cat_cols = list_categorical_attributes(flow_type=flow_type)
 num_cols = list(set(X.columns) - set(cat_cols))

-# Missing value imputers
-cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
+# Missing value imputer for numeric columns
 num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)

-# Creating the one-hot encoder
+# Creating the one-hot encoder for numerical representation of categorical columns
 enc = OneHotEncoder(handle_unknown="ignore")

-# Pipeline to handle categorical column transformations
-cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
-
 # Combining column transformers
-ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
+ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])

 # Creating the full pipeline with the surrogate model
 clf = RandomForestRegressor(n_estimators=50)
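The change above works for the same reason as the tutorial edits: starting with scikit-learn 0.24, OneHotEncoder accepts missing values and encodes them as their own category, so a preceding imputer is no longer required. A minimal sketch, assuming scikit-learn >= 0.24:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"], ["b"], [np.nan]], dtype=object)
enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
print(enc.fit_transform(X))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]   <- the NaN row gets its own indicator column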
64 changes: 56 additions & 8 deletions openml/extensions/sklearn/extension.py
@@ -211,6 +211,61 @@ def remove_all_in_parentheses(string: str) -> str:

         return short_name.format(pipeline)

+    @classmethod
+    def _min_dependency_str(cls, sklearn_version: str) -> str:
+        """Returns a string containing the minimum dependencies for the sklearn version passed.
+
+        Parameters
+        ----------
+        sklearn_version : str
+            A version string of the form xx.xx.xx.
+
+        Returns
+        -------
+        str
+        """
+        openml_major_version = int(LooseVersion(openml.__version__).version[1])
+        # This explicit check is necessary to support existing entities on the OpenML servers
+        # that used the fixed dependency string (in the else block)
+        if openml_major_version > 11:
+            # OpenML v0.11 onwards supports sklearn>=0.24
+            # assumption: from 0.24 onwards, sklearn ships a _min_dependencies.py file with
+            # variables declared for extracting the minimum dependencies for that version
+            if LooseVersion(sklearn_version) >= "0.24":
+                from sklearn import _min_dependencies as _mindep
+
+                dependency_list = {
+                    "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
+                    "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
+                    "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
+                    "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
+                }
+            elif LooseVersion(sklearn_version) >= "0.23":
+                dependency_list = {
+                    "numpy": "1.13.3",
+                    "scipy": "0.19.1",
+                    "joblib": "0.11",
+                    "threadpoolctl": "2.0.0",
+                }
+                if LooseVersion(sklearn_version).version[2] == 0:
+                    dependency_list.pop("threadpoolctl")
+            elif LooseVersion(sklearn_version) >= "0.21":
+                dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
+            elif LooseVersion(sklearn_version) >= "0.19":
+                dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
+            else:
+                dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+        else:
+            # this is INCORRECT for sklearn versions >= 0.19 and < 0.24;
+            # given that OpenML has existing flows uploaded with such dependency information,
+            # we change no behaviour for older sklearn versions, however from 0.24 onwards
+            # the dependency list will be accurately updated for any flow uploaded to OpenML
+            dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+
+        sklearn_dep = "sklearn=={}".format(sklearn_version)
+        dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()])
+        return "\n".join([sklearn_dep, dep_str])
+
     ################################################################################################
     # Methods for flow serialization and de-serialization

@@ -769,20 +824,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
             tags=tags,
             extension=self,
             language="English",
-            # TODO fill in dependencies!
             dependencies=dependencies,
         )

         return flow

     def _get_dependencies(self) -> str:
-        dependencies = "\n".join(
-            [
-                self._format_external_version("sklearn", sklearn.__version__,),
-                "numpy>=1.6.1",
-                "scipy>=0.9",
-            ]
-        )
+        dependencies = self._min_dependency_str(sklearn.__version__)
         return dependencies

     def _get_tags(self) -> List[str]:
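To make the new dependency strings concrete, here is roughly what `_min_dependency_str` produces for a pre-0.24 release, assuming an installed openml-python newer than 0.11 so the version-specific table is used (reconstructed from the mapping above, not an official fixture):

from openml.extensions.sklearn import SklearnExtension

print(SklearnExtension._min_dependency_str("0.21.2"))
# sklearn==0.21.2
# numpy>=1.11.0
# scipy>=0.17.0
# joblib>=0.11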
19 changes: 4 additions & 15 deletions tests/conftest.py
@@ -35,16 +35,6 @@
 logger.setLevel(logging.DEBUG)

 file_list = []
-directory = None
-
-# finding the root directory of conftest.py and going up to OpenML main directory
-# exploiting the fact that conftest.py always resides in the root directory for tests
-static_dir = os.path.dirname(os.path.abspath(__file__))
-logger.info("static directory: {}".format(static_dir))
-while True:
-    if "openml" in os.listdir(static_dir):
-        break
-    static_dir = os.path.join(static_dir, "..")


 def worker_id() -> str:
@@ -66,12 +56,11 @@ def read_file_list() -> List[str]:

     :return: List[str]
     """
-    directory = os.path.join(static_dir, "tests/files/")
-    if worker_id() == "master":
-        logger.info("Collecting file lists from: {}".format(directory))
-        files = os.walk(directory)
+    this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
+    directory = os.path.join(this_dir, "..")
+    logger.info("Collecting file lists from: {}".format(directory))
     file_list = []
-    for root, _, filenames in files:
+    for root, _, filenames in os.walk(directory):
         for filename in filenames:
             file_list.append(os.path.join(root, filename))
     return file_list
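The simplification relies on conftest.py always sitting in the tests/ directory, so the directory to scan can be derived from __file__ instead of walking upwards. A minimal sketch of the resolution, with a hypothetical checkout path:

import os

# if this file is /home/user/openml-python/tests/conftest.py ...
this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
directory = os.path.abspath(os.path.join(this_dir, ".."))
print(directory)  # ... this prints /home/user/openml-python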
@@ -146,7 +146,7 @@ def test_serialize_model(self):
         fixture_short_name = "sklearn.DecisionTreeClassifier"
         # str obtained from self.extension._get_sklearn_description(model)
         fixture_description = "A decision tree classifier."
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"'
         # min_impurity_decrease has been introduced in 0.20
@@ -189,6 +189,8 @@ def test_serialize_model(self):
         if LooseVersion(sklearn.__version__) >= "0.22":
             fixture_parameters.update({"ccp_alpha": "0.0"})
             fixture_parameters.move_to_end("ccp_alpha", last=False)
+        if LooseVersion(sklearn.__version__) >= "0.24":
+            del fixture_parameters["presort"]

         structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}

@@ -225,7 +227,7 @@ def test_serialize_model_clustering(self):
         fixture_description = "K-Means clustering{}".format(
             "" if LooseVersion(sklearn.__version__) < "0.22" else "."
         )
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
         precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
@@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
-        else:
+        elif sklearn_version < "0.24":
             fns = [
                 (sklearn.ensemble.RandomForestRegressor.__init__, 18),
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
+        else:
+            fns = [
+                (sklearn.ensemble.RandomForestRegressor.__init__, 18),
+                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
+                (sklearn.pipeline.Pipeline.__init__, 2),
+            ]

         for fn, num_params_with_defaults in fns:
             defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
                 "bootstrap": [True, False],
                 "criterion": ["gini", "entropy"],
             },
-            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
+            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
             n_iter=5,
         )
         flow = self.extension.model_to_flow(model)
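Two sklearn-version details drive these test changes: from 0.24, StratifiedKFold rejects a random_state unless shuffle=True is also set, and the version gating relies on distutils' LooseVersion — the same parser _min_dependency_str indexes into. A quick sketch of the LooseVersion behaviour being leaned on:

from distutils.version import LooseVersion

v = LooseVersion("0.23.1")
print(v.version)           # [0, 23, 1]
print(v >= "0.21")         # True  -- comparison against a plain string works
print(v.version[2] == 0)   # False -- the micro-release check used above to
                           # drop threadpoolctl for sklearn 0.23.0 exactly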
15 changes: 12 additions & 3 deletions tests/test_flows/test_flow_functions.py
@@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         # Note that CI does not test against 0.19.1.
         openml.config.server = self.production_server
         _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
-        flow = 8175
-        expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
+        if sklearn_major > 23:
+            flow = 18587  # 18687, 18725 --- flows building random forest on >= 0.23
+            flow_sklearn_version = "0.23.1"
+        else:
+            flow = 8175
+            flow_sklearn_version = "0.19.1"
+        expected = (
+            "Trying to deserialize a model with dependency "
+            "sklearn=={} not satisfied.".format(flow_sklearn_version)
+        )
         self.assertRaisesRegex(
             ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
         )
@@ -335,7 +343,8 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, strict_version=False)
         # ensure that a new flow was created
         assert flow.flow_id is None
-        assert "0.19.1" not in flow.dependencies
+        assert "sklearn==0.19.1" not in flow.dependencies
+        assert "sklearn>=0.19.1" not in flow.dependencies

     def test_get_flow_id(self):
         if self.long_version:
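For context, the relaxed path this test exercises looks roughly as follows; flow 8175 pins sklearn==0.19.1, so strict reinstantiation fails while strict_version=False rebuilds the flow against the installed sklearn (a sketch of the asserted behaviour, assuming the production server):

import openml

# strict (default): raises ValueError because sklearn==0.19.1 is not installed
# openml.flows.get_flow(flow_id=8175, reinstantiate=True)

# relaxed: reinstantiate anyway; the dependency string is regenerated for the
# installed sklearn, so the old pin disappears
flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
assert flow.flow_id is None  # a new, not-yet-uploaded flow was created
assert "sklearn==0.19.1" not in flow.dependencies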
13 changes: 6 additions & 7 deletions tests/test_study/test_study_examples.py
@@ -1,6 +1,6 @@
 # License: BSD 3-Clause

-from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.testing import TestBase
 from openml.extensions.sklearn import cat, cont

 import sklearn
@@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
     """Test the example code of Bischl et al. (2018)"""

     @unittest.skipIf(
-        LooseVersion(sklearn.__version__) < "0.20",
-        reason="columntransformer introduction in 0.20.0",
+        LooseVersion(sklearn.__version__) < "0.24",
+        reason="the updated example requires sklearn>=0.24 (OneHotEncoder handles missing values)",
     )
     def test_Figure1a(self):
         """Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -39,15 +39,14 @@ def test_Figure1a(self):
         import openml
         import sklearn.metrics
         import sklearn.tree
+        from sklearn.impute import SimpleImputer
         from sklearn.pipeline import Pipeline, make_pipeline
         from sklearn.compose import ColumnTransformer
         from sklearn.preprocessing import OneHotEncoder, StandardScaler

         benchmark_suite = openml.study.get_study("OpenML100", "tasks")  # obtain the benchmark suite
-        cat_imp = make_pipeline(
-            SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
-        )
-        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        cat_imp = OneHotEncoder(handle_unknown="ignore")
+        cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
         ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
         clf = Pipeline(
             steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]