From 5ea2e729c51e73ee9b090c99ecb28732d6561510 Mon Sep 17 00:00:00 2001
From: neeratyoy
Date: Mon, 15 Feb 2021 20:18:13 +0100
Subject: [PATCH 1/3] Squashing commits

---
 .github/workflows/ubuntu-test.yml            |  2 +-
 .../30_extended/flows_and_runs_tutorial.py   | 48 ++++++++-----------
 examples/30_extended/run_setup_tutorial.py   |  9 ++--
 .../40_paper/2018_neurips_perrone_example.py | 10 ++--
 openml/extensions/sklearn/extension.py       | 35 ++++++++++++++
 tests/conftest.py                            | 19 ++------
 .../test_sklearn_extension.py                | 16 +++++--
 tests/test_flows/test_flow_functions.py      | 15 ++++--
 tests/test_study/test_study_examples.py      | 13 +++--
 9 files changed, 96 insertions(+), 71 deletions(-)

diff --git a/.github/workflows/ubuntu-test.yml b/.github/workflows/ubuntu-test.yml
index 33b57179b..21f0e106c 100644
--- a/.github/workflows/ubuntu-test.yml
+++ b/.github/workflows/ubuntu-test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8]
-        scikit-learn: [0.21.2, 0.22.2, 0.23.1]
+        scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
         exclude: # no scikit-learn 0.21.2 release for Python 3.8
           - python-version: 3.8
             scikit-learn: 0.21.2
diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
index 5e73e7e9a..9f8c89375 100644
--- a/examples/30_extended/flows_and_runs_tutorial.py
+++ b/examples/30_extended/flows_and_runs_tutorial.py
@@ -8,7 +8,6 @@
 # License: BSD 3-Clause

 import openml
-import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree

 ############################################################################
@@ -54,7 +53,7 @@
 task = openml.tasks.get_task(403)

 # Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
+clf = tree.DecisionTreeClassifier()

 # Run the flow
 run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
 # ############################
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
-task = openml.tasks.get_task(1)
+# To demonstrate this, we use the dataset `credit-a `_ via
+# `task `_, as it contains both numerical and categorical
+# variables and missing values in both.
+task = openml.tasks.get_task(96)

 # OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
 from openml.extensions.sklearn import cat, cont

@@ -96,20 +98,14 @@
                 [
                     (
                         "categorical",
-                        pipeline.Pipeline(
-                            [
-                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                                (
-                                    "Encoder",
-                                    preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore"
-                                    ),
-                                ),
-                            ]
-                        ),
+                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                         cat,  # returns the categorical feature indices
                     ),
-                    ("continuous", "passthrough", cont),  # returns the numeric feature indices
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        cont,
+                    ),  # returns the numeric feature indices
                 ]
             ),
         ),
@@ -146,20 +142,14 @@
                 [
                     (
                         "categorical",
-                        pipeline.Pipeline(
-                            [
-                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                                (
-                                    "Encoder",
-                                    preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore"
-                                    ),
-                                ),
-                            ]
-                        ),
+                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                         categorical_feature_indices,
                     ),
-                    ("continuous", "passthrough", numeric_feature_indices),
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        numeric_feature_indices,
+                    ),
                 ]
             ),
         ),
@@ -182,7 +172,9 @@
 task = openml.tasks.get_task(6)

 # The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
+run = openml.runs.run_model_on_task(
+    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
+)

 # The run may be stored offline, and the flow will be stored along with it:
 run.to_filesystem(directory="myrun")
diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py
index afc49a98b..8579d1d38 100644
--- a/examples/30_extended/run_setup_tutorial.py
+++ b/examples/30_extended/run_setup_tutorial.py
@@ -59,12 +59,9 @@
 # easy as you want it to be

-cat_imp = make_pipeline(
-    SimpleImputer(strategy="most_frequent"),
-    OneHotEncoder(handle_unknown="ignore", sparse=False),
-    TruncatedSVD(),
-)
-ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
+cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
+cont_imp = SimpleImputer(strategy="median")
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
 model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])

 # Let's change some hyperparameters. Of course, in any good application we
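Both example diffs above settle on the same preprocessing pattern: one-hot encode the categorical columns directly and impute only the numeric ones. Below is a minimal self-contained sketch of that pattern, assuming scikit-learn >= 0.24 (where, to my reading, OneHotEncoder treats NaN as just another category, which is what allows dropping the explicit most_frequent imputation step); the toy data is made up for illustration:

    import numpy as np
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.tree import DecisionTreeClassifier

    # Toy data with missing values in both a categorical and a numeric column
    X = pd.DataFrame({"color": ["red", np.nan, "blue", "red"], "size": [1.0, 2.0, np.nan, 4.0]})
    y = [0, 1, 0, 1]

    ct = ColumnTransformer(
        [
            ("cat", OneHotEncoder(sparse=False, handle_unknown="ignore"), ["color"]),
            ("num", SimpleImputer(strategy="median"), ["size"]),
        ]
    )
    clf = Pipeline([("preprocess", ct), ("estimator", DecisionTreeClassifier())])
    clf.fit(X, y)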
diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py
index 60d212116..5ae339ae2 100644
--- a/examples/40_paper/2018_neurips_perrone_example.py
+++ b/examples/40_paper/2018_neurips_perrone_example.py
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
 cat_cols = list_categorical_attributes(flow_type=flow_type)
 num_cols = list(set(X.columns) - set(cat_cols))

-# Missing value imputers
-cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
+# Missing value imputer for the numeric columns
 num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)

-# Creating the one-hot encoder
+# Creating the one-hot encoder for a numerical representation of the categorical columns
 enc = OneHotEncoder(handle_unknown="ignore")

-# Pipeline to handle categorical column transformations
-cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
-
 # Combining column transformers
-ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
+ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])

 # Creating the full pipeline with the surrogate model
 clf = RandomForestRegressor(n_estimators=50)
diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index 0d049c4fd..ed2760782 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -211,6 +211,41 @@ def remove_all_in_parentheses(string: str) -> str:

         return short_name.format(pipeline)

+    @classmethod
+    def _min_dependency_str(cls, sklearn_version: str) -> str:
+        """ Returns a string containing the minimum dependencies for the sklearn version passed.
+
+        Parameters
+        ----------
+        sklearn_version : str
+            A version string of the form xx.xx.xx
+
+        Returns
+        -------
+        str
+        """
+        if LooseVersion(sklearn_version) >= "0.24":
+            # assumption: from 0.24 onwards sklearn ships a _min_dependencies.py file with
+            # variables declaring the minimum dependency versions for that release
+            from sklearn import _min_dependencies as _mindep
+
+            dependency_list = {
+                "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
+                "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
+                "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
+                "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
+            }
+        else:
+            # this is INCORRECT for sklearn versions >= 0.19 and < 0.24
+            # given that OpenML has existing flows uploaded with such dependency information,
+            # we change no behaviour for older sklearn versions; however, from 0.24 onwards
+            # the dependency list will be accurately updated for any flow uploaded to OpenML
+            dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+
+        sklearn_dep = "sklearn=={}".format(sklearn_version)
+        dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()])
+        return "\n".join([sklearn_dep, dep_str])
+
     ################################################################################################
     # Methods for flow serialization and de-serialization
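As a quick illustration of what the new helper returns: a minimal sketch, assuming this patch is applied and scikit-learn 0.24 is installed. The minimum versions come from sklearn's own _min_dependencies module, so the values quoted in the comment are indicative only:

    import sklearn
    from openml.extensions.sklearn import SklearnExtension

    print(SklearnExtension._min_dependency_str(sklearn.__version__))
    # Indicative output on sklearn 0.24:
    #   sklearn==0.24.0
    #   numpy>=1.13.3
    #   scipy>=0.19.1
    #   joblib>=0.11
    #   threadpoolctl>=2.0.0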
diff --git a/tests/conftest.py b/tests/conftest.py
index 1b733ac19..c1f728a72 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,16 +35,6 @@
 logger.setLevel(logging.DEBUG)

 file_list = []
-directory = None
-
-# finding the root directory of conftest.py and going up to OpenML main directory
-# exploiting the fact that conftest.py always resides in the root directory for tests
-static_dir = os.path.dirname(os.path.abspath(__file__))
-logger.info("static directory: {}".format(static_dir))
-while True:
-    if "openml" in os.listdir(static_dir):
-        break
-    static_dir = os.path.join(static_dir, "..")


 def worker_id() -> str:
@@ -66,12 +56,11 @@ def read_file_list() -> List[str]:

     :return: List[str]
     """
-    directory = os.path.join(static_dir, "tests/files/")
-    if worker_id() == "master":
-        logger.info("Collecting file lists from: {}".format(directory))
-        files = os.walk(directory)
+    this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
+    directory = os.path.join(this_dir, "..")
+    logger.info("Collecting file lists from: {}".format(directory))
     file_list = []
-    for root, _, filenames in files:
+    for root, _, filenames in os.walk(directory):
         for filename in filenames:
             file_list.append(os.path.join(root, filename))
     return file_list
diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
index 8ca6f9d45..4dc8744f1 100644
--- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
+++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py
@@ -146,7 +146,7 @@ def test_serialize_model(self):
         fixture_short_name = "sklearn.DecisionTreeClassifier"
         # str obtained from self.extension._get_sklearn_description(model)
         fixture_description = "A decision tree classifier."
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"'
         # min_impurity_decrease has been introduced in 0.20
@@ -189,6 +189,8 @@ def test_serialize_model(self):
         if LooseVersion(sklearn.__version__) >= "0.22":
             fixture_parameters.update({"ccp_alpha": "0.0"})
             fixture_parameters.move_to_end("ccp_alpha", last=False)
+        if LooseVersion(sklearn.__version__) >= "0.24":
+            del fixture_parameters["presort"]

         structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}

@@ -225,7 +227,7 @@ def test_serialize_model_clustering(self):
         fixture_description = "K-Means clustering{}".format(
             "" if LooseVersion(sklearn.__version__) < "0.22" else "."
         )
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
         precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
@@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
-        else:
+        elif sklearn_version < "0.24":
             fns = [
                 (sklearn.ensemble.RandomForestRegressor.__init__, 18),
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
+        else:
+            fns = [
+                (sklearn.ensemble.RandomForestRegressor.__init__, 18),
+                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
+                (sklearn.pipeline.Pipeline.__init__, 2),
+            ]

         for fn, num_params_with_defaults in fns:
             defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
                 "bootstrap": [True, False],
                 "criterion": ["gini", "entropy"],
             },
-            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
+            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
             n_iter=5,
         )
         flow = self.extension.model_to_flow(model)
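The cv change above follows a scikit-learn deprecation that, to my reading, became an error in 0.24: random_state only has an effect when shuffling, so StratifiedKFold now rejects a seed combined with shuffle=False. A small sketch:

    from sklearn.model_selection import StratifiedKFold

    # Valid on all versions in the CI matrix: a fixed seed with shuffling
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

    # On scikit-learn >= 0.24 this raises a ValueError instead of a warning:
    # cv = StratifiedKFold(n_splits=2, random_state=1)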
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
index 8ebbdef2b..a65dcbf70 100644
--- a/tests/test_flows/test_flow_functions.py
+++ b/tests/test_flows/test_flow_functions.py
@@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         # Note that CI does not test against 0.19.1.
         openml.config.server = self.production_server
         _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
-        flow = 8175
-        expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
+        if sklearn_major > 23:
+            flow = 18587  # flows 18687 and 18725 also build a random forest on sklearn >= 0.23
+            flow_sklearn_version = "0.23.1"
+        else:
+            flow = 8175
+            flow_sklearn_version = "0.19.1"
+        expected = (
+            "Trying to deserialize a model with dependency "
+            "sklearn=={} not satisfied.".format(flow_sklearn_version)
+        )
         self.assertRaisesRegex(
             ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
         )
@@ -335,7 +343,8 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, strict_version=False)
         # ensure that a new flow was created
         assert flow.flow_id is None
-        assert "0.19.1" not in flow.dependencies
+        assert "sklearn==0.19.1" not in flow.dependencies
+        assert "sklearn>=0.19.1" not in flow.dependencies

     def test_get_flow_id(self):
         if self.long_version:
diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py
index e2a228aee..682359a61 100644
--- a/tests/test_study/test_study_examples.py
+++ b/tests/test_study/test_study_examples.py
@@ -1,6 +1,6 @@
 # License: BSD 3-Clause

-from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.testing import TestBase
 from openml.extensions.sklearn import cat, cont

 import sklearn
@@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
     """Test the example code of Bischl et al. (2018)"""

     @unittest.skipIf(
-        LooseVersion(sklearn.__version__) < "0.20",
-        reason="columntransformer introduction in 0.20.0",
+        LooseVersion(sklearn.__version__) < "0.24",
+        reason="the example pipeline requires sklearn>=0.24",
     )
     def test_Figure1a(self):
         """Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -39,15 +39,14 @@ def test_Figure1a(self):
         import openml
         import sklearn.metrics
         import sklearn.tree
+        from sklearn.impute import SimpleImputer
         from sklearn.pipeline import Pipeline, make_pipeline
         from sklearn.compose import ColumnTransformer
         from sklearn.preprocessing import OneHotEncoder, StandardScaler

         benchmark_suite = openml.study.get_study("OpenML100", "tasks")  # obtain the benchmark suite
-        cat_imp = make_pipeline(
-            SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
-        )
-        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        cat_imp = OneHotEncoder(handle_unknown="ignore")
+        cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
         ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
         clf = Pipeline(
             steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]

From 844257695a59c37f0eb9d6d618304723b991bd4d Mon Sep 17 00:00:00 2001
From: neeratyoy
Date: Tue, 16 Feb 2021 17:55:32 +0100
Subject: [PATCH 2/3] All flow dependencies for sklearn>0.24 will change now

---
 openml/extensions/sklearn/extension.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index ed2760782..7bb2d3ef0 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -804,20 +804,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
             tags=tags,
             extension=self,
             language="English",
-            # TODO fill in dependencies!
             dependencies=dependencies,
         )

         return flow

     def _get_dependencies(self) -> str:
-        dependencies = "\n".join(
-            [
-                self._format_external_version("sklearn", sklearn.__version__,),
-                "numpy>=1.6.1",
-                "scipy>=0.9",
-            ]
-        )
+        dependencies = self._min_dependency_str(sklearn.__version__)
         return dependencies

     def _get_tags(self) -> List[str]:

From ffb4a2d72db28c8ab7e47c2d88710bbd0a292022 Mon Sep 17 00:00:00 2001
From: neeratyoy
Date: Thu, 18 Feb 2021 01:15:12 +0100
Subject: [PATCH 3/3] Dep. string change only for OpenML>v0.11

---
 openml/extensions/sklearn/extension.py | 38 ++++++++++++++++++++------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py
index 7bb2d3ef0..4442f798c 100644
--- a/openml/extensions/sklearn/extension.py
+++ b/openml/extensions/sklearn/extension.py
@@ -224,17 +224,37 @@ def _min_dependency_str(cls, sklearn_version: str) -> str:
         -------
         str
         """
-        if LooseVersion(sklearn_version) >= "0.24":
+        openml_minor_version = int(LooseVersion(openml.__version__).version[1])
+        # This explicit check is necessary to support existing entities on the OpenML servers
+        # that used the fixed dependency string (in the else block)
+        if openml_minor_version > 11:
+            # OpenML versions after v0.11 support sklearn>=0.24
             # assumption: from 0.24 onwards sklearn ships a _min_dependencies.py file with
             # variables declaring the minimum dependency versions for that release
-            from sklearn import _min_dependencies as _mindep
-
-            dependency_list = {
-                "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
-                "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
-                "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
-                "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
-            }
+            if LooseVersion(sklearn_version) >= "0.24":
+                from sklearn import _min_dependencies as _mindep
+
+                dependency_list = {
+                    "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
+                    "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
+                    "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
+                    "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
+                }
+            elif LooseVersion(sklearn_version) >= "0.23":
+                dependency_list = {
+                    "numpy": "1.13.3",
+                    "scipy": "0.19.1",
+                    "joblib": "0.11",
+                    "threadpoolctl": "2.0.0",
+                }
+                if LooseVersion(sklearn_version).version[2] == 0:
+                    dependency_list.pop("threadpoolctl")
+            elif LooseVersion(sklearn_version) >= "0.21":
+                dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
+            elif LooseVersion(sklearn_version) >= "0.19":
+                dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
+            else:
+                dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
         else:
             # this is INCORRECT for sklearn versions >= 0.19 and < 0.24
             # given that OpenML has existing flows uploaded with such dependency information,
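A minimal standalone sketch of the version gate introduced above, assuming distutils' LooseVersion semantics; the OpenML version strings are illustrative:

    from distutils.version import LooseVersion

    # "0.11.0" parses to [0, 11, 0]; .version[1] picks out the 11, so the new
    # dependency strings apply only to OpenML releases after v0.11.
    for v in ["0.10.2", "0.11.0", "0.12.0"]:
        minor = int(LooseVersion(v).version[1])
        print(v, "-> uses new dependency string:", minor > 11)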