Adding sklearn min. dependencies for all versions #1022

Merged (3 commits) on Feb 18, 2021

Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/ubuntu-test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8]
-        scikit-learn: [0.21.2, 0.22.2, 0.23.1]
+        scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
         exclude:  # no scikit-learn 0.21.2 release for Python 3.8
           - python-version: 3.8
             scikit-learn: 0.21.2
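For reference, the new matrix expands to the cross product of the two version lists minus the excluded pair. A quick sketch of that arithmetic — a hypothetical illustration of GitHub Actions' matrix/exclude semantics, not part of the workflow itself:

from itertools import product

# the version lists from the matrix above
pythons = ["3.6", "3.7", "3.8"]
sklearns = ["0.21.2", "0.22.2", "0.23.1", "0.24"]
# no scikit-learn 0.21.2 release for Python 3.8
excluded = {("3.8", "0.21.2")}

combos = [c for c in product(pythons, sklearns) if c not in excluded]
print(len(combos))  # 11 jobs: 3 x 4 combinations minus the 1 exclusion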
48 changes: 20 additions & 28 deletions examples/30_extended/flows_and_runs_tutorial.py
@@ -8,7 +8,6 @@
 # License: BSD 3-Clause

 import openml
-import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree

 ############################################################################
@@ -54,7 +53,7 @@
 task = openml.tasks.get_task(403)

 # Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
+clf = tree.DecisionTreeClassifier()

 # Run the flow
 run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
 # ############################
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
-task = openml.tasks.get_task(1)
+# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
+# `task <https://test.openml.org/t/96>`_, as it contains both numerical and categorical
+# variables as well as missing values in both.
+task = openml.tasks.get_task(96)

 # OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
 from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
         [
             (
                 "categorical",
-                pipeline.Pipeline(
-                    [
-                        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                        (
-                            "Encoder",
-                            preprocessing.OneHotEncoder(
-                                sparse=False, handle_unknown="ignore"
-                            ),
-                        ),
-                    ]
-                ),
+                preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                 cat,  # returns the categorical feature indices
             ),
-            ("continuous", "passthrough", cont),  # returns the numeric feature indices
+            (
+                "continuous",
+                impute.SimpleImputer(strategy="median"),
+                cont,
+            ),  # returns the numeric feature indices
         ]
     )
 ),
@@ -146,20 +142,14 @@
         [
             (
                 "categorical",
-                pipeline.Pipeline(
-                    [
-                        ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                        (
-                            "Encoder",
-                            preprocessing.OneHotEncoder(
-                                sparse=False, handle_unknown="ignore"
-                            ),
-                        ),
-                    ]
-                ),
+                preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                 categorical_feature_indices,
             ),
-            ("continuous", "passthrough", numeric_feature_indices),
+            (
+                "continuous",
+                impute.SimpleImputer(strategy="median"),
+                numeric_feature_indices,
+            ),
         ]
     )
 ),
@@ -182,7 +172,9 @@
 task = openml.tasks.get_task(6)

 # The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
+run = openml.runs.run_model_on_task(
+    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
+)

 # The run may be stored offline, and the flow will be stored along with it:
 run.to_filesystem(directory="myrun")
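Assembled end to end, the updated tutorial pipeline looks roughly like the sketch below. It is a minimal sketch assuming scikit-learn >= 0.24 (whose OneHotEncoder treats missing values as their own category, which is why the categorical imputer could be dropped) and the OpenML test server that hosts task 96; the step names and the final estimator are illustrative, not prescribed by the diff.

import openml
from openml.extensions.sklearn import cat, cont
from sklearn import compose, impute, pipeline, preprocessing, tree

openml.config.start_using_configuration_for_example()  # task 96 lives on the test server

task = openml.tasks.get_task(96)  # credit-a: categorical + numeric features, missing values

pipe = pipeline.Pipeline(
    steps=[
        (
            "preprocess",  # illustrative step name
            compose.ColumnTransformer(
                [
                    # sklearn >= 0.24: OneHotEncoder copes with NaN, no imputer needed
                    (
                        "categorical",
                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                        cat,  # helper returning the categorical feature indices
                    ),
                    # numeric columns still need their missing values imputed
                    ("continuous", impute.SimpleImputer(strategy="median"), cont),
                ]
            ),
        ),
        ("estimator", tree.DecisionTreeClassifier()),  # any classifier would do
    ]
)

run = openml.runs.run_model_on_task(
    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array"
)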
9 changes: 3 additions & 6 deletions examples/30_extended/run_setup_tutorial.py
@@ -59,12 +59,9 @@
 # easy as you want it to be


-cat_imp = make_pipeline(
-    SimpleImputer(strategy="most_frequent"),
-    OneHotEncoder(handle_unknown="ignore", sparse=False),
-    TruncatedSVD(),
-)
-ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
+cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD())
+cont_imp = SimpleImputer(strategy="median")
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
 model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier())])

 # Let's change some hyperparameters. Of course, in any good application we
10 changes: 3 additions & 7 deletions examples/40_paper/2018_neurips_perrone_example.py
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
 cat_cols = list_categorical_attributes(flow_type=flow_type)
 num_cols = list(set(X.columns) - set(cat_cols))

-# Missing value imputers
-cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
+# Missing value imputer for numeric columns
 num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)

-# Creating the one-hot encoder
+# Creating the one-hot encoder for numerical representation of categorical columns
 enc = OneHotEncoder(handle_unknown="ignore")

-# Pipeline to handle categorical column transformations
-cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
-
 # Combining column transformers
-ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
+ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])

 # Creating the full pipeline with the surrogate model
 clf = RandomForestRegressor(n_estimators=50)
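The change above works for the same reason as the tutorial edits: starting with scikit-learn 0.24, OneHotEncoder accepts missing values and encodes them as their own category, so a preceding imputer is no longer required. A minimal sketch, assuming scikit-learn >= 0.24:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([["a"], ["b"], [np.nan]], dtype=object)
enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
print(enc.fit_transform(X))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]   <- the NaN row gets its own indicator column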
64 changes: 56 additions & 8 deletions openml/extensions/sklearn/extension.py
@@ -211,6 +211,61 @@ def remove_all_in_parentheses(string: str) -> str:

         return short_name.format(pipeline)

+    @classmethod
+    def _min_dependency_str(cls, sklearn_version: str) -> str:
+        """Returns a string containing the minimum dependencies for the sklearn version passed.
+
+        Parameters
+        ----------
+        sklearn_version : str
+            A version string of the form xx.xx.xx.
+
+        Returns
+        -------
+        str
+        """
+        openml_major_version = int(LooseVersion(openml.__version__).version[1])
+        # This explicit check is necessary to support existing entities on the OpenML servers
+        # that used the fixed dependency string (in the else block)
+        if openml_major_version > 11:
+            # OpenML v0.11 onwards supports sklearn>=0.24
+            # assumption: from 0.24 onwards, sklearn ships a _min_dependencies.py file with
+            # variables declared for extracting the minimum dependencies for that version
+            if LooseVersion(sklearn_version) >= "0.24":
+                from sklearn import _min_dependencies as _mindep
+
+                dependency_list = {
+                    "numpy": "{}".format(_mindep.NUMPY_MIN_VERSION),
+                    "scipy": "{}".format(_mindep.SCIPY_MIN_VERSION),
+                    "joblib": "{}".format(_mindep.JOBLIB_MIN_VERSION),
+                    "threadpoolctl": "{}".format(_mindep.THREADPOOLCTL_MIN_VERSION),
+                }
+            elif LooseVersion(sklearn_version) >= "0.23":
+                dependency_list = {
+                    "numpy": "1.13.3",
+                    "scipy": "0.19.1",
+                    "joblib": "0.11",
+                    "threadpoolctl": "2.0.0",
+                }
+                if LooseVersion(sklearn_version).version[2] == 0:
+                    dependency_list.pop("threadpoolctl")
+            elif LooseVersion(sklearn_version) >= "0.21":
+                dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"}
+            elif LooseVersion(sklearn_version) >= "0.19":
+                dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"}
+            else:
+                dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+        else:
+            # this is INCORRECT for sklearn versions >= 0.19 and < 0.24;
+            # given that OpenML has existing flows uploaded with such dependency information,
+            # we change no behaviour for older sklearn versions, however from 0.24 onwards
+            # the dependency list will be accurately updated for any flow uploaded to OpenML
+            dependency_list = {"numpy": "1.6.1", "scipy": "0.9"}
+
+        sklearn_dep = "sklearn=={}".format(sklearn_version)
+        dep_str = "\n".join(["{}>={}".format(k, v) for k, v in dependency_list.items()])
+        return "\n".join([sklearn_dep, dep_str])
+
     ################################################################################################
     # Methods for flow serialization and de-serialization

@@ -769,20 +824,13 @@ def _serialize_model(self, model: Any) -> OpenMLFlow:
             tags=tags,
             extension=self,
             language="English",
-            # TODO fill in dependencies!
             dependencies=dependencies,
         )

         return flow

     def _get_dependencies(self) -> str:
-        dependencies = "\n".join(
-            [
-                self._format_external_version("sklearn", sklearn.__version__,),
-                "numpy>=1.6.1",
-                "scipy>=0.9",
-            ]
-        )
+        dependencies = self._min_dependency_str(sklearn.__version__)
         return dependencies

     def _get_tags(self) -> List[str]:
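To make the new dependency strings concrete, here is roughly what `_min_dependency_str` produces for a pre-0.24 release, assuming an installed openml-python newer than 0.11 so the version-specific table is used (reconstructed from the mapping above, not an official fixture):

from openml.extensions.sklearn import SklearnExtension

print(SklearnExtension._min_dependency_str("0.21.2"))
# sklearn==0.21.2
# numpy>=1.11.0
# scipy>=0.17.0
# joblib>=0.11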
19 changes: 4 additions & 15 deletions tests/conftest.py
@@ -35,16 +35,6 @@
 logger.setLevel(logging.DEBUG)

 file_list = []
-directory = None
-
-# finding the root directory of conftest.py and going up to OpenML main directory
-# exploiting the fact that conftest.py always resides in the root directory for tests
-static_dir = os.path.dirname(os.path.abspath(__file__))
-logger.info("static directory: {}".format(static_dir))
-while True:
-    if "openml" in os.listdir(static_dir):
-        break
-    static_dir = os.path.join(static_dir, "..")


 def worker_id() -> str:
@@ -66,12 +56,11 @@ def read_file_list() -> List[str]:

     :return: List[str]
     """
-    directory = os.path.join(static_dir, "tests/files/")
-    if worker_id() == "master":
-        logger.info("Collecting file lists from: {}".format(directory))
-        files = os.walk(directory)
+    this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
+    directory = os.path.join(this_dir, "..")
+    logger.info("Collecting file lists from: {}".format(directory))
     file_list = []
-    for root, _, filenames in files:
+    for root, _, filenames in os.walk(directory):
         for filename in filenames:
             file_list.append(os.path.join(root, filename))
     return file_list
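The simplification relies on conftest.py always sitting in the tests/ directory, so the directory to scan can be derived from __file__ instead of walking upwards. A minimal sketch of the resolution, with a hypothetical checkout path:

import os

# if this file is /home/user/openml-python/tests/conftest.py ...
this_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
directory = os.path.abspath(os.path.join(this_dir, ".."))
print(directory)  # ... this prints /home/user/openml-python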
@@ -146,7 +146,7 @@ def test_serialize_model(self):
         fixture_short_name = "sklearn.DecisionTreeClassifier"
         # str obtained from self.extension._get_sklearn_description(model)
         fixture_description = "A decision tree classifier."
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         presort_val = "false" if LooseVersion(sklearn.__version__) < "0.22" else '"deprecated"'
         # min_impurity_decrease has been introduced in 0.20
@@ -189,6 +189,8 @@ def test_serialize_model(self):
         if LooseVersion(sklearn.__version__) >= "0.22":
             fixture_parameters.update({"ccp_alpha": "0.0"})
             fixture_parameters.move_to_end("ccp_alpha", last=False)
+        if LooseVersion(sklearn.__version__) >= "0.24":
+            del fixture_parameters["presort"]

         structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}

@@ -225,7 +227,7 @@ def test_serialize_model_clustering(self):
         fixture_description = "K-Means clustering{}".format(
             "" if LooseVersion(sklearn.__version__) < "0.22" else "."
         )
-        version_fixture = "sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9" % sklearn.__version__
+        version_fixture = self.extension._min_dependency_str(sklearn.__version__)

         n_jobs_val = "null" if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
         precomp_val = '"auto"' if LooseVersion(sklearn.__version__) < "0.23" else '"deprecated"'
@@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
-        else:
+        elif sklearn_version < "0.24":
             fns = [
                 (sklearn.ensemble.RandomForestRegressor.__init__, 18),
                 (sklearn.tree.DecisionTreeClassifier.__init__, 14),
                 (sklearn.pipeline.Pipeline.__init__, 2),
             ]
+        else:
+            fns = [
+                (sklearn.ensemble.RandomForestRegressor.__init__, 18),
+                (sklearn.tree.DecisionTreeClassifier.__init__, 13),
+                (sklearn.pipeline.Pipeline.__init__, 2),
+            ]

         for fn, num_params_with_defaults in fns:
             defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
                 "bootstrap": [True, False],
                 "criterion": ["gini", "entropy"],
             },
-            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
+            cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
             n_iter=5,
         )
         flow = self.extension.model_to_flow(model)
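Two sklearn-version details drive these test changes: from 0.24, StratifiedKFold rejects a random_state unless shuffle=True is also set, and the version gating relies on distutils' LooseVersion — the same parser _min_dependency_str indexes into. A quick sketch of the LooseVersion behaviour being leaned on:

from distutils.version import LooseVersion

v = LooseVersion("0.23.1")
print(v.version)           # [0, 23, 1]
print(v >= "0.21")         # True  -- comparison against a plain string works
print(v.version[2] == 0)   # False -- the micro-release check used above to
                           # drop threadpoolctl for sklearn 0.23.0 exactly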
15 changes: 12 additions & 3 deletions tests/test_flows/test_flow_functions.py
@@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         # Note that CI does not test against 0.19.1.
         openml.config.server = self.production_server
         _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
-        flow = 8175
-        expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
+        if sklearn_major > 23:
+            flow = 18587  # 18687, 18725 --- flows building random forest on >= 0.23
+            flow_sklearn_version = "0.23.1"
+        else:
+            flow = 8175
+            flow_sklearn_version = "0.19.1"
+        expected = (
+            "Trying to deserialize a model with dependency "
+            "sklearn=={} not satisfied.".format(flow_sklearn_version)
+        )
         self.assertRaisesRegex(
             ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
         )
@@ -335,7 +343,8 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
         flow = openml.flows.get_flow(flow_id=flow, reinstantiate=True, strict_version=False)
         # ensure that a new flow was created
         assert flow.flow_id is None
-        assert "0.19.1" not in flow.dependencies
+        assert "sklearn==0.19.1" not in flow.dependencies
+        assert "sklearn>=0.19.1" not in flow.dependencies

     def test_get_flow_id(self):
         if self.long_version:
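For context, the relaxed path this test exercises looks roughly as follows; flow 8175 pins sklearn==0.19.1, so strict reinstantiation fails while strict_version=False rebuilds the flow against the installed sklearn (a sketch of the asserted behaviour, assuming the production server):

import openml

# strict (default): raises ValueError because sklearn==0.19.1 is not installed
# openml.flows.get_flow(flow_id=8175, reinstantiate=True)

# relaxed: reinstantiate anyway; the dependency string is regenerated for the
# installed sklearn, so the old pin disappears
flow = openml.flows.get_flow(flow_id=8175, reinstantiate=True, strict_version=False)
assert flow.flow_id is None  # a new, not-yet-uploaded flow was created
assert "sklearn==0.19.1" not in flow.dependencies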
13 changes: 6 additions & 7 deletions tests/test_study/test_study_examples.py
@@ -1,6 +1,6 @@
 # License: BSD 3-Clause

-from openml.testing import TestBase, SimpleImputer, CustomImputer
+from openml.testing import TestBase
 from openml.extensions.sklearn import cat, cont

 import sklearn
@@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
     """Test the example code of Bischl et al. (2018)"""

     @unittest.skipIf(
-        LooseVersion(sklearn.__version__) < "0.20",
-        reason="columntransformer introduction in 0.20.0",
+        LooseVersion(sklearn.__version__) < "0.24",
+        reason="the updated example requires sklearn>=0.24 (OneHotEncoder handles missing values)",
     )
     def test_Figure1a(self):
         """Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -39,15 +39,14 @@ def test_Figure1a(self):
         import openml
         import sklearn.metrics
         import sklearn.tree
+        from sklearn.impute import SimpleImputer
         from sklearn.pipeline import Pipeline, make_pipeline
         from sklearn.compose import ColumnTransformer
         from sklearn.preprocessing import OneHotEncoder, StandardScaler

         benchmark_suite = openml.study.get_study("OpenML100", "tasks")  # obtain the benchmark suite
-        cat_imp = make_pipeline(
-            SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
-        )
-        cont_imp = make_pipeline(CustomImputer(), StandardScaler())
+        cat_imp = OneHotEncoder(handle_unknown="ignore")
+        cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
         ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
         clf = Pipeline(
             steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]