Skip to content

Commit

Permalink
Rework local openml directory (#987)
Browse files Browse the repository at this point in the history
* Fix #883 #884 #906 #972

* Address Mitar's comments

* rework for Windows/OSX, some mypy pleasing due to pre-commit

* type fixes and removing unused code
  • Loading branch information
mfeurer authored Feb 10, 2021
1 parent ab793a6 commit 47cda65
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 83 deletions.
2 changes: 1 addition & 1 deletion openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def _read_url_files(url, data=None, file_elements=None):

def __read_url(url, request_method, data=None, md5_checksum=None):
data = {} if data is None else data
if config.apikey is not None:
if config.apikey:
data["api_key"] = config.apikey
return _send_request(
request_method=request_method, url=url, data=data, md5_checksum=md5_checksum
Expand Down
123 changes: 70 additions & 53 deletions openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import logging
import logging.handlers
import os
from pathlib import Path
import platform
from typing import Tuple, cast

from io import StringIO
Expand All @@ -19,7 +21,7 @@
file_handler = None


def _create_log_handlers():
def _create_log_handlers(create_file_handler=True):
""" Creates but does not attach the log handlers. """
global console_handler, file_handler
if console_handler is not None or file_handler is not None:
Expand All @@ -32,12 +34,13 @@ def _create_log_handlers():
console_handler = logging.StreamHandler()
console_handler.setFormatter(output_formatter)

one_mb = 2 ** 20
log_path = os.path.join(cache_directory, "openml_python.log")
file_handler = logging.handlers.RotatingFileHandler(
log_path, maxBytes=one_mb, backupCount=1, delay=True
)
file_handler.setFormatter(output_formatter)
if create_file_handler:
one_mb = 2 ** 20
log_path = os.path.join(cache_directory, "openml_python.log")
file_handler = logging.handlers.RotatingFileHandler(
log_path, maxBytes=one_mb, backupCount=1, delay=True
)
file_handler.setFormatter(output_formatter)


def _convert_log_levels(log_level: int) -> Tuple[int, int]:
Expand Down Expand Up @@ -83,15 +86,18 @@ def set_file_log_level(file_output_level: int):

# Default values (see also /~https://github.com/openml/OpenML/wiki/Client-API-Standards)
_defaults = {
"apikey": None,
"apikey": "",
"server": "https://www.openml.org/api/v1/xml",
"cachedir": os.path.expanduser(os.path.join("~", ".openml", "cache")),
"cachedir": (
os.environ.get("XDG_CACHE_HOME", os.path.join("~", ".cache", "openml",))
if platform.system() == "Linux"
else os.path.join("~", ".openml")
),
"avoid_duplicate_runs": "True",
"connection_n_retries": 10,
"max_retries": 20,
"connection_n_retries": "10",
"max_retries": "20",
}

config_file = os.path.expanduser(os.path.join("~", ".openml", "config"))

# Default values are actually added here in the _setup() function which is
# called at the end of this module
Expand All @@ -116,8 +122,8 @@ def get_server_base_url() -> str:
avoid_duplicate_runs = True if _defaults["avoid_duplicate_runs"] == "True" else False

# Number of retries if the connection breaks
connection_n_retries = _defaults["connection_n_retries"]
max_retries = _defaults["max_retries"]
connection_n_retries = int(_defaults["connection_n_retries"])
max_retries = int(_defaults["max_retries"])


class ConfigurationForExamples:
Expand Down Expand Up @@ -187,62 +193,78 @@ def _setup():
global connection_n_retries
global max_retries

# read config file, create cache directory
try:
os.mkdir(os.path.expanduser(os.path.join("~", ".openml")))
except FileExistsError:
# For other errors, we want to propagate the error as openml does not work without cache
pass
if platform.system() == "Linux":
config_dir = Path(os.environ.get("XDG_CONFIG_HOME", Path("~") / ".config" / "openml"))
else:
config_dir = Path("~") / ".openml"
# Still use os.path.expanduser to trigger the mock in the unit test
config_dir = Path(os.path.expanduser(config_dir))
config_file = config_dir / "config"

# read config file, create directory for config file
if not os.path.exists(config_dir):
try:
os.mkdir(config_dir)
cache_exists = True
except PermissionError:
cache_exists = False
else:
cache_exists = True

if cache_exists:
_create_log_handlers()
else:
_create_log_handlers(create_file_handler=False)
openml_logger.warning(
"No permission to create OpenML directory at %s! This can result in OpenML-Python "
"not working properly." % config_dir
)

config = _parse_config()
config = _parse_config(config_file)
apikey = config.get("FAKE_SECTION", "apikey")
server = config.get("FAKE_SECTION", "server")

short_cache_dir = config.get("FAKE_SECTION", "cachedir")
cache_directory = os.path.expanduser(short_cache_dir)
cache_dir = config.get("FAKE_SECTION", "cachedir")
cache_directory = os.path.expanduser(cache_dir)

# create the cache subdirectory
try:
os.mkdir(cache_directory)
except FileExistsError:
# For other errors, we want to propagate the error as openml does not work without cache
pass
if not os.path.exists(cache_directory):
try:
os.mkdir(cache_directory)
except PermissionError:
openml_logger.warning(
"No permission to create openml cache directory at %s! This can result in "
"OpenML-Python not working properly." % cache_directory
)

avoid_duplicate_runs = config.getboolean("FAKE_SECTION", "avoid_duplicate_runs")
connection_n_retries = config.get("FAKE_SECTION", "connection_n_retries")
max_retries = config.get("FAKE_SECTION", "max_retries")
connection_n_retries = int(config.get("FAKE_SECTION", "connection_n_retries"))
max_retries = int(config.get("FAKE_SECTION", "max_retries"))
if connection_n_retries > max_retries:
raise ValueError(
"A higher number of retries than {} is not allowed to keep the "
"server load reasonable".format(max_retries)
)


def _parse_config():
def _parse_config(config_file: str):
""" Parse the config file, set up defaults. """
config = configparser.RawConfigParser(defaults=_defaults)

if not os.path.exists(config_file):
# Create an empty config file if there was none so far
fh = open(config_file, "w")
fh.close()
logger.info(
"Could not find a configuration file at %s. Going to "
"create an empty file there." % config_file
)

# The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file.
# Cheat the ConfigParser module by adding a fake section header
config_file_ = StringIO()
config_file_.write("[FAKE_SECTION]\n")
try:
# The ConfigParser requires a [SECTION_HEADER], which we do not expect in our config file.
# Cheat the ConfigParser module by adding a fake section header
config_file_ = StringIO()
config_file_.write("[FAKE_SECTION]\n")
with open(config_file) as fh:
for line in fh:
config_file_.write(line)
config_file_.seek(0)
config.read_file(config_file_)
except FileNotFoundError:
logger.info("No config file found at %s, using default configuration.", config_file)
except OSError as e:
logger.info("Error opening file %s: %s", config_file, e.message)
logger.info("Error opening file %s: %s", config_file, e.args[0])
config_file_.seek(0)
config.read_file(config_file_)
return config


Expand All @@ -257,11 +279,7 @@ def get_cache_directory():
"""
url_suffix = urlparse(server).netloc
reversed_url_suffix = os.sep.join(url_suffix.split(".")[::-1])
if not cache_directory:
_cachedir = _defaults(cache_directory)
else:
_cachedir = cache_directory
_cachedir = os.path.join(_cachedir, reversed_url_suffix)
_cachedir = os.path.join(cache_directory, reversed_url_suffix)
return _cachedir


Expand Down Expand Up @@ -297,4 +315,3 @@ def set_cache_directory(cachedir):
]

_setup()
_create_log_handlers()
14 changes: 0 additions & 14 deletions openml/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,8 @@
import time
from typing import Dict, Union, cast
import unittest
import warnings
import pandas as pd

# Currently, importing oslo raises a lot of warning that it will stop working
# under python3.8; remove this once they disappear
with warnings.catch_warnings():
warnings.simplefilter("ignore")
from oslo_concurrency import lockutils

import openml
from openml.tasks import TaskType
from openml.exceptions import OpenMLServerException
Expand Down Expand Up @@ -100,13 +93,6 @@ def setUp(self, n_levels: int = 1):
openml.config.avoid_duplicate_runs = False
openml.config.cache_directory = self.workdir

# If we're on travis, we save the api key in the config file to allow
# the notebook tests to read them.
if os.environ.get("TRAVIS") or os.environ.get("APPVEYOR"):
with lockutils.external_lock("config", lock_path=self.workdir):
with open(openml.config.config_file, "w") as fh:
fh.write("apikey = %s" % openml.config.apikey)

# Increase the number of retries to avoid spurious server failures
self.connection_n_retries = openml.config.connection_n_retries
openml.config.connection_n_retries = 10
Expand Down
10 changes: 6 additions & 4 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _list_all(listing_call, output_format="dict", *args, **filters):
limit=batch_size,
offset=current_offset,
output_format=output_format,
**active_filters
**active_filters,
)
except openml.exceptions.OpenMLServerNoResult:
# we want to return an empty dict in this case
Expand Down Expand Up @@ -277,9 +277,11 @@ def _create_cache_directory(key):
cache = config.get_cache_directory()
cache_dir = os.path.join(cache, key)
try:
os.makedirs(cache_dir)
except OSError:
pass
os.makedirs(cache_dir, exist_ok=True)
except Exception as e:
raise openml.exceptions.OpenMLCacheException(
f"Cannot create cache directory {cache_dir}."
) from e
return cache_dir


Expand Down
20 changes: 17 additions & 3 deletions tests/test_openml/test_config.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
# License: BSD 3-Clause

import tempfile
import os
import unittest.mock

import openml.config
import openml.testing


class TestConfig(openml.testing.TestBase):
def test_config_loading(self):
self.assertTrue(os.path.exists(openml.config.config_file))
self.assertTrue(os.path.isdir(os.path.expanduser("~/.openml")))
@unittest.mock.patch("os.path.expanduser")
@unittest.mock.patch("openml.config.openml_logger.warning")
@unittest.mock.patch("openml.config._create_log_handlers")
def test_non_writable_home(self, log_handler_mock, warnings_mock, expanduser_mock):
with tempfile.TemporaryDirectory(dir=self.workdir) as td:
expanduser_mock.side_effect = (
os.path.join(td, "openmldir"),
os.path.join(td, "cachedir"),
)
os.chmod(td, 0o444)
openml.config._setup()

self.assertEqual(warnings_mock.call_count, 2)
self.assertEqual(log_handler_mock.call_count, 1)
self.assertFalse(log_handler_mock.call_args_list[0][1]["create_file_handler"])


class TestConfigurationForExamples(openml.testing.TestBase):
Expand Down
30 changes: 22 additions & 8 deletions tests/test_utils/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from openml.testing import TestBase
import os
import tempfile
import unittest.mock

import numpy as np
import openml
import sys

if sys.version_info[0] >= 3:
from unittest import mock
else:
import mock
import openml
from openml.testing import TestBase


class OpenMLTaskTest(TestBase):
Expand All @@ -20,7 +19,7 @@ def mocked_perform_api_call(call, request_method):
def test_list_all(self):
openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks)

@mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call)
@unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=mocked_perform_api_call)
def test_list_all_few_results_available(self, _perform_api_call):
# we want to make sure that the number of api calls is only 1.
# Although we have multiple versions of the iris dataset, there is only
Expand Down Expand Up @@ -86,3 +85,18 @@ def test_list_all_for_evaluations(self):

# might not be on test server after reset, please rerun test at least once if fails
self.assertEqual(len(evaluations), required_size)

@unittest.mock.patch("openml.config.get_cache_directory")
def test__create_cache_directory(self, config_mock):
with tempfile.TemporaryDirectory(dir=self.workdir) as td:
config_mock.return_value = td
openml.utils._create_cache_directory("abc")
self.assertTrue(os.path.exists(os.path.join(td, "abc")))
subdir = os.path.join(td, "def")
os.mkdir(subdir)
os.chmod(subdir, 0o444)
config_mock.return_value = subdir
with self.assertRaisesRegex(
openml.exceptions.OpenMLCacheException, r"Cannot create cache directory",
):
openml.utils._create_cache_directory("ghi")

0 comments on commit 47cda65

Please sign in to comment.