Skip to content

Commit

Permalink
Merge pull request #43 from pabloarosado/dev
Browse files (browse the repository at this point in the history)
Generalise validation function
  • Loading branch information
pabloarosado authored Mar 16, 2021
2 parents 7653318 + 4a8cf4f commit a2a1e9f
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 14 deletions.
2 changes: 1 addition & 1 deletion modev/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_data(self, reload=False):
def get_indexes(self, reload=False):
_check_requirements([self.data], self.requirements_error_message)
if (self.train_indexes is None and self.test_indexes is None) or reload:
self.train_indexes, self.test_indexes = self.validation_function(self.data.index, **self.validation_pars)
self.train_indexes, self.test_indexes = self.validation_function(self.data, **self.validation_pars)
return self.train_indexes, self.test_indexes

def get_results(self, reload=False):
Expand Down
24 changes: 12 additions & 12 deletions modev/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def temporal_folds_split(raw_indexes, min_n_train_examples, dev_n_sets):
return parts


def one_set_n_sets_split(raw_indexes, test_fraction, test_n_sets, first_set_name, second_set_name,
def one_set_n_sets_split(data, test_fraction, test_n_sets, first_set_name, second_set_name,
labels=default_pars.validation_pars_labels, shuffle=default_pars.validation_pars_shuffle,
random_state=default_pars.random_state):
"""Splits a raw set of indexes into one set (e.g. a playground) and n sets (e.g. test sets).
Expand All @@ -110,8 +110,8 @@ def one_set_n_sets_split(raw_indexes, test_fraction, test_n_sets, first_set_name
Parameters
----------
raw_indexes : array_like
Indexes of data (e.g. data.index, assuming data is a pandas dataframe).
data : pd.DataFrame
Indexed data (e.g. a dataframe whose index can be accessed with data.index).
test_fraction : float
Fraction of data to use for test sets.
test_n_sets : int
Expand All @@ -135,7 +135,7 @@ def one_set_n_sets_split(raw_indexes, test_fraction, test_n_sets, first_set_name
"""
# To begin with, the raw dataset is train, and there is only one test set (named 'test_0'), which is empty.
indexes = {f'{first_set_name}_0': np.array(raw_indexes),
indexes = {f'{first_set_name}_0': np.array(data.index),
f'{second_set_name}_0': np.array([], dtype=int)}

# To avoid warnings, impose random_state None if there is no shuffling.
Expand Down Expand Up @@ -178,7 +178,7 @@ def _split_train_and_test_indexes(indexes, test_mode):
return train_indexes, test_indexes


def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars.validation_pars_playground_n_folds,
def k_fold_playground_n_tests_split(data, playground_n_folds=default_pars.validation_pars_playground_n_folds,
test_fraction=default_pars.validation_pars_test_fraction,
test_n_sets=default_pars.validation_pars_test_n_sets,
labels=default_pars.validation_pars_labels,
Expand All @@ -194,8 +194,8 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars
Parameters
----------
raw_indexes : array_like
All indexes of data. This could simply be the output of 'data.index' (assuming data is a pandas dataframe).
data : pd.DataFrame
Indexed data (e.g. a dataframe whose index can be accessed with data.index).
playground_n_folds : int
Number of folds to split playground into (also called 'k'), so that there will be k train sets and k dev sets.
test_fraction : float
Expand Down Expand Up @@ -226,7 +226,7 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars
if not shuffle:
random_state = None
# Split data set into playground and test set(s).
indexes = one_set_n_sets_split(raw_indexes=raw_indexes, test_fraction=test_fraction, test_n_sets=test_n_sets,
indexes = one_set_n_sets_split(data=data, test_fraction=test_fraction, test_n_sets=test_n_sets,
first_set_name=playground_key, second_set_name=test_key,
labels=labels, shuffle=shuffle, random_state=random_state)
# Split playground into k train and k dev sets.
Expand All @@ -242,7 +242,7 @@ def k_fold_playground_n_tests_split(raw_indexes, playground_n_folds=default_pars
return train_indexes, test_indexes


def temporal_fold_playground_n_tests_split(raw_indexes,
def temporal_fold_playground_n_tests_split(data,
min_n_train_examples=default_pars.validation_min_n_train_examples,
dev_n_sets=default_pars.validation_dev_n_sets,
test_fraction=default_pars.validation_pars_test_fraction,
Expand All @@ -261,8 +261,8 @@ def temporal_fold_playground_n_tests_split(raw_indexes,
Parameters
----------
raw_indexes : array_like
All indexes of data. This could simply be the output of 'data.index' (assuming data is a pandas dataframe).
data : pd.DataFrame
Indexed data (e.g. a dataframe whose index can be accessed with data.index).
min_n_train_examples : int
Minimum number of examples in a train set. It will be the exact number of examples of the first train set.
All subsequent train sets will be larger than the first one.
Expand All @@ -287,7 +287,7 @@ def temporal_fold_playground_n_tests_split(raw_indexes,
"""
# Split data set into playground and test set(s) without shuffling or stratifying (so they keep their order).
indexes = one_set_n_sets_split(raw_indexes=raw_indexes, test_fraction=test_fraction, test_n_sets=test_n_sets,
indexes = one_set_n_sets_split(data=data, test_fraction=test_fraction, test_n_sets=test_n_sets,
first_set_name=playground_key, second_set_name=test_key,
labels=None, shuffle=False, random_state=None)
# Split playground into k train and k dev temporal folds.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="modev",
version="0.4.0",
version="0.5.0",
author="Pablo Rosado",
author_email="mail@pablorosado.com",
description="Model Development for Data Science Projects.",
Expand Down

0 comments on commit a2a1e9f

Please sign in to comment.