"""Train, tune and evaluate the news ML pipeline with MLflow tracking (run_ml.py)."""
# imports
import os
import logging
from contextlib import suppress
import click
import yaml
import numpy as np
from mlflow import log_params, log_artifact
from mlflow.exceptions import MlflowException
import mlflow
# local absolute imports
from src.utils.utils import load_dataset_from_csv, get_train_test_data
from src.ml.train import get_sklearn_pipeline, train_and_hyperparameter_search
from src.ml.eval import evaluate_pipeline
def run_training(config_file: str, verbose: bool = True) -> tuple:
    """Train, tune and evaluate the pipeline described by a YAML config.

    Reads the config, opens an MLflow run in the "news" experiment, fits the
    pipeline with a hyperparameter search on the training split, evaluates the
    result on the test split, and logs the config file, the final
    hyperparameters and the fitted model to the run.

    Args:
        config_file (str): Path to the .yaml file with hyperparameters for
            training. It must provide at least the keys "random_seed" and
            "test_size"; the whole mapping is also handed to the
            hyperparameter search.
        verbose (bool): If true, logs results and meta-information of the run
            at INFO level once training has finished.

    Returns:
        run (mlflow.ActiveRun): Run object with meta-information on the
            current run.
        final_params (Dict): Final hyperparameters of the fitted pipeline.
        test_results (Dict): Test evaluation results of the final pipeline.
    """
    logging.info(f"\nAttempting to run ml pipeline from {config_file}\n")

    # Explicit encoding so parsing does not depend on the platform default.
    with open(config_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(stream=f)

    experiment = mlflow.set_experiment("news")  # pylint: disable=assignment-from-no-return
    # Recent MLflow releases return an Experiment entity here, older ones
    # return None. start_run() expects an id (or None for the active
    # experiment), so extract the id instead of passing the object through.
    experiment_id = getattr(experiment, "experiment_id", None)

    # Best-effort removal of experiment "0" (presumably MLflow's default
    # experiment - TODO confirm); failures are deliberately ignored.
    with suppress(MlflowException):
        mlflow.delete_experiment("0")

    with mlflow.start_run(experiment_id=experiment_id) as run:
        log_artifact(config_file, "pipeline config")
        np.random.seed(config["random_seed"])

        # initialize pipeline with hyperparameters
        pipeline = get_sklearn_pipeline()

        # get training and testing data
        news = load_dataset_from_csv()
        x_train, y_train, x_test, y_test = get_train_test_data(news, config["test_size"])

        # fit and tune on training data
        final_pipeline, final_params = train_and_hyperparameter_search(
            pipeline, config, x_train, y_train)
        log_params(final_params)

        # evaluate final pipeline
        test_results = evaluate_pipeline(final_pipeline, (x_test, y_test))

        # log final model to the run
        mlflow.sklearn.log_model(
            sk_model=final_pipeline,
            artifact_path="model",
            serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE,
        )

    # show meta information about the run (only when asked to be verbose)
    if verbose:
        logging.info(f"\nMeta-information on the training run: \n{run.info}\n")
        logging.info(f"\nFinal hyperparameters from gridsearch: \n{final_params}\n")
        logging.info(f"\nAchieved evaluation results on test set: \n{test_results}\n")

    return run, final_params, test_results
@click.command()
@click.argument("config-file", type=click.Path(exists=True))
@click.option(
    "--verbose/--quiet",
    default=True,
    help="Log run meta-information and results (default: --verbose).",
)
def run_training_cli(config_file: str, verbose: bool = True) -> None:
    """Wraps the run_training to provide it as CLI.

    CONFIG_FILE is the path to the .yaml file with the training
    hyperparameters. Pass --quiet to suppress the informational output.
    """
    # run_training reports through logging.info; without configuration the
    # root logger only emits WARNING and above, so nothing would be shown.
    logging.basicConfig(level=logging.INFO if verbose else logging.WARNING)
    run_training(config_file=config_file, verbose=verbose)
# Script entry point: click parses sys.argv and supplies the arguments itself,
# hence the call without parameters.
if __name__ == "__main__":
    run_training_cli()  # pylint: disable=no-value-for-parameter