-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprediction_model_knn_manual.py
94 lines (77 loc) · 5.69 KB
/
prediction_model_knn_manual.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class PredictionModelKNNManual:
""" Manual Machine Learning Model - function that outputs prediction based on input to the model """
def __init__(self, prediction_config, prediction_data, prediction_utils):
self.prediction_config = prediction_config
self.prediction_data = prediction_data
self.prediction_utils = prediction_utils
self.training_columns = self.prediction_data.training_columns
self.target_column = self.prediction_config.DATASET_LOCATION[self.prediction_config.DATASET_CHOICE]["target_column"]
def get_target_column_prediction(self, model_feature_name):
_temp_testing_part = self.prediction_data.testing_part
column_name_predicted_target = "predicted_" + self.target_column + "_" + model_feature_name
self.prediction_data.testing_part[column_name_predicted_target] = _temp_testing_part[model_feature_name].apply(lambda x: self.process_target_prediction(model_feature_name, x))
print("Predicted Target Column (i.e. 'price') using Model %r: %r" % (model_feature_name, self.prediction_data.testing_part[column_name_predicted_target]) )
def process_target_prediction(self, model_feature_name, model_feature_value):
""" Compare, Inspect, Randomise, Cleanse, and Filter
Prior to Randomising and then Sorting, we Inspect and check the value count for "distance" value of 0. Its value is amount of
other rental listings that also accommodate 3 people, using feature (i.e. "accommodates" or "bathrooms"). Avoid bias (toward just the sort order by "distance"
column of the data set) when choosing the "nearest neighbors" (all may have distance 0 since only want 5 and there
are may be around 461 indexes with that distance). Show all listing indexes that have a distance of 0 from my data set
During Comparison, assign distance values to new "distance" column of Data Frame Series object.
During Inspection, use the Panda Series method value_counts to display unique value counts for each "distance" column
"""
column_name_distance_feature = "distance_" + model_feature_name
# Compare
_temp_training_part = self.prediction_data.training_part
# print(_temp_training_part[model_feature_name])
_temp_training_part[column_name_distance_feature] = self.prediction_utils.compare_observations(model_feature_value, _temp_training_part[model_feature_name])
# Inspect
# print(_temp_training_part[column_name_distance_feature].value_counts()) # .index.tolist()
# print(_temp_training_part[_temp_training_part[column_name_distance_feature] == 0][model_feature_name])
# Randomise and Sort
_temp_training_part_randomised = self.prediction_utils.randomise_dataframe_rows(_temp_training_part)
_temp_training_part_sorted = self.prediction_utils.sort_dataframe_by_feature(_temp_training_part_randomised, column_name_distance_feature)
# print(_temp_training_part_sorted.iloc[0:10][self.prediction_config.TARGET_COLUMN])
# Filter
predicted_target = self.prediction_utils.get_nearest_neighbors(_temp_training_part_sorted, model_feature_name)
return predicted_target
def get_mean_absolute_error(self, model_feature_name):
""" Mean Absolute Error (MAE) calculation """
mae = self.prediction_utils.calc_mean_absolute_error(self.prediction_data.testing_part, model_feature_name)
print("MAE for Model feature %r: %r: " % (model_feature_name, mae) )
return mae
def get_mean_squared_error(self, model_feature_name):
""" Mean Squared Error (MSE) calculation
MSE improved prediction accuracy over MAE since penalises predicted values that are further
from the actual value more that those that are closer to the actual value
"""
mse = self.prediction_utils.calc_mean_squared_error(self.prediction_data.testing_part, model_feature_name)
print("MSE for Model feature %r: %r: " % (model_feature_name, mse) )
return mse
def get_root_mean_squared_error(self, model_feature_name):
""" Root Mean Squared Error (RMSE) calculation
RMSE helps understand performance of prediction accuracy over MSE and MAE since
it takes the square root of MSE so the units matching base unit of the target feature
"""
rmse = self.prediction_utils.calc_root_mean_squared_error(self.prediction_data.testing_part, model_feature_name)
print("RMSE for Model feature %r: %r: " % (model_feature_name, rmse) )
return rmse
def run(prediction_config, prediction_data, prediction_utils):
prediction_model_manual = PredictionModelKNNManual(prediction_config, prediction_data, prediction_utils)
for index, training_model_feature_name in enumerate(prediction_data.get_training_columns()):
prediction_model_manual.get_target_column_prediction(training_model_feature_name)
mae = prediction_model_manual.get_mean_absolute_error(training_model_feature_name) # MAE
mse = prediction_model_manual.get_mean_squared_error(training_model_feature_name) # MSE
rmse = prediction_model_manual.get_root_mean_squared_error(training_model_feature_name) # RMSE
if mae and rmse:
mae_rmse_ratio_prefix = mae / rmse
print("MAE to RMSE Ratio: %.2f:1" % (mae_rmse_ratio_prefix) )
if prediction_config.PLOT_INDIVIDUAL_TRAIN_FEATURES_VS_TARGET == True:
prediction_utils.plot(training_model_feature_name, prediction_data.testing_part)
return {
"feature_names": training_model_feature_name,
"rmse": rmse,
"k_neighbors_qty": prediction_config.HYPERPARAMETER_FIXED,
"k_folds_qty": None,
"k_fold_cross_validation_toggle": False
}