-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdata_preprocessing.R
27 lines (22 loc) · 984 Bytes
/
data_preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Load the raw dataset; the path is resolved against the current working
# directory.
data <- read.csv('data.csv')
# Taking care of missing values
# Every NA in Age and Salary is replaced by that column's mean, computed over
# the non-missing entries only; observed values are left unchanged.
data[c('Age', 'Salary')] <- lapply(
  data[c('Age', 'Salary')],
  function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)
)
# Encoding categorical data
# Country and Purchased become factors whose labels are numeric codes.
# Any value not listed in `levels` is turned into NA by factor().
data$Country <- factor(
  data$Country,
  levels = c('France', 'Spain', 'Germany'),
  labels = 1:3
)
data$Purchased <- factor(
  data$Purchased,
  levels = c('No', 'Yes'),
  labels = 0:1
)
# Splitting data into training and testing sets
library(caTools)
# Fix the RNG seed so the same rows are selected on every run.
set.seed(2017)
# `split` is a logical vector with one entry per row of `data`:
# TRUE marks a training row (SplitRatio = 0.8), FALSE a testing row.
split <- sample.split(data$Purchased, SplitRatio = 0.8)
training_set <- data[split, , drop = FALSE]
testing_set <- data[!split, , drop = FALSE]
# Feature scaling
# Standardise columns 2:3 (presumably Age and Salary -- confirm against the
# column order of data.csv). The centring and scaling statistics are estimated
# on the training set only and then reused for the testing set, so both sets
# share one scale and no information from the test rows leaks into the
# preprocessing step.
training_scaled <- scale(training_set[, 2:3])
# scale() stores the column means and standard deviations it used in the
# "scaled:center" and "scaled:scale" attributes of its result.
testing_set[, 2:3] <- scale(
  testing_set[, 2:3],
  center = attr(training_scaled, 'scaled:center'),
  scale = attr(training_scaled, 'scaled:scale')
)
training_set[, 2:3] <- training_scaled