#########################################

#Machine Learning with R: an introduction

#########################################

############Ionosphere##########

#Basic R
#Machine learning specific packages
#caret

#load the packages

library(dplyr)     # for data manipulation
library(tidyverse)
library(ggplot2)  # for awesome graphics
library(naniar) #Nas visual
library(GGally) #correlations
library(corrplot) #correlation

# Modeling packages
library(caret)    # for cross-validation, etc.
library(glmnet) #GLM models
library(earth) #Mars
library(rpart) #decision trees
library(rpart.plot) #plot decision tree
library(randomForest) #random forest
library(gbm) #gbm
library(e1071) #svm

# Model interpretability packages
library(vip)      # variable importance

#access dataset
library(mlbench)
data(Ionosphere)
# Run ?Ionosphere for the dataset documentation.

# Quick look at the raw data: dimensions first, then the column structure
dim(Ionosphere)
str(Ionosphere)

# View(Ionosphere)  # uncomment for the interactive viewer

# Plot where (if anywhere) values are missing
vis_miss(Ionosphere)

# ---- Data cleaning ----
# Work on a copy named `dataset` so the Ionosphere object itself is untouched.
# V2 (the second column) is redundant, so it is left out of the copy.
dataset <- Ionosphere[, -2]

# Turn V1 into a plain numeric column. Going through as.character() converts
# the stored labels rather than any underlying integer codes.
# typeof() is printed before and after to show the change.
typeof(dataset$V1)
dataset$V1 <- as.numeric(as.character(dataset$V1))
typeof(dataset$V1)

# ---- Descriptive statistics and exploratory plots ----

# Base R: per-column summary
summary(dataset)

# ggplot2: V3 values plotted for each class
ggplot(dataset, aes(x = Class, y = V3)) +
  geom_point()

# caret: box plots of the predictors, split by the outcome
y <- dataset[[34]]   # outcome
x <- dataset[, -34]  # predictors

featurePlot(x = x, y = y, plot = "box")

# Base R: V7 against the class
plot(dataset$V7, dataset$Class)


# ---- Train/test split ----
set.seed(123)
# Row indices of the training portion (80% of the rows)
validation_index <- createDataPartition(dataset$Class, p = 0.8, list = FALSE)
# those rows are used to train the models ...
train <- dataset[validation_index, ]
# ... and the remaining 20% are held out for validation
test <- dataset[-validation_index, ]

# Logistic regression

# Multiple logistic regression on all predictors, fitted with base R glm()
model <- glm(Class ~ ., family = binomial, data = train)
summary(model)

# Predictions on the held-out data.
# For a factor response, binomial glm() treats the first level as "failure"
# and models the probability of the other level, so `pred` is the probability
# of the SECOND level of Class. Values above 0.5 must therefore map to the
# second level, not the first.
class_levels <- levels(test$Class)
pred <- predict(model, test, type = "response")
# factor(..., levels = ) keeps both levels even if only one class is predicted,
# which confusionMatrix() needs to line up with the reference.
predictions <- factor(
  ifelse(pred > 0.5, class_levels[2], class_levels[1]),
  levels = class_levels
)
confusionMatrix(predictions, test$Class)

# The same logistic regression, this time fitted through caret's train()

set.seed(123) # reproducible resampling
model <- train(
  Class ~ .,
  data = train,
  method = "glm",
  family = "binomial"
)
model
summary(model)

# Accuracy on the held-out data
predictions <- predict(model, newdata = test, type = "raw")
confusionMatrix(predictions, test$Class)

# Pre-processing: drop zero-variance predictors, then center and scale
set.seed(123)
model <- train(
  Class ~ .,
  # Fit on the training rows only. Fitting on the full dataset would let the
  # model (and the pre-processing statistics) see the test rows that are
  # scored below, inflating the reported accuracy.
  data = train,
  method = "glm",
  family = "binomial",
  preProcess = c("zv", "center", "scale")
)
model
summary(model)

# Accuracy on the held-out data
predictions <- predict(model, test, type = "raw")
confusionMatrix(predictions, test$Class)

# Cross-validation
# 10-fold CV specification, reused by every tuned model further down.
# (No seed is needed here: trainControl() only records the settings; the folds
# are drawn inside train().)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

# Seed immediately before train() so the folds are reproducible
set.seed(123)
fit.rl <- train(
  Class ~ .,
  data = train,
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  preProcess = c("zv", "center", "scale")
)
fit.rl

# Accuracy on the held-out data
predictions <- predict(fit.rl, test, type = "raw")
confusionMatrix(predictions, test$Class)

# SVM (classification: the response Class is categorical, so this is a
# classifier rather than a regression)
library(e1071)

# Quick default fit with e1071's svm(). It uses the training rows only, like
# every other model in this script, so the held-out rows stay unseen.
model <- svm(Class ~ ., data = train)
print(model)

# Seed right before train() so the CV folds are reproducible
set.seed(123)

# Candidate settings for the radial-kernel SVM: three costs, one fixed sigma
tuneGrid <- expand.grid(
  C = c(0.25, 0.5, 1),
  sigma = 0.1
)

fit.svm <- train(
  Class ~ .,
  data = train,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.svm

# Accuracy on the held-out data
predictions <- predict(fit.svm, test)
confusionMatrix(predictions, test$Class)

# Variable importance
varImp(fit.svm)
plot(varImp(fit.svm))

# k-nearest neighbours
set.seed(123)

# Baseline fit with caret's default resampling and default grid of k
fit.knn <- train(
  Class ~ .,
  data = train,
  method = "knn"
)
fit.knn

# Pre-processing, cross-validation and tuning
tuneGrid <- expand.grid(
  k = seq(5, 9, by = 1)
)

# Re-seed before the tuned fit: the baseline fit above has already consumed
# random numbers, so without this the CV folds would differ from those of the
# other models and the resamples() comparison would not be like-for-like.
set.seed(123)
fit.knn <- train(
  Class ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.knn

# Accuracy on the held-out data
predictions <- predict(fit.knn, test)
confusionMatrix(predictions, test$Class)

# Variable importance
varImp(fit.knn)
plot(varImp(fit.knn))

# ---- Decision tree ----

# Single classification tree fitted directly with rpart, then drawn
set.seed(123)
fit.dt <- rpart(Class ~ ., data = train, method = "class")

rpart.plot(fit.dt)

# Tuned tree through caret: complexity parameter from 0.02 to 0.2 in 0.02 steps
set.seed(123)
tuneGrid <- data.frame(cp = seq(0.02, 0.2, by = 0.02))

fit.dt <- train(
  Class ~ .,
  data = train,
  method = "rpart",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.dt
vip(fit.dt)

# Accuracy on the held-out data
predictions <- predict(fit.dt, test)
confusionMatrix(predictions, test$Class)

# Random forest

# Baseline forest with default settings. It is fitted on the cleaned training
# rows (not the raw Ionosphere object, which still contains the redundant V2
# column and the held-out rows) and seeded so the printed OOB error is
# reproducible.
set.seed(123)
model <- randomForest(Class ~ ., data = train)
model

# Seed right before train() so the CV folds are reproducible
set.seed(123)

# Number of predictors tried at each split
tuneGrid <- expand.grid(
  mtry = 2:4
)

fit.rf <- train(
  Class ~ .,
  data = train,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)
fit.rf

# Variable importance
vip(fit.rf)

# Accuracy on the held-out data
predictions <- predict(fit.rf, test)
confusionMatrix(predictions, test$Class)


# ---- Gradient boosting (GBM) ----

# model = gbm(Class~., data = dataset)

set.seed(123)

# Grid: 50 or 100 trees, depth 1 or 2, with shrinkage and minimum node size
# held fixed
tuneGrid <- expand.grid(
  n.trees = c(50, 100),
  interaction.depth = 1:2,
  shrinkage = 0.1,
  n.minobsinnode = 10
)

fit.gbm <- train(
  Class ~ .,
  data = train,
  method = "gbm",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid,
  verbose = FALSE # passed on to gbm to silence per-iteration output
)
fit.gbm

# Accuracy on the held-out data
predictions <- predict(fit.gbm, test)
confusionMatrix(predictions, test$Class)

# ---- Model selection ----
# Gather the resampling results of every tuned model and compare them
results <- resamples(
  list(
    rl  = fit.rl,
    svm = fit.svm,
    knn = fit.knn,
    dt  = fit.dt,
    rf  = fit.rf,
    gbm = fit.gbm
  )
)
summary(results)
bwplot(results)

# Details of the model taken as best here (the random forest)
print(fit.rf)
fit.rf$results

# Final predictions on the held-out data with that model
predictions <- predict(fit.rf, test)
confusionMatrix(predictions, test$Class)



