#########################################

#Machine Learning with R: an introduction

#########################################

############Titanic##########

#Basic R
#Machine learning specific packages
#caret

# Suppress warning messages for the rest of the session.
# A negative `warn` value makes R ignore every warning, so warnings raised by
# the model fits further down will not be shown either.
options(warn=-1)

#load the packages

library(dplyr)     # for data manipulation
library(tidyverse)
library(ggplot2)  # for awesome graphics
library(naniar) #Nas visual

# Modeling packages
library(caret)    # for cross-validation, etc.
library(glmnet) #GLM models
library(rpart) #decision trees
library(rpart.plot) #plot decision tree
library(randomForest) #random forest
library(gbm) #gbm
library(e1071) #svm

# Model interpretability packages
library(vip)      # variable importance

#access dataset
library(titanic)
# Load the `titanic` data set into the workspace.
# NOTE(review): the code below expects an object called `titanic` with
# lowercase columns (pclass, sex, age, survived). The CRAN `titanic` package
# appears to ship `titanic_train` / `titanic_test` instead -- confirm which
# package/version provides this object.
data(titanic)
#Type ?titanic for more information about the data

# Inspect data: dimensions first, then the type of every column
dim(titanic)
str(titanic)

# Uncomment to browse the data in the spreadsheet-style viewer
#View(titanic)

# Missing values: visual map of NAs per column (naniar), followed by the
# per-column summary, which also reports NA counts
vis_miss(titanic)
summary(titanic)

# Cleaning data
#
# - store the cleaned data under a new name (`dataset`) so the raw `titanic`
#   object is left untouched
# - convert the categorical variables (pclass, sex, survived) to factors
# - drop the rows whose age is missing
#
# Missing ages are tested with is.na() directly. Comparing `age` against
# `is.na(age)` would compare it with a logical (0/1): NAs would only vanish
# as a side effect and any row with age == 0 would be discarded as well.
dataset <- titanic %>%
  mutate(
    pclass = factor(pclass),
    sex = factor(sex),
    survived = factor(survived)
  ) %>%
  filter(!is.na(age))

# Exploratory data analysis

# Mosaic plot of the three categorical variables at once
mosaicplot(
  ~ sex + pclass + survived,
  data = dataset,
  color = TRUE
)


# Base R graphics
# Age distribution within each survival group
boxplot(dataset$age ~ dataset$survived)
# Stacked bars: counts by sex inside each survival group
sex_by_survival <- table(dataset$sex, dataset$survived)
barplot(sex_by_survival)

# The same bar chart with ggplot2
ggplot(dataset, aes(x = survived, fill = sex)) +
  geom_bar()

# Train / test split, stratified on the outcome
set.seed(123)
# Despite its name, `validation_index` holds the row numbers of the 80% of
# observations that go into the training set.
validation_index <- createDataPartition(
  dataset$survived,
  p = 0.80,
  list = FALSE
)
# 80% of the rows are used to train the models
train <- dataset[validation_index, ]
# the remaining 20% are held out for validation
test <- dataset[-validation_index, ]

# Logistic regression

# Multiple logistic regression: survival explained by every other column
model <- glm(survived ~ ., family = binomial, data = train)
summary(model)
exp(model$coefficients) # odds ratios

# Some predictions
# With a factor response, a binomial glm treats the first level as "failure"
# and models the probability of the other level, so `pred` is the fitted
# probability of the second level of `survived`.
pred <- predict(model, test, type = "response")

# Build the predicted classes from the outcome's own levels instead of
# hard-coded labels, and fix the level set explicitly: confusionMatrix()
# needs both factors to share the same levels, even when only one class
# happens to be predicted.
outcome_levels <- levels(test$survived)
predictions <- factor(
  as.vector(ifelse(pred > 0.5, outcome_levels[2], outcome_levels[1])),
  levels = outcome_levels
)
confusionMatrix(predictions, test$survived)

# The same logistic regression, this time fitted through caret

set.seed(123) # makes caret's resampling reproducible
model <- train(
  survived ~ .,
  data = train,
  method = "glm",
  family = "binomial"
)
model
summary(model)

# Accuracy on the held-out rows
predictions <- predict(model, newdata = test, type = "raw")
confusionMatrix(data = predictions, reference = test$survived)

# Pre-processing
# "zv" drops zero-variance predictors; "center" and "scale" standardise the
# remaining ones. The model is fitted on `train` only: fitting on the full
# `dataset` would let the model see the rows of `test` and make the accuracy
# reported below optimistic.
set.seed(123)
model <- train(
  survived ~ .,
  data = train,
  method = "glm",
  family = "binomial",
  preProcess = c("zv", "center", "scale")
)
model
summary(model)

# Accuracy on the held-out rows
predictions <- predict(model, test, type = "raw")
confusionMatrix(predictions, test$survived)

# Cross-validation
# 10-fold CV settings; `ctrl` is reused by every tuned model further down.
# (No seed is needed here: the seed that matters is the one set right before
# each train() call.)
ctrl <- trainControl(
  method = "cv",
  number = 10
)

# Logistic regression evaluated with 10-fold CV
set.seed(123)
fit.rl <- train(
  survived ~ .,
  data = train,
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  preProcess = c("zv", "center", "scale")
)
fit.rl

# Accuracy on the held-out rows
predictions <- predict(fit.rl, test, type = "raw")
confusionMatrix(predictions, test$survived)

# SVM (classification: `survived` is a factor)
library(e1071)

# Quick fit with e1071 defaults, shown only for its printed summary.
# Fitted on `train` so the held-out rows stay unseen, like every other model.
model <- svm(survived ~ ., data = train)
print(model)

# Same seed as the other caret fits, so the CV folds line up across models
set.seed(123)

# Radial-kernel SVM: try three cost values with the kernel width held fixed
tuneGrid <- expand.grid(
  C = c(0.25, 0.5, 1),
  sigma = 0.1
)

fit.svm <- train(
  survived ~ .,
  data = train,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.svm

# Accuracy on the held-out rows
predictions <- predict(fit.svm, test)
confusionMatrix(predictions, test$survived)

# Variable importance
varImp(fit.svm)
plot(varImp(fit.svm))

# k-nearest neighbours
set.seed(123)

# Baseline: caret defaults (no pre-processing, default resampling and grid)
fit.knn <- train(
  survived ~ .,
  data = train,
  method = "knn"
)
fit.knn

# Pre-process, cross-validation and tuning
tuneGrid <- expand.grid(
  k = seq(5, 9, by = 1)
)

# Re-seed before the tuned fit: the baseline fit above has advanced the random
# number generator, and without this the 10 CV folds would differ from the
# ones used by the other models that resamples() compares later on.
set.seed(123)
fit.knn <- train(
  survived ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.knn

# Accuracy on the held-out rows
predictions <- predict(fit.knn, test)
confusionMatrix(predictions, test$survived)

# Variable importance
varImp(fit.knn)
plot(varImp(fit.knn))

# Decision tree

# A single classification tree grown directly with rpart, then drawn
set.seed(123)
fit.dt <- rpart(survived ~ ., data = train, method = "class")

rpart.plot(fit.dt)

# Tune the complexity parameter `cp` through caret, reusing the CV settings
set.seed(123)
tuneGrid <- data.frame(cp = seq(from = 0.02, to = 0.2, by = 0.02))

fit.dt <- train(
  survived ~ .,
  data = train,
  method = "rpart",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)

fit.dt
vip(fit.dt)

# Accuracy on the held-out rows

predictions <- predict(fit.dt, test)
confusionMatrix(predictions, test$survived)

# Random forest

# Quick fit with randomForest defaults, shown only for its printed summary.
# Seeded so the forest is reproducible, and fitted on `train` so the held-out
# rows stay unseen.
set.seed(123)
model <- randomForest(survived ~ ., data = train)
model

set.seed(123)

# Number of predictors sampled at each split
tuneGrid <- expand.grid(
  mtry = c(2:4)
)

fit.rf <- train(
  survived ~ .,
  data = train,
  method = "rf",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid
)
fit.rf

# Variable importance
vip(fit.rf)

# Accuracy on the held-out rows
predictions <- predict(fit.rf, test)
confusionMatrix(predictions, test$survived)


# Gradient boosting machine (GBM)

#model = gbm(survived~., data = dataset)

set.seed(123)

# 2 x 2 grid over the number of trees and the tree depth; the learning rate
# and the minimum node size are held fixed
tuneGrid <- expand.grid(
  n.trees = c(50, 100),
  interaction.depth = c(1, 2),
  shrinkage = 0.1,
  n.minobsinnode = 10
)

fit.gbm <- train(
  survived ~ .,
  data = train,
  method = "gbm",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = tuneGrid,
  verbose = FALSE # forwarded to gbm to silence its per-iteration log
)
fit.gbm

# Accuracy on the held-out rows
predictions <- predict(fit.gbm, test)
confusionMatrix(predictions, test$survived)

# Select the best model
# Collect the cross-validated metrics of every tuned model side by side
fitted_models <- list(
  rl = fit.rl,
  svm = fit.svm,
  knn = fit.knn,
  dt = fit.dt,
  rf = fit.rf,
  gbm = fit.gbm
)
results <- resamples(fitted_models)
summary(results)
bwplot(results)

# Summarise the model picked as best (GBM)
print(fit.gbm)
fit.gbm$results

# Make some predictions with it on the held-out rows
predictions <- predict(fit.gbm, test)
confusionMatrix(predictions, test$survived)

