TitanicDataAnalysis_Analyse_Report.R

##########################################################################
# Reprot and futher improvment
# ####################################################
#prepare data for the code to be run independent of other chapters
data <- read.csv("data.csv", header = TRUE)
RE_data <- read.csv("RE_data.csv", header = TRUE)
train <- RE_data[1:891, ]
test <- RE_data[892:1309, ]

# model need to be loaded in to memory
load("RF_model2.rda")
RF_model2

# The model's estimated accuracy (by the model construction) is **83.16%**.
# The default parameters of the model are: `mtry = 2` and `ntree = 500`

# The Top 10 trees and the summary of OOB of RF_model2
head(RF_model2$err.rate, 10)
summary(RF_model2$err.rate)

#The cross validations on the model `RF_model2`,
load("RF_model2_cv.rda")
RF_model2_cv

# 2D Visualization of Model RE_model2's Prediciton
#install.packages("Rtsne")
# library(Rtsne)
# library(ggplot2)

# Rtsne needs a seed to ensure consistent output between runs.
set.seed(984357)
features <- c("Sex", "Fare_pp", "Pclass", "Title", "Age_group", "Group_size", "Ticket_class", "Embarked")
#generate 2-d coordinate
Model_tsne <- Rtsne(train[, features], check_duplicates = FALSE)
# Plot
ggplot(NULL, aes(x = Model_tsne$Y[, 1], y = Model_tsne$Y[, 2], color = as.factor(train$Survived))) +
  geom_point() +
  labs(color = "Survived")

# The importance of the predictorsRF_model2
# library(randomForest)
# library(caret)
varImpPlot(RF_model2, main = "")

# print out the values
pre.or <- sort(RF_model2$importance[,3], decreasing = TRUE)
print(pre.or)

###############################
## Further Analysis
###############################

# The decision tree of RF_model2
# library(rpart.plot)
load("model3.rda")
prp(model3, type = 0, extra = 1, under = TRUE)


## further re-engineering `Title` attribute
#let us further re-Engineer title check the value
table(RE_data$Title)

# Parse out title from the raw data
data$Title <- gsub('(.*, )|(\\..*)', '', data$Name)
table(data$Title)

# Further bin or bucket them into a more appropriate category
# We can do with the knowledge of nobility, locality (country of origin)
# and other knowledge such as time (at the beginning of the 20 century).
# For example, "`Dona`" and "`the Countess`" are female nobility equivalent
# to "`Lady`", and  "`Ms`" and "`Mlle`" are essentially the same with "`Miss`";
# "`Mme`" is a military title equivalent to "`Madame`", so it can be
# categorised as "`Mrs`"; "`Jonkheer`" is an honorific nobility in the
# Netherlands; and "`Don`" is title of a university lecturer, they can be
# categorises as "`Sir`"; "`Col`", "`Capt`", and "`Major`" are military
# ranks and can be replaced with a more general title "`Officer`".
# With all of these, we can reduce the numbers of title's category.

# Re-map titles to be more exact
data$Title[data$Title %in% c("Dona", "the Countess")] <- "Lady"
data$Title[data$Title %in% c("Ms", "Mlle")] <- "Miss"
data$Title[data$Title == "Mme"] <- "Mrs"
data$Title[data$Title %in% c("Jonkheer", "Don")] <- "Sir"
data$Title[data$Title %in% c("Col", "Capt", "Major")] <- "Officer"
table(data$Title)

# Collapse titles based on visual analysis
indexes <- which(data$new.Title == "Lady" |
                   (data$new.Title == "Dr" &
                      data$Sex == "female") |
                   (data$new.Title == "Officer"&
                      data$Sex == "female")
)
data$new.Title[indexes] <- "Mrs"

indexes <- which(data$new.Title == "Rev" |
                   data$new.Title == "Sir" |
                   (data$new.Title == "Officer" &
                      data$Sex == "male")|
                   (data$new.Title == "Dr" &
                      data$Sex == "male")  )
data$new.Title[indexes] <- "Mr"

table(data$new.Title)

# Check any other gender slip-ups?
length(which(data$sex == "female" &
               (data$new.Title == "Master" |
                  data$new.Title == "Mr")))
length(which(data$sex == "male" &
               (data$new.Title == "Miss" |
                  data$new.Title == "Mrs")))

# Visualize
ggplot(data[1:891,], aes(x = new.Title, fill = as.factor(Survived))) +
  geom_bar()

# check up the impact of the new-title
set.seed(2222)
RE_data$New_Title <- data$new.Title
RF_model2_new <- randomForest(as.factor(Survived) ~ Sex + Fare_pp + Pclass + New_Title + Age_group + Group_size + Ticket_class  + Embarked,
                              data = RE_data[1:891,],
                              importance=TRUE)

RF_model2_new

# The new model has increased the over accuracy with 0.45%.
# It is not a lot but it approves the point that features re-engineer
# is a place to do a model's performance improvement.
###########################################################################