TitanicDataAnalysis1_Understand_Data.R

############################################################################
#
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# Unless required by applicable law or agreed to in writing, software
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
#
# The base of this code was from Dave Langer "Intoduction to Data Science."
# Credit to him. You can find him on Github: https://github.com/EasyD
# I have also integrated other sources like Titanic Forum, Python code
# and some Chinese R code.
#
# Notably,
# https://www.kaggle.com/startupsci/titanic-data-science-solutions
#
# The whole purpose of this is to teach My students on Data Science.
#
# This R source code file corresponds to video 1 of the YouTube series
# "Introduction to Data Science with R" located at the following URL:
#
# The task is to build a model based on the train data
# then to predict test data who can survive in Kaggle Titanic competition
##########################################################################
# 1. Understand Data
##########################################################################

# RStudio help function
# what help with any R function just type ?
# for example ?read.csv in Console

# First explore of datasets
# the first R code you can use is str.
# use ?str to find more

help(str)

# use str to explore train and test
str(train)
str(test)

# Add a "Survived" variable to the test set to allow for combining data sets
test <- data.frame(Survived = rep("NA", nrow(test)), test[,])

# Now check test. you can see that it has 12 variable now.
# And Survived is the first variable. because we used test[,]
# if you want add Survived into the second position Do this,
# test <- data.frame(test, Survived = rep("NA", nrow(test)), test[,2:ncol(test)])

# Combine datasets together. actually append test to train
data <- rbind(train, test)
# We may need to keep the raw data into a file in case we need it later.
write.csv(data, "data.cvs", row.names = FALSE )

#### We have done combined datasest with only a few line of code
# notice that the type of Survived has been changed to Chr.
# This is because we used "NA" as its value
# A bit about R data types
# ?str structure of dataset
# chr, int
# Factor in R is 'category'. it likes a selection from a list.
# for example, Cabin Factor w/187 levels. It means there are 187 selections.
# Sex Factor w/ 2 "female", "male", 2,1 means, two options 2- female, 1- male.
#
# NA : not available (absent value, missing value)

str(data)

# Exam PassengerID, type INT, we can check total number and the number of unique values.
# If they are equal and both equal to the number of records. it means there are
# unique and has no missing value.
length(data\$PassengerId)
length(unique(data\$PassengerId))

### Exam Survived
data\$Survived <- as.factor(data\$Survived)
table(data\$Survived)

# Calculate the survive rate in train data is 38% and the death rate is 61.61%
prop.table(table(as.factor(train\$Survived)))

### Examine Pclass value,
# Look into Kaggle's explanation about Pclass: it is a proxy for social class i.e. rich or poor
# It should be factor and it does not make sense to stay in int.
data\$Pclass <- as.factor(data\$Pclass)
test\$Pclass <- as.factor(test\$Pclass)
train\$Pclass <- as.factor(train\$Pclass)

# Distribution across classes
table(data\$Pclass)

# Distribution across classes with survive
table(data\$Pclass, data\$Survived)

# Calculate the distribution on Pclass
# Overall passenger distribution on classes.
prop.table(table(data\$Pclass))

# Train data passenger distribution on classes.
prop.table(table(train\$Pclass))

# Test data passenger distribution on classes.
prop.table(table(test\$Pclass))

# Calculate death distribution across classes with Train data
SurviveOverClass <- table(train\$Pclass, train\$Survived)
# Convert SurviveOverClass into data frame
SoC.data.fram <- data.frame(SurviveOverClass)
# Retrieve death distribution in classes
Death.distribution.on.class <- SoC.data.fram\$Freq[SoC.data.fram\$Var2==0]
prop.table(Death.distribution.on.class)

# Calculate death rate in train data
# Distribution across classes with survive in Train data
SurviveOverClass <- table(train\$Pclass, train\$Survived)
# Convert SurviveOverClass into data frame
SoC.data.fram <- data.frame(SurviveOverClass)
# Retrieve death distribution in classes
Death.distribution.on.class <- SoC.data.fram\$Freq[SoC.data.fram\$Var2==0]
prop.table(Death.distribution.on.class)

# Try summary on data
summary.data.frame(data)

## Exploratory data analysis with graph
# Load up ggplot2 package to use for visualizations
library(ggplot2)

# High class passenger has more chance of survive than passenger with lower class
# Hypothesis - Rich passengers can but expensive ticket. class=1 is more expensive
# survived at a higher rate
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
geom_bar(width = 0.5) +
xlab("Pclass") +
ylab("Total Count") +
labs(fill = "Survived")

# It sort of proved the survive rate with social class
# more people perished in the third class

### Exam Name attribute
# the original name typed as factor, which we really don't want (shows the uniqueness)
# convert Name type
data\$Name <- as.character(data\$Name)

# Confirm the name has 1037 unique values
length(unique(data\$Name))

# Find the two duplicate names
# First used which function to get the duplicate names and store them as a vector dup.names
# check it up ?which.
dup.names <- data[which(duplicated(data\$Name)), "Name"]

# Echo out
dup.names

### Exam Sex attribute
# Retrial male and females. then check their numbers.
summary(data\$Sex)

#Plot Sex distribution on entire dataset and get general an impression
ggplot(data[1:891,], aes(x = Sex)) +
geom_bar(fill="steelblue") +
xlab("Sex") +
ylab("Total Count")

# plot Survived over Sex on train. use data[1:891,]
ggplot(data[1:891,], aes(x = Sex, fill = Survived)) +
geom_bar() +
xlab("Sex") +
ylab("Total Count") +
labs(fill = "Survived")

### Examine Age
# Summary over data, train and test.
summary(data\$Age)
summary(train\$Age)
summary(test\$Age)

#It makes sense to change Age type to Factor to see distribution
summary(as.factor(data\$Age))

# Plot distribution of age group
ggplot(data, aes(x = Age)) +
geom_histogram(binwidth = 10, fill="steelblue") +
xlab("Age") +
ylab("Total Count")

# Plot Survived on age group on train dataset
ggplot(data[1:891,], aes(x = Age, fill = Survived)) +
geom_histogram(binwidth = 10) +
xlab("Age") +
ylab("Total Count")

### Exam SibSp, Its original type is int
summary((data\$SibSp))

# How many possible unique values?
length(unique(data\$SibSp))

# Treat it as a factor, so we know the value distribution
data\$SibSp <- as.factor(data\$SibSp)
summary(data\$SibSp)

# Plot entire SibSp distribution among the 7 values
ggplot(data, aes(x = SibSp)) +
geom_bar() +
xlab("SibSp") +
ylab("Total Count")+
coord_cartesian()

# Plot on the Survived on SibSp
ggplot(data[1:891,], aes(x = SibSp, fill = Survived)) +
geom_bar() +
xlab("SibSp") +
ylab("Total Count")  +
labs(fill = "Survived")

### Exam Parch
summary(data\$Parch)

# How many possible values?
length(unique(data\$Parch))

# Treat is as a factor so we know the value distribution
data\$Parch <- as.factor(data\$Parch)
summary(data\$Parch)

# Plot entire Parch distribution among the 8 posibilites
ggplot(data, aes(x = Parch)) +
geom_bar() +
xlab("Parch") +
ylab("Total Count")

# Plot on the Survived on SibSp
ggplot(data[1:891,], aes(x = Parch, fill = Survived)) +
geom_bar() +
xlab("Parch") +
ylab("Total Count")  +
labs(fill = "Survived")

### Exam  ticket
summary(data\$Ticket)
length(unique(data\$Ticket))
str(data\$Ticket)
which(is.na(data\$Ticket))

# Plot it value
ggplot(data[1:891,], aes(x = Ticket)) +
geom_bar() +
xlab("Ticket") +
ylab("Total Count")

# Plot on the survive on Ticket
ggplot(data[1:891,], aes(x = Ticket, fill = Survived)) +
geom_bar() +
xlab("Ticket") +
ylab("Total Count")  +
labs(fill = "Survived")

### Examine Fare
summary(data\$Fare)
length(unique(data\$Fare))

# Can't make fare a factor, treat as numeric & visualize with histogram
ggplot(data, aes(x = Fare)) +
geom_histogram(binwidth = 5) +
ggtitle("Fare Distribution") +
xlab("Fare") +
ylab("Total Count") +
ylim(0,200)

# Let's check to see if fare has predictive power
ggplot(data[1:891,], aes(x = Fare, fill = Survived)) +
geom_histogram(binwidth = 5) +
xlab("fare") +
ylab("Total Count") +
ylim(0,50) +
labs(fill = "Survived")

# Explore Fare distribution between train and test to see if they are overlapped?
ggplot(train, aes(x = Fare)) +
geom_histogram(binwidth = 5) +
ggtitle("Fare Distribution") +
xlab("Fare") +
ylab("Total Count") +
ylim(0,200)

ggplot(test, aes(x = Fare)) +
geom_histogram(binwidth = 5) +
ggtitle("Fare Distribution") +
xlab("Fare") +
ylab("Total Count") +
ylim(0,200)

### Cabin
# Examine cabin values
str(data\$Cabin)
# Cabin really isn't a factor, make a string and the display first 100
data\$Cabin <- as.character(data\$Cabin)
data\$Cabin[1:100]

# Find number of the missing value
table(train[which(train\$Cabin ==""), "Cabin"])

# Analysis of the cabin variable
str(data\$Cabin)

# Cabin really isn't a factor, make a string and the display first 100
data\$Cabin <- as.character(data\$Cabin)
data\$Cabin[1:100]

# Find out number of the missing value in the train
train\$Cabin <- as.character(train\$Cabin)
table(train[which(train\$Cabin ==""), "Cabin"])

# Replace empty cabins with a "U"
#data[which(data\$Cabin == ""), "Cabin"] <- "U"
data\$Cabin[1:100]

# Take a look at just the first char as a factor
cabin.first.char <- as.factor(substr(data\$Cabin, 1, 1))
str(cabin.first.char)
levels(cabin.first.char)

# Add to combined data set and plot
data\$cabin.first.char <- cabin.first.char

# High level plot
ggplot(data[1:891,], aes(x = cabin.first.char, fill = Survived)) +
geom_bar() +
ggtitle("Survivability by cabin.first.char") +
xlab("cabin.first.char") +
ylab("Total Count") +
ylim(0,750) +
labs(fill = "Survived")

### Examine Embark
str(data\$Embarked)
summary(data\$Embarked)

# Plot Embarked data distribution and the Survived data over it
ggplot(data, aes(x = Embarked)) +
geom_bar(width=0.5) +
xlab("Passenger embarked port") +
ylab("Total Count")

ggplot(data[1:891,], aes(x = Embarked, fill = Survived)) +
geom_bar(width=0.5) +
xlab("embarked") +
ylab("Total Count") +
labs(fill = "Survived")

##Calculate death RATE distribution over Embarked port with Train data
# We use table in R, you can check with ?table. A good example is
# mytable <- table(A,B) # A will be rows, B will be columns
# mytable # print table

# margin.table(mytable, 1) # A frequencies (summed over B)
# margin.table(mytable, 2) # B frequencies (summed over A)

# prop.table(mytable) # cell percentages
# prop.table(mytable, 1) # row percentages
# prop.table(mytable, 2) # column percentages

# We need prop.table to get column percentage which is the survived

# creat Embarked and Survived contingency table
SurviveOverEmbarkedTable <- table(train\$Embarked, train\$Survived)
# Death-0/survived-1 value distribution (percentage) based on embarked ports
# prop.table(mytable, 2) give us column (Survived) percentages
Deathandsurvivepercentage <- prop.table(SurviveOverEmbarkedTable, 2)
# Plot
M <- c("c-Cherbourg", "Q-Queenstown", "S-Southampton")
barplot(Deathandsurvivepercentage[2:4,1]*100, xlab =(""), ylim=c(0,100), ylab="Death distribution in percentage %",  names.arg = M, col="steelblue", main="Death distribution", border="black", beside=TRUE)
barplot(Deathandsurvivepercentage[2:4,2]*100, xlab =(""), ylim=c(0,100), ylab="Death distribution in percentage %",  names.arg = M, col="blue", main="Death distribution", border="black", beside=TRUE)

## Calculate survived RATE distribution based on embarked ports
# Death-0/survived-1 value distribution (percentage) based on embarked ports
# prop.table(mytable, 1) give us row (Port) percentages
# col-1 (Survived=0, perished) and col-2 (Survived =1, survived)
DeathandsurviveRateforeachport <- prop.table(SurviveOverEmbarkedTable, 1)
#plot
barplot(Deathandsurvivepercentage[2:4,1]*100, xlab =(""), ylim=c(0,100), ylab="Death rate in percentage %",  names.arg = M, col="red", main="Death rate comparison among mebarked ports", border="black", beside=TRUE)

##END 1 ######################################################