3 Basic Exploratory Data Analysis
# Note: in a real-world analysis, EDA would be done before handling missing
# values.
# Import Data with Missing Values
# The "NULL" entry in colClasses drops the first column of the CSV; the
# remaining 13 columns keep their auto-detected types (NA = let R decide).
data <- read.csv(
  "data/CleanedData.csv",
  header = TRUE,
  colClasses = c("NULL", rep(NA, 13))
)

# Single Plot
# Subset Hepatitis (Category == 1) and Healthy Blood (Category == 0)
hepatitis <- subset(data, Category == 1)
healthyBlood <- subset(data, Category == 0)

# Hepatitis (red) and healthy blood (green) density curves for AST
ggplot() +
  geom_density(aes(x = AST), colour = "red", data = hepatitis) +
  geom_density(aes(x = AST), colour = "Green", data = healthyBlood) +
  ggtitle(" Density Plot of Category VS. AST") +
  theme(plot.title = element_text(hjust = 0.5))
3.1 Density Plot
Further groups can be compared on the same plot by adding more geom_density layers.
# Function for plotting density plots of several columns, split by Category.
# (The original note says this can also be done with another plot type --
# presumably a facet plot; TODO confirm.)
#
# Arguments:
#   data        data frame with a `Category` column (1 = hepatitis,
#               0 = healthy blood) and the columns named in `column_vars`.
#   column_vars character vector of column names to plot.
#
# Draws one density panel per column (hepatitis in red, healthy blood in
# green), arranged in a two-column grid, and returns the value of
# grid.arrange().
plot_against <- function(data, column_vars) {
  # The class subsets do not depend on the column, so compute them once.
  hepatitis <- filter(data, Category == 1)
  healthyBlood <- filter(data, Category == 0)

  # Build each plot inside its own function call. aes() is evaluated lazily
  # when the plot is rendered; with a plain for-loop every plot would look up
  # the loop variable after the loop has finished and draw the LAST column.
  density_plot <- function(col) {
    ggplot() +
      geom_density(
        data = hepatitis,
        aes(x = .data[[col]], y = after_stat(density)),
        colour = "red"
      ) +
      geom_density(
        data = healthyBlood,
        aes(x = .data[[col]], y = after_stat(density)),
        colour = "green"
      ) +
      ggtitle(paste("Density Plot of", col, "against Category")) +
      theme(plot.title = element_text(hjust = 0.5))
  }

  plots <- lapply(column_vars, density_plot)
  names(plots) <- column_vars

  grid.arrange(grobs = plots, ncol = 2)
}

plot_against(data, column_vars = names(data)[3:6])
3.2 Interactive Plot Among the columns
Explore the relationships between different columns by hovering over the data points or zooming in and out on the plot
# Build an interactive plotly scatter plot with one trace for every pair of
# columns (i, j), i < j, of `data`.
#
# Arguments:
#   data  data frame whose columns are plotted against each other.
#
# Returns the plotly object; printing it renders the interactive plot.
interactive_relationship_plot <- function(data) {
  p <- plot_ly(
    data,
    type = "scatter",
    mode = "markers",
    marker = list(size = 8)
  )

  # Create the scatter plot matrix
  n_cols <- ncol(data)
  # seq_len() avoids iterating over c(1, 0) when there is only one column.
  for (i in seq_len(max(n_cols - 1, 0))) {
    for (j in seq(i + 1, n_cols)) {
      # Pass the column vectors directly instead of formulas such as
      # ~data[, i]: plotly evaluates formulas only when the plot is built,
      # by which time i and j hold their final values, so every trace would
      # show the same pair of columns.
      p <- p %>% add_trace(
        x = data[[i]],
        y = data[[j]],
        name = colnames(data)[j]
      )
    }
  }

  # Set the axis labels
  # NOTE(review): the full vector of column names is passed as each axis
  # title, as in the original; confirm this renders as intended.
  axis_labels <- colnames(data)
  p <- p %>% layout(
    xaxis = list(title = axis_labels),
    yaxis = list(title = axis_labels),
    title = "Interactive Scatter Plot Matrix",
    showlegend = TRUE
  )

  p
}

interactive_relationship_plot(data)
3.3 Outlier Detection
The model is fitted on the healthy blood donors and predictions are made on the Hepatitis C data. The IDs considered outliers are 206, 534, 537, 538, 539 and 540.
# Outlier detection
library(isotree)

hep <- data[data$Category == 1, ]       # Data for Hepatitis C patients
healt.bd <- data[data$Category == 0, ]  # Data for healthy blood donors

# Ignore categorical variables (columns 1 and 3 are dropped)
# NOTE(review): the accompanying text says the model is fitted on healthy
# blood donors and scored on the Hepatitis C data, which is the reverse of
# what this code does (fit on `hep`, predict on `healt.bd`) -- confirm which
# direction is intended.
fit.isoforest <- isolation.forest(hep[, -c(1, 3)])
pred <- predict(fit.isoforest, newdata = healt.bd[, -c(1, 3)])
## NULL
## [1] 534 535 537 538 539 540