3 Basic Exploratory Data Analysis

# Note: in real-world analysis, EDA is done before handling missing values.
# Import data with missing values.
# In colClasses, "NULL" skips the first column of the CSV file and NA lets
# read.csv() infer the type of each of the following 13 columns.

data <- read.csv(
  "data/CleanedData.csv",
  header = TRUE,
  colClasses = c("NULL", rep(NA, 13))
)
# Single Plot

# Subset the hepatitis (Category == 1) and healthy blood (Category == 0) rows
hepatitis <- subset(data, Category == 1)
healthyBlood <- subset(data, Category == 0)

# Overlay the AST density of the hepatitis rows (red) and the healthy blood
# rows (green) on one set of axes, with a centred title
ggplot() +
  geom_density(aes(x = AST), colour = "red", data = hepatitis) +
  geom_density(aes(x = AST), colour = "Green", data = healthyBlood) +
  ggtitle(" Density Plot of Category VS. AST") +
  theme(plot.title = element_text(hjust = 0.5))

3.1 Density Plot

Further density plots can be produced by adding more `geom_density` layers to the plot.

# Function for plotting density plots for several columns -- can also be done with cowplot

#' Draw per-column density plots comparing the two Category groups.
#'
#' For every column named in `column_vars`, overlays the density of the rows
#' with `Category == 1` (red) and the rows with `Category == 0` (green), then
#' arranges the resulting plots in a two-column grid.
#'
#' @param data A data frame with a `Category` column and the columns named in
#'   `column_vars`.
#' @param column_vars Character vector of column names to plot.
#' @return The result of `grid.arrange()`, which also draws the grid.
plot_against <- function(data, column_vars) {
  # The group split does not depend on the plotted column, so do it once.
  hepatitis <- filter(data, Category == 1)
  healthy_blood <- filter(data, Category == 0)

  # Build each plot inside its own function call. aes() captures
  # `.data[[col]]` lazily and only evaluates it when the plot is drawn; with a
  # plain for loop every plot would share one `col` variable and, by the time
  # grid.arrange() draws them, all panels would show the last column.
  plots <- lapply(column_vars, function(col) {
    ggplot() +
      geom_density(
        data = hepatitis,
        aes(x = .data[[col]], y = after_stat(density)),
        colour = "red"
      ) +
      geom_density(
        data = healthy_blood,
        aes(x = .data[[col]], y = after_stat(density)),
        colour = "green"
      ) +
      # Set the axis label eagerly so it names the actual column.
      labs(x = col) +
      ggtitle(paste("Density Plot of", col, "against Category")) +
      theme(plot.title = element_text(hjust = 0.5))
  })
  names(plots) <- column_vars

  grid.arrange(grobs = plots, ncol = 2)
}


plot_against(data, column_vars = names(data)[3:6])

3.2 Interactive Plot Among the columns

Explore the relationships between different columns by hovering over the data points or zooming in and out on the plot

#' Build an interactive scatter plot overlaying every pair of columns.
#'
#' Adds one marker trace per column pair (i < j, by position), with column i
#' on the x axis and column j on the y axis, all on a single set of axes.
#'
#' @param data A data frame; pairs are formed by column position.
#' @return A plotly object.
interactive_relationship_plot <- function(data) {
  p <- plot_ly(
    data,
    type = "scatter",
    mode = "markers",
    marker = list(size = 8)
  )

  col_names <- colnames(data)
  n_cols <- ncol(data)

  # seq_len() gives an empty sequence when there are fewer than two columns,
  # whereas 1:(n_cols - 1) would count down and index out of range.
  for (i in seq_len(max(n_cols - 1L, 0L))) {
    for (j in seq(i + 1L, n_cols)) {
      # Pass the column vectors themselves rather than formulas such as
      # ~data[, i]: plotly evaluates formulas when the plot is built, after
      # both loops have finished, so every trace would otherwise see the final
      # values of i and j and display the same pair of columns.
      p <- p %>% add_trace(
        x = data[[i]],
        y = data[[j]],
        # Name both columns; the y column alone is shared by several pairs.
        name = paste(col_names[j], "vs", col_names[i])
      )
    }
  }

  # All pairs share one set of axes, so each axis title must be one generic
  # string; the legend identifies the columns behind each trace.
  p %>% layout(
    xaxis = list(title = "x column of pair"),
    yaxis = list(title = "y column of pair"),
    title = "Interactive Scatter Plot Matrix",
    showlegend = TRUE
  )
}

interactive_relationship_plot(data)

3.3 Outlier Detection

The model is fitted on the healthy blood donors, and predictions were made on the Hepatitis C data. The IDs considered outliers are 206, 534, 537, 538, 539 and 540.

# Outlier detection with an isolation forest
library(isotree)

# Rows for Hepatitis C patients (Category == 1)
hep <- data[data$Category == 1, ]
# Rows for healthy blood donors (Category == 0)
healt.bd <- data[data$Category == 0, ]

# Ignore categorical variables: columns 1 and 3 are dropped from both subsets
# (presumably the categorical columns -- verify against the data).
# NOTE(review): the forest is fitted on the hepatitis rows and scored on the
# healthy donors, while the accompanying text describes the opposite
# direction -- confirm which one is intended.
fit.isoforest <- isolation.forest(hep[, -c(1, 3)])
pred <- predict(fit.isoforest, newdata = healt.bd[, -c(1, 3)])
## NULL

## [1] 534 535 537 538 539 540