Chapter 8 Visualization

Now we can plot the simulated theta for a given individual

  • For the natural scale
pt=sample(x = D,size = 1)
matplot(t(alpha_individual[pt,,]),ylab=paste0("Alpha Prenorm",pt))

For the softmax true proprotions Theta:

matplot(t(Theta[pt,,]),ylab =paste0("True Theta",pt))

For the posterior means of the topic probabilities:

matplot(t(estimated_Theta[pt,,]),ylab=paste0("Estimated P(topic = K| D) for Pt ",pt))

matplot(t(gp_mean_array[pt,,]),ylab=paste0("GP Mean for pt ",pt,"Sig = ",sig))

Now let’s make some ggplots;

theta_patient1 <- t(Theta[pt, , ])
# Convert to a data frame and reshape
theta_patient1_df <- as.data.frame(theta_patient1)
colnames(theta_patient1_df) <- paste("Topic", 1:K, sep = "")
theta_patient1_df$Time <- 1:nrow(theta_patient1_df)

theta_patient1_long <- melt(theta_patient1_df, id.vars = "Time", variable.name = "Topic", value.name = "Proportion")
ggplot(theta_patient1_long, aes(x = Time, y = Proportion, fill = Topic,col=Topic)) +
  #geom_line()+
  geom_area(position = 'stack') +
  labs(x = "Time", y = "Topic Proportion", fill = "Topic") +
  ggtitle(paste0("Distribution of True Topic Proportions for Patient ",pt," Over Time"))

theta_patient1 <- t(estimated_Theta[pt, , ])
# Convert to a data frame and reshape
theta_patient1_df <- as.data.frame(theta_patient1)
colnames(theta_patient1_df) <- paste("Topic", 1:K, sep = "")
theta_patient1_df$Time <- 1:nrow(theta_patient1_df)
theta_patient1_long <- melt(theta_patient1_df, id.vars = "Time", variable.name = "Topic", value.name = "Proportion")

ggplot(theta_patient1_long, aes(x = Time, y = Proportion, fill = Topic)) +
  #geom_line()+
  geom_area(position = 'stack') +
  labs(x = "Time", y = "Topic Proportion", fill = "Topic") +
  ggtitle(paste0("Estimated Topics for Patient ",pt," Over Time"))

theta_patient1 <- t(gp_mean_array[pt, , ])

# Convert to a data frame and reshape
theta_patient1_df <- as.data.frame(theta_patient1)
colnames(theta_patient1_df) <- paste("Topic", 1:K, sep = "")
theta_patient1_df$Time <- 1:nrow(theta_patient1_df)
theta_patient1_long <- melt(theta_patient1_df, id.vars = "Time", variable.name = "Topic", value.name = "Proportion")
ggplot(theta_patient1_long, aes(x = Time, y = Proportion, fill = Topic,col=Topic)) +
  #geom_line()+
  geom_area(position = 'stack') +
  labs(x = "Time", y = "Topic Proportion", fill = "Topic") +
  ggtitle(paste0("GP Mean for Patient ",pt,"with Sigma ",sig,"Over Time"))

8.1 plot for all deviations

theta_deviation <- array(0,dim = c(K,T))
stupid_deviation <- array(0,dim = c(K,T))
prior_deviation <- array(0,dim = c(K,T))

  for (k in 1:K) {
      for (t in 2:T) {
       
       theta_deviation[k,t] <- mean((estimated_Theta[, k,t ] - Theta[, k, t])^2)
       stupid_deviation[k,t] <- mean((estimated_Theta[, k,t ] - rep(1/K,D))^2)
       ## here subtract first 
      prior_deviation[k,t] = mean((estimated_Theta[, k,t ] - Theta[, k, 1])^2)

}}





# Assuming the calculation loops are correctly populating the deviation arrays

# Combine your arrays into a single array for melting
combined_array <- array(c(theta_deviation, stupid_deviation, prior_deviation), dim = c( K, T, 3))

# Correct variable names and melt the array
long_deviations <- melt(combined_array, varnames = c("Topic", "Time", "Model"))

# Correct the model names in your melted data
long_deviations$Model <- factor(long_deviations$Model, levels = 1:3, labels = c("Aladyn", "Agnostic", "Initiation"))

# Visualization
ggplot(long_deviations, aes(x = Time, y = value, color = Model, group = Model)) +
  geom_line() +facet_wrap(~Topic)+
   # Adjust ncol based on your dataset
  labs(x = "Time", y = "Deviation from True Theta", color = "Model") +
  ggtitle("Deviations in Posterior Estimate of Theta_dkt")+theme_classic()

8.2 plotting over all

# Convert arrays to data frames


Theta_long <- melt(Theta, varnames = c("Individual", "Topic", "Time"), value.name = "Proportion") %>%
  mutate(Type = "True")

estimated_Theta_long <- melt(estimated_Theta, varnames = c("Individual", "Topic", "Time"), value.name = "Proportion") %>%
  mutate(Type = "Estimated")

gp_mean_long= melt(gp_mean_array, varnames = c("Individual", "Topic", "Time"), value.name = "Proportion") %>%
  mutate(Type = "GPMean")

# Combine the data
combined_data <- rbind(Theta_long, estimated_Theta_long,gp_mean_long)

# Compute the average proportion for each topic at each time point
average_proportions <- combined_data %>%
  group_by(Type, Topic, Time) %>%
  summarise(Avg_Proportion = mean(Proportion), .groups = 'drop')



# Plotting directly from the long-format data
ggplot(average_proportions, aes(x = Time, y = Avg_Proportion, #fill = as.factor(Topic),
                                col = as.factor(Topic))) +
  geom_point()+
  #geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~Type, scales = "free_y", nrow=1) +
  scale_x_continuous(breaks = seq(min(average_proportions$Time), max(average_proportions$Time), by = 10)) + # Adjust this line if necessary
  labs(title = "Average Topic Proportions Over Time", x = "Time Point", y = "Average Proportion", fill = "Topic",col="Topic") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))