Chapter 10 Estimating Beta

Now plot est_beta versus real:

# Assuming you have the beta array and diags list available
# Assuming you have the beta array (D x K x V x T) and individual diagnosis data (diags)

# Calculate average theta for each topic across all individuals
average_theta <- apply(Theta, 2, mean)  # Assuming Theta is D x K x T

# Calculate weighted population beta trajectories

marginal_beta=array(dim=c(D,V,T))

for (v in 1:V) {
  for (t in 1:T) {
    for (d in 1:D) {
    
    marginal_beta[d,v, t] <-  t(Theta[d,,t])%*%beta[d, , v, t] 
  }
  }}


## weight across individuals so we have the average for everage disease and time
weighted_population_beta=apply(marginal_beta,c(2,3),mean)

# Determine individual disease onset times, create a matrix of Disease  * time
individual_onset_times =matrix(0,nrow=T,ncol=V)
  for(d in 1:D){
      for(v in 1:V){
    # Find the first time point at which the disease v is diagnosed for individual d
    onset_times <- sapply(diags[[d]], function(diag_time) v %in% diag_time)
    if (any(onset_times)) {
      individual_onset_times[which(onset_times),v]=individual_onset_times[which(onset_times),v]+1  # Return the first time point of diagnosis
    } else {
      individual_onset_times[which(onset_times),v]=individual_onset_times[which(onset_times),v]  # No diagnosis for this disease
    }}}

individual_onset_times=apply(individual_onset_times,1,function(x){x/sum(x)})


# Compare individual onset times to weighted population trajectories
# Determine individual disease onset times, create a matrix of Disease  * time
individual_onset_times_first <- sapply(1:D, function(d) {
  sapply(1:V, function(v) {
    # Find the first time point at which the disease v is diagnosed for individual d
    onset_times <- sapply(diags[[d]], function(diag_time) v %in% diag_time)
    if (any(onset_times)) {
      return(which(onset_times)[1])  # Return the first time point of diagnosis
    } else {
      return(NA)  # No diagnosis for this disease
    }
  })
})



# Function to plot population trajectories
plot_population_trajectories <- function(population_beta) {
  pop_beta_melt <- melt(t(population_beta))
  colnames(pop_beta_melt) <- c("Time", "Disease", "Probability")
  ggplot(pop_beta_melt, aes(x = Time, y = Probability, color = as.factor(Disease), group = Disease)) +
    geom_line() +
    theme_minimal() +
    labs(title = "Marginal Disease Probability Over Time for Population",
         x = "Time",
         y = "Marginal Disease Probability",
         color = "Disease")
}

library(ggplot2)
library(reshape2)


# Function to plot density of individual onset times along with population trajectories
plot_density_onset <- function(individual_onset_times, population_beta) {
  pop_beta_melt <- melt(t(population_beta))
  colnames(pop_beta_melt) <- c("Time","Disease","Probability")
  onset_melt <- melt(individual_onset_times)
  colnames(onset_melt) <- c("Disease","Time","Count")
  
  onset_melt <- onset_melt[!is.na(onset_melt$Count),]  # Remove NA values

  # Plot population trajectories
  p1 <- ggplot(pop_beta_melt, aes(x = Time, y = Probability, col = as.factor(Disease), group = Disease)) +
    geom_line() +
    theme_minimal() +
    labs(title = "Marginal Disease Probability Over Time for Population with Onset Time Densities",
         x = "Age",
         y = "Marginal Disease Probability",
         color = "Disease")

  # Overlay density plots for individual onset times
  p2 <- ggplot(data = onset_melt, aes(x = Time, y=Count,col = as.factor(Disease),group=Disease))+geom_smooth(fill=NA) +
    scale_color_manual(values = scales::hue_pal()(length(unique(onset_melt$Disease)))) +
    guides(color = guide_legend(override.aes = list(shape = 1)), fill = guide_legend(title = "Disease  Density"))+labs(x="Age")
  
  
    onset_melt <- melt(individual_onset_times_first)
  colnames(onset_melt) <- c("Disease", "Individual", "Time")
  onset_melt <- onset_melt[!is.na(onset_melt$Time),]  # Remove NA values
  
    p3 <- ggplot(data = onset_melt, aes(x = Time, fill = as.factor(Disease), alpha = 10, adjust = 1.5, position = "identity")) +geom_density()+
    scale_fill_manual(values = scales::hue_pal()(length(unique(onset_melt$Disease)))) + labs(x="Age of First Onset")+
    guides(color = guide_legend(override.aes = list(shape = 1)), fill = guide_legend(title = "Disease Onset (first) Density"))
  

  
  
  
 ggpubr::ggarrange(p1,p2,p3,common.legend = TRUE,nrow=1)
}


library(ggpubr)
plot_density_onset(individual_onset_times = individual_onset_times, population_beta = weighted_population_beta)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

And here we try to estimate beta:

  diagnosis_counts <- array(0, dim = c(D, K, V, T))
  count_array <- array(0, dim = c(D, K, V, T))  # To keep track of the counts for averaging
  
  for (d in 1:D) {
    diagnoses_person <- diags[[d]]
    for (k in 1:K) {
      for (t in 1:T) {
        og_mapped_times <- which(warped_t_array[d, k, ] == t)  # Original times that map to current time t
        for (og_time in og_mapped_times) {
          diag_count <- unlist(diagnoses_person[t])
          for (w in diag_count) {
            diagnosis_counts[d, k, w, og_time] <- diagnosis_counts[d, k, w, og_time] + Theta[d, k, t]
            count_array[d, k, w, og_time] <- count_array[d, k, w, og_time] + 1
          }
        }
      }
    }
  }



## we want to make inference about popoulation beta
## plot eta with softmax and warping ....


normalize=function(x){x/sum(x)}
beta_est=apply(diagnosis_counts,c(2,3,4),sum)
for(k in 1:K){
  for(t in 1:T){
  beta_est[k,,t]=softmax_normalize(beta_est[k,,t])
  }}



true_beta=array(0,dim = c(K,V,T))
bias_value=max(eta)

  for (topic_idx in 1:K) {
    # Retrieve the diseases and their risk periods for this topic
    #specific_diseases_periods <- topic_disease_risk_periods[[topic_idx]]
    bias_increase <- disease_topic_matrix[,topic_idx]*bias_value
    for (time_idx in 1:T) {
        disease_values <- eta[topic_idx, , time_idx]

        dv <- disease_values+bias_increase
      
      # Apply softmax normalization
      normalized_disease <- softmax_normalize(dv)
      true_beta[topic_idx, , time_idx] <- normalized_disease
    }
  }

# Function to plot heatmaps
# Function to plot heatmaps
plot_heatmaps <- function(true_beta, est_beta,K, V, T, topic) {
  true_beta_df <- melt(true_beta[topic, , ])
  colnames(true_beta_df) <- c("Disease", "Time", "Probability")
  true_beta_df$Type <- "True"

  est_beta_df <- melt(est_beta[topic, , ])
  colnames(est_beta_df) <- c("Disease", "Time", "Probability")
  est_beta_df$Type <- "Estimated"

  combined_df <- rbind(true_beta_df, est_beta_df)

  p=ggplot(combined_df, aes(x = Time, y = Disease, fill = Probability)) +
    geom_tile() +
    facet_wrap(~ Type) +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = "Comparison of True and Estimated Beta for Topic", topic, "and Individual",
         x = "Time", y = "Disease") +
    theme_minimal()
  
  return(p)
}

# Plot heatmaps for a specific topic and individual (replace with your desired topic and individual)

plot_difference <- function(beta_est, beta_true, D, K, V, T) {
  par(mfrow = c(1, 1))
  
  # Plot difference between estimated and true beta
  image(t(abs(beta_est[1, 1, , 1] - beta_true[1, 1, , 1])), main = "Difference between Estimated and True Beta", xlab = "Diseases", ylab = "Time")
}

plot_heatmaps(true_beta, beta_est, K, V, T, topic = 1)

plot_heatmaps(true_beta, beta_est, K, V, T, topic = 2)

plot_heatmaps(true_beta, beta_est, K, V, T, topic = 3)