Prediction
generate_predicted_diagnoses <- function(Theta, beta_est, D, K, V, T) {
predicted_diagnoses <- vector("list", D)
for (d in 1:D) {
predicted_diagnoses[[d]] <- vector("list", T)
for (t in 1:T) {
lambda_dt <- (t * 5) / T
Nd <- max(1, rpois(1, lambda = lambda_dt))
diagnoses_dt <- rep(NA, Nd)
for (n in 1:Nd) {
z <- sample(K, size = 1, prob = Theta[d, , t])
beta_tk <- beta_est[d, z, , t]
diagnoses_dt[n] <- sample(V, size = 1, prob = beta_tk)
}
predicted_diagnoses[[d]][[t]] <- diagnoses_dt
}
}
return(predicted_diagnoses)
}
calculate_performance_metrics <- function(true_diagnoses, predicted_diagnoses, D, T) {
true_positive <- 0
false_positive <- 0
false_negative <- 0
total <- 0
for (d in 1:D) {
for (t in 1:T) {
true_diagnoses_t <- true_diagnoses[[d]][[t]]
predicted_diagnoses_t <- predicted_diagnoses[[d]][[t]]
true_positive <- true_positive + sum(predicted_diagnoses_t %in% true_diagnoses_t)
false_positive <- false_positive + sum(!(predicted_diagnoses_t %in% true_diagnoses_t))
false_negative <- false_negative + sum(!(true_diagnoses_t %in% predicted_diagnoses_t))
total <- total + length(true_diagnoses_t)
}
}
accuracy <- true_positive / total
precision <- true_positive / (true_positive + false_positive)
recall <- true_positive / (true_positive + false_negative)
f1_score <- 2 * (precision * recall) / (precision + recall)
return(list(accuracy = accuracy, precision = precision, recall = recall, f1_score = f1_score))
}
# Generate predictions using time-dependent Theta
predicted_diagnoses_time_dependent <- generate_predicted_diagnoses(Theta, beta, D, K, V, T)
# Generate predictions using single Theta estimate
Theta_single <- apply(Theta, c(1, 2), mean)
Theta_single <- array(rep(Theta_single, T), dim = c(D, K, T))
predicted_diagnoses_single <- generate_predicted_diagnoses(Theta_single, beta, D, K, V, T)
# Calculate performance metrics
metrics_time_dependent <- calculate_performance_metrics(diags, predicted_diagnoses_time_dependent, D, T)
metrics_single <- calculate_performance_metrics(diags, predicted_diagnoses_single, D, T)
plot_performance_comparison <- function(metrics_time_dependent, metrics_single) {
metrics_df <- data.frame(
Metric = rep(c("Accuracy", "Precision", "Recall", "F1-Score"), 2),
Value = c(metrics_time_dependent$accuracy, metrics_time_dependent$precision, metrics_time_dependent$recall, metrics_time_dependent$f1_score,
metrics_single$accuracy, metrics_single$precision, metrics_single$recall, metrics_single$f1_score),
Estimate = rep(c("Time-Dependent Aladyn", "Singe Theta"), each = 4)
)
ggplot(metrics_df, aes(x = Metric, y = Value, col = Estimate)) +
geom_point() +
labs(title = "Performance Comparison", x = "Metric", y = "Value") +
theme_classic()
}
# Plot the performance comparison
plot_performance_comparison(metrics_time_dependent, metrics_single)
# Assuming you have the following variables defined:
# D, K, V, T: dimensions
# diags: observed diagnoses
# Theta: known topic distributions (time-dependent)
# beta_est: estimated beta values
# beta_true: true beta values for comparison
# Step 1: Generate predictions using time-dependent Theta
predicted_diagnoses_time_dependent <- generate_predicted_diagnoses(Theta, beta, D, K, V, T)
# Step 2: Generate predictions using single Theta estimate
Theta_single <- apply(Theta, c(1, 2), mean)
Theta_single <- array(rep(Theta_single, T), dim = c(D, K, T))
predicted_diagnoses_single <- generate_predicted_diagnoses(Theta_single, beta, D, K, V, T)
# Step 3: Calculate performance metrics
metrics_time_dependent <- calculate_performance_metrics(diags, predicted_diagnoses_time_dependent, D, T)
metrics_single <- calculate_performance_metrics(diags, predicted_diagnoses_single, D, T)
# Step 4: Plot the performance comparison
plot_performance_comparison(metrics_time_dependent, metrics_single)