Code
library(MASS)
library(mvtnorm)
library(ggplot2)
library(pracma)
library(rTensor)
Attaching package: 'rTensor'
The following object is masked from 'package:pracma':
fnorm
Code
# Function for Warm Start Using Simulated Data and Tucker Decomposition
warm_gp_initialization <- function(Y, g_i, n_topics, length_scales_lambda_sim, var_scales_lambda_sim, length_scales_phi_sim, var_scales_phi_sim) {
N <- dim(Y)[1] # Number of individuals
D <- dim(Y)[2] # Number of diseases
T <- dim(Y)[3] # Number of time points
P <- ncol(g_i) # Number of genetic covariates
# Compute mu_d from Y as the exp(colMeans) at each time point
mu_d <- array(0, dim = c(D, T))
for (d in 1:D) {
mu_d[d, ] <- exp(colMeans(Y[, d, ])) # exp(colMeans of Y for each disease)
}
# Convert Y to logit scale, with smoothing to avoid infinite values
Y_smooth <- (Y * (N * D * T - 1) + 0.5) / (N * D * T)
Y_logit <- log(Y_smooth / (1 - Y_smooth))
# Center Y_logit by subtracting mu_d (already log-transformed prevalence)
Y_centered <- array(0, dim = c(N, D, T))
for (d in 1:D) {
Y_centered[, d, ] <- Y_logit[, d, ] - matrix(mu_d[d, ], nrow = N, ncol = T, byrow = TRUE)
}
# Perform Tucker decomposition on Y_centered
Y_tensor <- as.tensor(Y_centered)
tucker_result <- tucker(Y_tensor, ranks = c(n_topics, D, T))
# Initialize Lambda (N x K x T)
Lambda_init <- array(0, dim = c(N, n_topics, T))
for (k in 1:n_topics) {
Lambda_init[, k, ] <- tucker_result$U[[1]][, k] %o% tucker_result$U[[3]][, k]
}
# Initialize Phi (K x D x T)
Phi_init <- array(0, dim = c(n_topics, D, T))
for (k in 1:n_topics) {
Phi_init[k, , ] <- tucker_result$U[[2]][, k] %o% tucker_result$U[[3]][, k]
}
# Initialize Gamma using a random normal prior
Gamma_init <- matrix(rnorm(n_topics * P, mean = 0, sd = 0.1), nrow = n_topics, ncol = P)
# Use the length scales and variance scales from the simulation for Lambda and Phi
length_scales_lambda <- length_scales_lambda_sim
var_scales_lambda <- var_scales_lambda_sim
length_scales_phi <- length_scales_phi_sim
var_scales_phi <- var_scales_phi_sim
# Return the initialized values
return(list(
Lambda = Lambda_init,
Phi = Phi_init,
Gamma = Gamma_init,
mu_d = mu_d, # Return mu_d as well since it's used
length_scales_lambda = length_scales_lambda,
var_scales_lambda = var_scales_lambda,
length_scales_phi = length_scales_phi,
var_scales_phi = var_scales_phi
))
}
####
# Assuming length_scales_lambda_sim, var_scales_lambda_sim, etc., come from sim_noulli.R
source("sim_noulli.R")
Attaching package: 'Matrix'
The following objects are masked from 'package:pracma':
expm, lu, tril, triu
Code
# Simulate data
length_scales_lambda_sim <- length_scales_lambda
var_scales_lambda_sim <- var_scales_lambda
length_scales_phi_sim <- length_scales_phi
var_scales_phi_sim <- var_scales_phi
# Call warm initialization function
initial_values <- warm_gp_initialization(y, g_i, n_topics = 3,
length_scales_lambda_sim, var_scales_lambda_sim,
length_scales_phi_sim, var_scales_phi_sim)
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======================================================================| 100%
Code
###
# GP prior function for a single topic k
log_gp_prior_vec <- function(eta, mean, K_inv, log_det_K) {
T <- length(eta)
centered_eta <- eta - mean # Center eta by the mean
# Compute quadratic form for GP prior
quad_form <- sum(centered_eta * (K_inv %*% centered_eta))
# Calculate the log prior (Gaussian distribution with covariance matrix K)
log_prior <- -0.5 * (log_det_K + quad_form + T * log(2 * pi))
return(log_prior)
}
###
# Fast logistic function for numerical stability
fast_logistic <- function(x) {
1 / (1 + exp(-x))
}
# Log-likelihood function for Y, Lambda, Phi, and mu_d
log_likelihood_vec <- function(y, Lambda, Phi, mu_d) {
n_individuals <- dim(Lambda)[1]
n_topics <- dim(Lambda)[2]
T <- dim(Lambda)[3]
n_diseases <- dim(Phi)[2]
# Initialize logit probability array
logit_pi <- array(0, dim = c(n_individuals, n_diseases, T))
# Calculate logit(pi) based on Lambda, Phi, and mu_d
for (k in 1:n_topics) {
for (t in 1:T) {
logit_pi[, , t] <- logit_pi[, , t] + Lambda[, k, t] %*% t(Phi[k, , t])
}
}
# Add mu_d (logit prevalence) to the logit_pi values
for (d in 1:n_diseases) {
logit_pi[, d, ] <- logit_pi[, d, ] + matrix(mu_d[d, ], nrow = n_individuals, ncol = T, byrow = TRUE)
}
# Apply the logistic function to get probabilities (pi)
pi <- array(fast_logistic(as.vector(logit_pi)), dim = dim(logit_pi))
# Calculate log-likelihood
log_lik <- 0
for (i in 1:n_individuals) {
for (d in 1:n_diseases) {
# Identify time points where the disease hasn't been diagnosed yet (at risk)
at_risk <- which(cumsum(y[i, d, ]) == 0)
if (length(at_risk) > 0) {
event_time <- max(at_risk) + 1 # The first diagnosed time point (if any)
if (event_time <= T) {
log_lik <- log_lik + log(pi[i, d, event_time])
}
log_lik <- log_lik + sum(log(1 - pi[i, d, at_risk]))
} else {
log_lik <- log_lik + log(pi[i, d, 1]) # If disease is diagnosed at time 1
}
}
}
return(log_lik)
}
##
precompute_K_inv <- function(T, length_scale, var_scale) {
time_diff_matrix <- outer(1:T, 1:T, "-") ^ 2
K <- var_scale * exp(-0.5 * time_diff_matrix / length_scale ^ 2)
K <- K + diag(1e-6, T)
K_inv <- solve(K)
log_det_K <- determinant(K, logarithm = TRUE)$modulus
return(list(K_inv = K_inv, log_det_K = log_det_K))
}
###
mcmc_sampler_softplus <- function(y, mu_d_logit, g_i, n_iterations, initial_values) {
current_state <- initial_values
n_individuals <- dim(current_state$Lambda)[1]
n_topics <- dim(current_state$Lambda)[2]
T <- dim(current_state$Lambda)[3]
n_diseases <- dim(current_state$Phi)[2]
# Initialize proposal standard deviations
adapt_sd <- list(
Lambda = array(0.01, dim = dim(current_state$Lambda)),
Phi = array(0.01, dim = dim(current_state$Phi)),
Gamma = matrix(0.01, nrow = nrow(current_state$Gamma), ncol = ncol(current_state$Gamma))
)
# Initialize storage for samples
samples <- list(
Lambda = array(0, dim = c(n_iterations, dim(current_state$Lambda))),
Phi = array(0, dim = c(n_iterations, dim(current_state$Phi))),
Gamma = array(0, dim = c(n_iterations, dim(current_state$Gamma)))
)
# Precompute inverse covariance matrices for Lambda and Phi
K_inv_lambda <- lapply(1:n_topics, function(k)
precompute_K_inv(T, current_state$length_scales_lambda[k], current_state$var_scales_lambda[k]))
K_inv_phi <- lapply(1:n_topics, function(k)
precompute_K_inv(T, current_state$length_scales_phi[k], current_state$var_scales_phi[k]))
for (iter in 1:n_iterations) {
# ------ Update Lambda ------
proposed_Lambda <- current_state$Lambda +
array(rnorm(prod(dim(current_state$Lambda)), 0, adapt_sd$Lambda),
dim = dim(current_state$Lambda))
# Likelihood calculation
current_log_lik <- log_likelihood_vec(y, current_state$Lambda, current_state$Phi, mu_d_logit)
proposed_log_lik <- log_likelihood_vec(y, proposed_Lambda, current_state$Phi, mu_d_logit)
# Compute log prior for Lambda (GP prior)
current_log_prior <- sum(sapply(1:n_individuals, function(i) {
sapply(1:n_topics, function(k) {
log_gp_prior_vec(
current_state$Lambda[i, k, ],
rep(g_i[i, ] %*% current_state$Gamma[k, ], T),
K_inv_lambda[[k]]$K_inv,
K_inv_lambda[[k]]$log_det_K
)
})
}))
proposed_log_prior <- sum(sapply(1:n_individuals, function(i) {
sapply(1:n_topics, function(k) {
log_gp_prior_vec(
proposed_Lambda[i, k, ],
rep(g_i[i, ] %*% current_state$Gamma[k, ], T),
K_inv_lambda[[k]]$K_inv,
K_inv_lambda[[k]]$log_det_K
)
})
}))
# Compute acceptance ratio for Lambda
log_accept_ratio <- (proposed_log_lik + proposed_log_prior) - (current_log_lik + current_log_prior)
if (log(runif(1)) < log_accept_ratio) {
current_state$Lambda <- proposed_Lambda
adapt_sd$Lambda <- adapt_sd$Lambda * 2
} else {
adapt_sd$Lambda <- adapt_sd$Lambda * 0.90
}
# ------ Update Phi (with log-prior for GP) ------
proposed_Phi <- current_state$Phi +
array(rnorm(prod(dim(current_state$Phi)), 0, adapt_sd$Phi), dim = dim(current_state$Phi))
# Likelihood calculation for proposed Phi
current_log_lik <- log_likelihood_vec(y, current_state$Lambda, current_state$Phi, mu_d_logit)
proposed_log_lik <- log_likelihood_vec(y, current_state$Lambda, proposed_Phi, mu_d_logit)
# Compute log prior for Phi (using GP prior)
current_log_prior_phi <- sum(sapply(1:n_topics, function(k) {
sapply(1:n_diseases, function(d) {
log_gp_prior_vec(current_state$Phi[k, d, ], 0, K_inv_phi[[k]]$K_inv, K_inv_phi[[k]]$log_det_K)
})
}))
proposed_log_prior_phi <- sum(sapply(1:n_topics, function(k) {
sapply(1:n_diseases, function(d) {
log_gp_prior_vec(proposed_Phi[k, d, ], 0, K_inv_phi[[k]]$K_inv, K_inv_phi[[k]]$log_det_K)
})
}))
# Compute acceptance ratio for Phi
log_accept_ratio <- (proposed_log_lik + proposed_log_prior_phi) - (current_log_lik + current_log_prior_phi)
if (log(runif(1)) < log_accept_ratio) {
current_state$Phi <- proposed_Phi
adapt_sd$Phi <- adapt_sd$Phi * 2
} else {
adapt_sd$Phi <- adapt_sd$Phi * 0.90
}
# ------ Update Gamma ------
proposed_Gamma <- current_state$Gamma +
matrix(rnorm(prod(dim(current_state$Gamma)), 0, adapt_sd$Gamma), nrow = nrow(current_state$Gamma))
current_log_prior <- sum(dnorm(current_state$Gamma, 0, 1, log = TRUE))
proposed_log_prior <- sum(dnorm(proposed_Gamma, 0, 1, log = TRUE))
log_accept_ratio <- proposed_log_prior - current_log_prior
if (log(runif(1)) < log_accept_ratio) {
current_state$Gamma <- proposed_Gamma
adapt_sd$Gamma <- adapt_sd$Gamma * 2
} else {
adapt_sd$Gamma <- adapt_sd$Gamma * 0.90
}
# Store samples
samples$Lambda[iter, , , ] <- current_state$Lambda
samples$Phi[iter, , , ] <- current_state$Phi
samples$Gamma[iter, , ] <- current_state$Gamma
# Print progress
if (iter %% 100 == 0) cat("Iteration", iter, "\n")
}
return(samples)
}
# Run the MCMC with softplus constraint
n_iterations <- 5000
samples <- mcmc_sampler_softplus(y, initial_values$mu_d, g_i, n_iterations, initial_values)Iteration 100
Iteration 200
Iteration 300
Iteration 400
Iteration 500
Iteration 600
Iteration 700
Iteration 800
Iteration 900
Iteration 1000
Iteration 1100
Iteration 1200
Iteration 1300
Iteration 1400
Iteration 1500
Iteration 1600
Iteration 1700
Iteration 1800
Iteration 1900
Iteration 2000
Iteration 2100
Iteration 2200
Iteration 2300
Iteration 2400
Iteration 2500
Iteration 2600
Iteration 2700
Iteration 2800
Iteration 2900
Iteration 3000
Iteration 3100
Iteration 3200
Iteration 3300
Iteration 3400
Iteration 3500
Iteration 3600
Iteration 3700
Iteration 3800
Iteration 3900
Iteration 4000
Iteration 4100
Iteration 4200
Iteration 4300
Iteration 4400
Iteration 4500
Iteration 4600
Iteration 4700
Iteration 4800
Iteration 4900
Iteration 5000
Code
library(ggplot2)
# Flatten the Lambda array for plotting (choose an individual i, topic k, and time point t)
lambda_samples <- samples$Lambda[, 1, 1, 1] # Example: Lambda[1,1,1] over iterations
# Create a data frame for ggplot
lambda_df <- data.frame(iteration = 1:length(lambda_samples), value = lambda_samples)
# Plot trace of Lambda
ggplot(lambda_df, aes(x = iteration, y = value)) +
geom_line() +
labs(title = "Trace Plot for Lambda[1,1,1]", x = "Iteration", y = "Lambda value")Code
phi_samples <- samples$Phi[, 1, 1, 1] # Example: Phi[1,1,1] over iterations
phi_df <- data.frame(iteration = 1:length(phi_samples), value = phi_samples)
ggplot(phi_df, aes(x = iteration, y = value)) +
geom_line() +
labs(title = "Trace Plot for Phi[1,1,1]", x = "Iteration", y = "Phi value")Code
# Autocorrelation plot for Lambda[1,1,1]
acf(lambda_samples, main = "Autocorrelation for Lambda[1,1,1]")
library(coda)
# Calculate ESS for Lambda[1,1,1]
ess_lambda <- effectiveSize(lambda_samples)
print(paste("Effective Sample Size for Lambda[1,1,1]:", ess_lambda))[1] "Effective Sample Size for Lambda[1,1,1]: 1.0940118445441"
Code
# Extract the final sampled Lambda and Phi values (or the posterior mean)
final_Lambda <- samples$Lambda[n_iterations, , , ]
final_Phi <- samples$Phi[n_iterations, , , ]
# Initialize the logit probabilities array
logit_pi_hat <- array(0, dim = c(100, 5, T))
# Calculate logit(pi_hat) based on final Lambda and Phi
for (k in 1:3) {
for (t in 1:T) {
logit_pi_hat[, , t] <- logit_pi_hat[, , t] + final_Lambda[, k, t] %*% t(final_Phi[k, , t])
}
}
n_diseases=5
# Add the baseline prevalence (mu_d) back to logit_pi_hat
for (d in 1:n_diseases) {
logit_pi_hat[, d, ] <- logit_pi_hat[, d, ] + matrix(initial_values$mu_d[d, ], nrow = 100, ncol = T, byrow = TRUE)
}
# Transform logit(pi_hat) to probabilities using the logistic function
pi_hat <- 1 / (1 + exp(-logit_pi_hat))