Code
library(MASS)
library(mvtnorm)
library(ggplot2)
library(pracma)
library(rTensor)
Attaching package: 'rTensor'
The following object is masked from 'package:pracma':
fnorm
Code
# Function for Warm Start Using Simulated Data and Tucker Decomposition
<- function(Y, g_i, n_topics, length_scales_lambda_sim, var_scales_lambda_sim, length_scales_phi_sim, var_scales_phi_sim) {
warm_gp_initialization <- dim(Y)[1] # Number of individuals
N <- dim(Y)[2] # Number of diseases
D <- dim(Y)[3] # Number of time points
T <- ncol(g_i) # Number of genetic covariates
P
# Compute mu_d from Y as the exp(colMeans) at each time point
<- array(0, dim = c(D, T))
mu_d for (d in 1:D) {
<- exp(colMeans(Y[, d, ])) # exp(colMeans of Y for each disease)
mu_d[d, ]
}
# Convert Y to logit scale, with smoothing to avoid infinite values
<- (Y * (N * D * T - 1) + 0.5) / (N * D * T)
Y_smooth <- log(Y_smooth / (1 - Y_smooth))
Y_logit
# Center Y_logit by subtracting mu_d (already log-transformed prevalence)
<- array(0, dim = c(N, D, T))
Y_centered for (d in 1:D) {
<- Y_logit[, d, ] - matrix(mu_d[d, ], nrow = N, ncol = T, byrow = TRUE)
Y_centered[, d, ]
}
# Perform Tucker decomposition on Y_centered
<- as.tensor(Y_centered)
Y_tensor <- tucker(Y_tensor, ranks = c(n_topics, D, T))
tucker_result
# Initialize Lambda (N x K x T)
<- array(0, dim = c(N, n_topics, T))
Lambda_init for (k in 1:n_topics) {
<- tucker_result$U[[1]][, k] %o% tucker_result$U[[3]][, k]
Lambda_init[, k, ]
}
# Initialize Phi (K x D x T)
<- array(0, dim = c(n_topics, D, T))
Phi_init for (k in 1:n_topics) {
<- tucker_result$U[[2]][, k] %o% tucker_result$U[[3]][, k]
Phi_init[k, , ]
}
# Initialize Gamma using a random normal prior
<- matrix(rnorm(n_topics * P, mean = 0, sd = 0.1), nrow = n_topics, ncol = P)
Gamma_init
# Use the length scales and variance scales from the simulation for Lambda and Phi
<- length_scales_lambda_sim
length_scales_lambda <- var_scales_lambda_sim
var_scales_lambda <- length_scales_phi_sim
length_scales_phi <- var_scales_phi_sim
var_scales_phi
# Return the initialized values
return(list(
Lambda = Lambda_init,
Phi = Phi_init,
Gamma = Gamma_init,
mu_d = mu_d, # Return mu_d as well since it's used
length_scales_lambda = length_scales_lambda,
var_scales_lambda = var_scales_lambda,
length_scales_phi = length_scales_phi,
var_scales_phi = var_scales_phi
))
}
####
# Assuming length_scales_lambda_sim, var_scales_lambda_sim, etc., come from sim_noulli.R
source("sim_noulli.R")
Attaching package: 'Matrix'
The following objects are masked from 'package:pracma':
expm, lu, tril, triu
Code
# Simulate data
<- length_scales_lambda
length_scales_lambda_sim <- var_scales_lambda
var_scales_lambda_sim <- length_scales_phi
length_scales_phi_sim <- var_scales_phi
var_scales_phi_sim
# Call warm initialization function
<- warm_gp_initialization(y, g_i, n_topics = 3,
initial_values
length_scales_lambda_sim, var_scales_lambda_sim, length_scales_phi_sim, var_scales_phi_sim)
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======================================================================| 100%
Code
###
# GP prior function for a single topic k
<- function(eta, mean, K_inv, log_det_K) {
log_gp_prior_vec <- length(eta)
T <- eta - mean # Center eta by the mean
centered_eta
# Compute quadratic form for GP prior
<- sum(centered_eta * (K_inv %*% centered_eta))
quad_form
# Calculate the log prior (Gaussian distribution with covariance matrix K)
<- -0.5 * (log_det_K + quad_form + T * log(2 * pi))
log_prior return(log_prior)
}
###
# Fast logistic function for numerical stability
<- function(x) {
fast_logistic 1 / (1 + exp(-x))
}
# Log-likelihood function for Y, Lambda, Phi, and mu_d
<- function(y, Lambda, Phi, mu_d) {
log_likelihood_vec <- dim(Lambda)[1]
n_individuals <- dim(Lambda)[2]
n_topics <- dim(Lambda)[3]
T <- dim(Phi)[2]
n_diseases
# Initialize logit probability array
<- array(0, dim = c(n_individuals, n_diseases, T))
logit_pi
# Calculate logit(pi) based on Lambda, Phi, and mu_d
for (k in 1:n_topics) {
for (t in 1:T) {
<- logit_pi[, , t] + Lambda[, k, t] %*% t(Phi[k, , t])
logit_pi[, , t]
}
}
# Add mu_d (logit prevalence) to the logit_pi values
for (d in 1:n_diseases) {
<- logit_pi[, d, ] + matrix(mu_d[d, ], nrow = n_individuals, ncol = T, byrow = TRUE)
logit_pi[, d, ]
}
# Apply the logistic function to get probabilities (pi)
<- array(fast_logistic(as.vector(logit_pi)), dim = dim(logit_pi))
pi
# Calculate log-likelihood
<- 0
log_lik for (i in 1:n_individuals) {
for (d in 1:n_diseases) {
# Identify time points where the disease hasn't been diagnosed yet (at risk)
<- which(cumsum(y[i, d, ]) == 0)
at_risk
if (length(at_risk) > 0) {
<- max(at_risk) + 1 # The first diagnosed time point (if any)
event_time if (event_time <= T) {
<- log_lik + log(pi[i, d, event_time])
log_lik
}<- log_lik + sum(log(1 - pi[i, d, at_risk]))
log_lik else {
} <- log_lik + log(pi[i, d, 1]) # If disease is diagnosed at time 1
log_lik
}
}
}
return(log_lik)
}##
<- function(T, length_scale, var_scale) {
precompute_K_inv <- outer(1:T, 1:T, "-") ^ 2
time_diff_matrix <- var_scale * exp(-0.5 * time_diff_matrix / length_scale ^ 2)
K <- K + diag(1e-6, T)
K <- solve(K)
K_inv <- determinant(K, logarithm = TRUE)$modulus
log_det_K return(list(K_inv = K_inv, log_det_K = log_det_K))
}
###
<- function(y, mu_d_logit, g_i, n_iterations, initial_values) {
mcmc_sampler_softplus
<- initial_values
current_state <- dim(current_state$Lambda)[1]
n_individuals <- dim(current_state$Lambda)[2]
n_topics <- dim(current_state$Lambda)[3]
T <- dim(current_state$Phi)[2]
n_diseases
# Initialize proposal standard deviations
<- list(
adapt_sd Lambda = array(0.01, dim = dim(current_state$Lambda)),
Phi = array(0.01, dim = dim(current_state$Phi)),
Gamma = matrix(0.01, nrow = nrow(current_state$Gamma), ncol = ncol(current_state$Gamma))
)
# Initialize storage for samples
<- list(
samples Lambda = array(0, dim = c(n_iterations, dim(current_state$Lambda))),
Phi = array(0, dim = c(n_iterations, dim(current_state$Phi))),
Gamma = array(0, dim = c(n_iterations, dim(current_state$Gamma)))
)
# Precompute inverse covariance matrices for Lambda and Phi
<- lapply(1:n_topics, function(k)
K_inv_lambda precompute_K_inv(T, current_state$length_scales_lambda[k], current_state$var_scales_lambda[k]))
<- lapply(1:n_topics, function(k)
K_inv_phi precompute_K_inv(T, current_state$length_scales_phi[k], current_state$var_scales_phi[k]))
for (iter in 1:n_iterations) {
# ------ Update Lambda ------
<- current_state$Lambda +
proposed_Lambda array(rnorm(prod(dim(current_state$Lambda)), 0, adapt_sd$Lambda),
dim = dim(current_state$Lambda))
# Likelihood calculation
<- log_likelihood_vec(y, current_state$Lambda, current_state$Phi, mu_d_logit)
current_log_lik <- log_likelihood_vec(y, proposed_Lambda, current_state$Phi, mu_d_logit)
proposed_log_lik
# Compute log prior for Lambda (GP prior)
<- sum(sapply(1:n_individuals, function(i) {
current_log_prior sapply(1:n_topics, function(k) {
log_gp_prior_vec(
$Lambda[i, k, ],
current_staterep(g_i[i, ] %*% current_state$Gamma[k, ], T),
$K_inv,
K_inv_lambda[[k]]$log_det_K
K_inv_lambda[[k]]
)
})
}))
<- sum(sapply(1:n_individuals, function(i) {
proposed_log_prior sapply(1:n_topics, function(k) {
log_gp_prior_vec(
proposed_Lambda[i, k, ],rep(g_i[i, ] %*% current_state$Gamma[k, ], T),
$K_inv,
K_inv_lambda[[k]]$log_det_K
K_inv_lambda[[k]]
)
})
}))
# Compute acceptance ratio for Lambda
<- (proposed_log_lik + proposed_log_prior) - (current_log_lik + current_log_prior)
log_accept_ratio
if (log(runif(1)) < log_accept_ratio) {
$Lambda <- proposed_Lambda
current_state$Lambda <- adapt_sd$Lambda * 2
adapt_sdelse {
} $Lambda <- adapt_sd$Lambda * 0.90
adapt_sd
}
# ------ Update Phi (with log-prior for GP) ------
<- current_state$Phi +
proposed_Phi array(rnorm(prod(dim(current_state$Phi)), 0, adapt_sd$Phi), dim = dim(current_state$Phi))
# Likelihood calculation for proposed Phi
<- log_likelihood_vec(y, current_state$Lambda, current_state$Phi, mu_d_logit)
current_log_lik <- log_likelihood_vec(y, current_state$Lambda, proposed_Phi, mu_d_logit)
proposed_log_lik
# Compute log prior for Phi (using GP prior)
<- sum(sapply(1:n_topics, function(k) {
current_log_prior_phi sapply(1:n_diseases, function(d) {
log_gp_prior_vec(current_state$Phi[k, d, ], 0, K_inv_phi[[k]]$K_inv, K_inv_phi[[k]]$log_det_K)
})
}))
<- sum(sapply(1:n_topics, function(k) {
proposed_log_prior_phi sapply(1:n_diseases, function(d) {
log_gp_prior_vec(proposed_Phi[k, d, ], 0, K_inv_phi[[k]]$K_inv, K_inv_phi[[k]]$log_det_K)
})
}))
# Compute acceptance ratio for Phi
<- (proposed_log_lik + proposed_log_prior_phi) - (current_log_lik + current_log_prior_phi)
log_accept_ratio
if (log(runif(1)) < log_accept_ratio) {
$Phi <- proposed_Phi
current_state$Phi <- adapt_sd$Phi * 2
adapt_sdelse {
} $Phi <- adapt_sd$Phi * 0.90
adapt_sd
}
# ------ Update Gamma ------
<- current_state$Gamma +
proposed_Gamma matrix(rnorm(prod(dim(current_state$Gamma)), 0, adapt_sd$Gamma), nrow = nrow(current_state$Gamma))
<- sum(dnorm(current_state$Gamma, 0, 1, log = TRUE))
current_log_prior <- sum(dnorm(proposed_Gamma, 0, 1, log = TRUE))
proposed_log_prior
<- proposed_log_prior - current_log_prior
log_accept_ratio
if (log(runif(1)) < log_accept_ratio) {
$Gamma <- proposed_Gamma
current_state$Gamma <- adapt_sd$Gamma * 2
adapt_sdelse {
} $Gamma <- adapt_sd$Gamma * 0.90
adapt_sd
}
# Store samples
$Lambda[iter, , , ] <- current_state$Lambda
samples$Phi[iter, , , ] <- current_state$Phi
samples$Gamma[iter, , ] <- current_state$Gamma
samples
# Print progress
if (iter %% 100 == 0) cat("Iteration", iter, "\n")
}
return(samples)
}
# Run the MCMC with softplus constraint
<- 5000
n_iterations <- mcmc_sampler_softplus(y, initial_values$mu_d, g_i, n_iterations, initial_values) samples
Iteration 100
Iteration 200
Iteration 300
Iteration 400
Iteration 500
Iteration 600
Iteration 700
Iteration 800
Iteration 900
Iteration 1000
Iteration 1100
Iteration 1200
Iteration 1300
Iteration 1400
Iteration 1500
Iteration 1600
Iteration 1700
Iteration 1800
Iteration 1900
Iteration 2000
Iteration 2100
Iteration 2200
Iteration 2300
Iteration 2400
Iteration 2500
Iteration 2600
Iteration 2700
Iteration 2800
Iteration 2900
Iteration 3000
Iteration 3100
Iteration 3200
Iteration 3300
Iteration 3400
Iteration 3500
Iteration 3600
Iteration 3700
Iteration 3800
Iteration 3900
Iteration 4000
Iteration 4100
Iteration 4200
Iteration 4300
Iteration 4400
Iteration 4500
Iteration 4600
Iteration 4700
Iteration 4800
Iteration 4900
Iteration 5000
Code
library(ggplot2)
# Flatten the Lambda array for plotting (choose an individual i, topic k, and time point t)
<- samples$Lambda[, 1, 1, 1] # Example: Lambda[1,1,1] over iterations
lambda_samples
# Create a data frame for ggplot
<- data.frame(iteration = 1:length(lambda_samples), value = lambda_samples)
lambda_df
# Plot trace of Lambda
ggplot(lambda_df, aes(x = iteration, y = value)) +
geom_line() +
labs(title = "Trace Plot for Lambda[1,1,1]", x = "Iteration", y = "Lambda value")
Code
<- samples$Phi[, 1, 1, 1] # Example: Phi[1,1,1] over iterations
phi_samples
<- data.frame(iteration = 1:length(phi_samples), value = phi_samples)
phi_df
ggplot(phi_df, aes(x = iteration, y = value)) +
geom_line() +
labs(title = "Trace Plot for Phi[1,1,1]", x = "Iteration", y = "Phi value")
Code
# Autocorrelation plot for Lambda[1,1,1]
acf(lambda_samples, main = "Autocorrelation for Lambda[1,1,1]")
library(coda)
# Calculate ESS for Lambda[1,1,1]
<- effectiveSize(lambda_samples)
ess_lambda print(paste("Effective Sample Size for Lambda[1,1,1]:", ess_lambda))
[1] "Effective Sample Size for Lambda[1,1,1]: 1.0940118445441"
Code
# Extract the final sampled Lambda and Phi values (or the posterior mean)
<- samples$Lambda[n_iterations, , , ]
final_Lambda <- samples$Phi[n_iterations, , , ]
final_Phi
# Initialize the logit probabilities array
<- array(0, dim = c(100, 5, T))
logit_pi_hat
# Calculate logit(pi_hat) based on final Lambda and Phi
for (k in 1:3) {
for (t in 1:T) {
<- logit_pi_hat[, , t] + final_Lambda[, k, t] %*% t(final_Phi[k, , t])
logit_pi_hat[, , t]
}
}
=5
n_diseases# Add the baseline prevalence (mu_d) back to logit_pi_hat
for (d in 1:n_diseases) {
<- logit_pi_hat[, d, ] + matrix(initial_values$mu_d[d, ], nrow = 100, ncol = T, byrow = TRUE)
logit_pi_hat[, d, ]
}
# Transform logit(pi_hat) to probabilities using the logistic function
<- 1 / (1 + exp(-logit_pi_hat)) pi_hat