Chapter 3 Generate sample GP

library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)
library(RColorBrewer)
library(MASS)
library(ggsci)
library(kernlab)

D = 500
K = 5
V = 50
T = 100
G = 20
sigma_epsilon = 0.1  # Individual error term standard deviation

# Placeholder for GP Predictions and Original Densities
gppreds <- list()
gp_fits <- list()
original_densities <- list()

# Sample data
set.seed(42)  # For reproducibility
m = readRDS("samplediseasemerged.rds")
num_sample = K
samp = sample(1:nrow(m), num_sample)
sampled_topic = m$exclude_name[samp]

# Define a common grid for x values
common_grid <- seq(30, max(m$age_diag), length.out = 100)
sigmasq_noise = 0.01
# Fit Gaussian Processes for each topic
for (i in 1:length(sampled_topic)) {
  # Select data for the specific disease
  disease_data <- m[m$exclude_name %in% sampled_topic[i],]
  
  # Calculate density
  density_data <- density(disease_data$age_diag)
  
  # Fit Gaussian Process
  gp_fit <- gausspr(x = density_data$x, y = density_data$y, kernel = "rbfdot", kpar = "automatic")
  gp_fits[[i]] <- gp_fit
   # Store original density and predictions
  original_densities[[i]] <- data.frame(x = density_data$x, y = density_data$y, phenotype = sampled_topic[i])
  
  y_pred=predict(gp_fit, common_grid)
  gppreds[[i]] <- data.frame(x = common_grid, y = y_pred, phenotype = sampled_topic[i])
}
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
# Combine all original densities and predictions into data frames
original_density_df <- do.call(rbind, original_densities)
gp_predictions <- do.call(rbind, gppreds)

# Plot using ggplot2
ggplot() +
  geom_line(data = original_density_df, aes(x = x, y = y, color = phenotype), linetype = "dashed") +
  geom_line(data = gp_predictions, aes(x = x, y = y, color = phenotype)) +
  labs(title = "Density Estimation with Gaussian Process",
       x = "Age at Diagnosis",
       y = "Density") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Simulate individual-specific topic weights
genetic_scores <- array(runif(n = D * K * G, min = -1, max = 1), dim = c(D, K, G))
gamma <- matrix(runif(K * G, min = -0.5, max = 0.5), nrow = K, ncol = G)
gen_effect <- matrix(NA, nrow = D, ncol = K)

for (d in 1:D) {
  for (k in 1:K) {
    for (g in 1:G) {
      gen_effect[d, ] <- plogis(genetic_scores[d, , ] %*% gamma[k, ])
    }
  }
}

logit_func <- function(x) { log(x / (1 - x)) }

# Define the individual-level temporal covariance function (RBF Kernel)
# Define the individual-level temporal covariance function (RBF Kernel) with variance scaling
rbf_kernel <- function(x, y, length_scale = 10, variance = 0.01) {
  variance * exp(-0.5 * (x - y)^2 / length_scale^2)
}


# Generate the temporal covariance matrix for the individual-level deviations
time_points <- seq(1, T)
K_eta <- outer(time_points, time_points, rbf_kernel)

topic_weights <- array(NA, dim = c(D, K, T))
topic_weights_norm <- array(NA, dim = c(D, K, T))
topic_weights_pop <- array(NA, dim = c(K, T))
topic_weights_norm_pop <- array(NA, dim = c(K, T))

sigmasq_noise = 0.001
for (k in 1:K) {
  gp_fit <- gp_fits[[k]]
  gp_mean <- predict(gp_fit, common_grid)
  #K_xx <- kernelMatrix(gp_fit@kernelf, common_grid)
  # K_x <- kernelMatrix(gp_fit@kernelf, observed_data$x, common_grid)
  # K_inv <- solve(kernelMatrix(gp_fit@kernelf, observed_data$x) + diag(sigmasq_noise, length(observed_data$x)))
  # 
  # mu_post <- K_x %*% K_inv %*% observed_data$y
  # K_post <- K_xx - K_x %*% K_inv %*% t(K_x)
  # 
  # Ensure GP mean is within probability bounds
  gp_mean <- pmax(gp_mean, 1e-5)
  
  for (i in 1:D) {
    # Draw individual-level deviations from the posterior distribution
    K_eta=diag(sigmasq_noise,T)
    individual_deviation <- mvrnorm(n = 1, mu = rep(0, T), Sigma = K_eta)
    #genetic_adjustment <- gamma_k * gen_effect[i, k]
    
    # Draw individual-level topic weights from the posterior distribution
    #topic_weights[i, k, ] <- mvrnorm(n = 1, mu = mu_post + genetic_adjustment, Sigma = K_post)
    #topic_weights_norm[i, k, ] <- plogis(topic_weights[i, k, ])

    topic_weights_pop[k, ] = logit_func(gp_mean)
    ## same as simulating from eta where eta|data~GP(mu_gp+gamma*g,K_posterior)
    topic_weights[i, k, ] <- logit_func(gp_mean) + gen_effect[i, k] + individual_deviation
    topic_weights_norm[i, k, ] <- plogis(logit_func(gp_mean) + gen_effect[i, k] + individual_deviation)
    topic_weights_norm_pop[k, ] <- plogis(logit_func(gp_mean))
  }
}

# Generate topic activity
Topic_activity <- array(NA, dim = c(D, K, T))
for (d in 1:D) {
  for (k in 1:K) {
    Topic_activity[d, k, ] <- rbinom(n = T, prob = topic_weights_norm[d, k, ], size = 1)
  }
}

Theta_individual <- topic_weights_norm
Theta_pop <- topic_weights_norm_pop

matplot(t(Theta_pop))

matplot(t(Theta_individual[which.max(gen_effect[, 1]), , ]), xlab = "Time", ylab = "High genetics on Topic 1")

matplot(t(Theta_individual[which.max(gen_effect[, 2]), , ]), xlab = "Time", ylab = "High genetics on Topic 2")

matplot(t(Theta_individual[which.max(gen_effect[, 3]), , ]), xlab = "Time", ylab = "High genetics on Topic 3")

D = 500
K = 5
V = 50
T = 100
G = 20

# Simulate individual-specific topic weights
genetic_scores <- array(runif(n = D * K * G, min = -1, max = 1), dim = c(D, K, G))
gamma <- matrix(runif(K * G, min = -0.5, max = 0.5), nrow = K, ncol = G)
gen_effect <- matrix(NA, nrow = D, ncol = K)


# Sample data
set.seed(42)  # For reproducibility
m = readRDS("samplediseasemerged.rds")
num_sample = K
samp = sample(1:nrow(m), num_sample)
sampled_topic = m$exclude_name[samp]

# Define a common grid for x values
common_grid <- seq(30, max(m$age_diag), length.out = 100)
sigmasq_noise <- 0.001

# Simulate individual-specific topic weights

for (d in 1:D) {
  for (k in 1:K) {
    for (g in 1:G) {
      gen_effect[d, ] <- plogis(genetic_scores[d, , ] %*% gamma[k, ])
    }
  }
}


# Placeholder for Gaussian Process fits and topic weights
gp_fits <- list()
topic_weights <- array(0, dim = c(D, K, T))
topic_weights_norm <- array(0, dim = c(D, K, T))

for (k in 1:K) {
  # Simulate some density data for fitting GP (replace this with your actual data)
  disease_data <- m[m$exclude_name %in% sampled_topic[k],]
  
  # Calculate density
  density_data <- density(disease_data$age_diag)
  
  # Fit Gaussian Process
  gp_fit <- gausspr(x = density_data$x, y = density_data$y, kernel = "rbfdot", kpar = "automatic")
  gp_fits[[k]] <- gp_fit
  
  # Extract kernel matrices
  K_train_train <- kernelMatrix(gp_fit@kernelf, density_data$x)
  K_train_common <- kernelMatrix(gp_fit@kernelf, density_data$x, common_grid)
  K_common_common <- kernelMatrix(gp_fit@kernelf, common_grid)
  
  # Noise variance
  sigma2_noise <- diag(sigmasq_noise, length(density_data$x))
  
  # Inverse of the noisy kernel matrix
  K_inv <- solve(K_train_train + sigma2_noise)
  
  # Posterior mean and covariance
  mu_post <- t(K_train_common) %*% K_inv %*% density_data$y
  K_post <- K_common_common - t(K_train_common) %*% K_inv %*% K_train_common
  
  # Ensure GP mean is within probability bounds
  mu_post <- pmax(mu_post, 1e-5)
  
  for (i in 1:D) {
    # Adjust posterior mean with genetic effects
    mu_post_adjusted <- logit_func(mu_post) + gen_effect[i, k]
    
    # Draw individual-level deviations from posterior covariance
    individual_deviation <- mvrnorm(n = 1, mu = rep(0, T), Sigma = K_post)
    
    # Combine to get the individual-level topic weights
    topic_weights_pop[k, ] = logit_func(mu_post)
    ## same as simulating from eta where eta|data~GP(mu_gp+gamma*g,K_posterior)

    topic_weights[i, k, ] <- mu_post_adjusted + individual_deviation
    topic_weights_norm[i, k, ] <- plogis(topic_weights[i, k, ])
  }
}
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel 
## Using automatic sigma estimation (sigest) for RBF or laplace kernel
Theta_individual <- topic_weights_norm
Theta_pop <- topic_weights_norm_pop

matplot(t(Theta_pop))

matplot(t(Theta_individual[which.max(gen_effect[, 1]), , ]), xlab = "Time", ylab = "High genetics on Topic 1")

matplot(t(Theta_individual[which.max(gen_effect[, 2]), , ]), xlab = "Time", ylab = "High genetics on Topic 2")

matplot(t(Theta_individual[which.max(gen_effect[, 3]), , ]), xlab = "Time", ylab = "High genetics on Topic 3")



Now plot acitivty:



``` r
melt_topic_activity <- function(activity_array) {
  activity_df <- melt(activity_array)
  names(activity_df) <- c("Individual", "Topic", "Time", "Activity")
  activity_df$Active <- ifelse(activity_df$Activity == 1, paste("Active", activity_df$Topic), "Inactive")
  return(activity_df)
}

df <- melt_topic_activity(Topic_activity)

# Define color palette
num_active_levels <- length(setdiff(unique(df$Active), "Inactive"))
softer_palette <- brewer.pal(min(num_active_levels, 8), "Set2")  # Use Set2 palette for up to 8 different active levels
active_levels <- setdiff(unique(df$Active), "Inactive")
colors_for_active <- setNames(softer_palette[1:num_active_levels], active_levels)
colors <- c("Inactive" = "white", colors_for_active)


# Plotting functions
plot_population <- function(df) {
  ggplot(df, aes(x = Time, y = Topic, fill = Active)) +
    geom_tile() +
    scale_fill_manual(values = colors) +
    labs(title = "Population-Level Topic Activity Over Time",
         x = "Time", y = "Topic", fill = "Topic Status") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "bottom")
}

plot_individual <- function(df, individual_id) {
  individual_df <- df %>% filter(Individual == individual_id)
  ggplot(individual_df, aes(x = Time, y = as.factor(Topic), fill = Active)) +
    geom_tile() +
    scale_fill_manual(values = colors) +
    labs(title = paste("Topic Activity Over Time for Individual", individual_id),
         x = "Time", y = "Topic", fill = "Topic Status") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "bottom")
}

plot_high_predisposition <- function(df, individual_id) {
  high_predisposition_df <- df %>% filter(Individual == individual_id)
  ggplot(high_predisposition_df, aes(x = Time, y = as.factor(Topic), fill = Active)) +
    geom_tile() +
    scale_fill_manual(values = colors) +
    labs(title = paste("High Predisposition Individual", individual_id, "Topic Activity Over Time"),
         x = "Time", y = "Topic", fill = "Topic Status") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "bottom")
}


plot_low_predisposition <- function(df, individual_id) {
  high_predisposition_df <- df %>% filter(Individual == individual_id)
  ggplot(high_predisposition_df, aes(x = Time, y = as.factor(Topic), fill = Active)) +
    geom_tile() +
    scale_fill_manual(values = colors) +
    labs(title = paste("Low Predisposition Individual", individual_id, "Topic Activity Over Time"),
         x = "Time", y = "Topic", fill = "Topic Status") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "bottom")
}


# Identify individuals with high and low genetic predispositions
high_predisposition_individual <- which.max(rowSums(gen_effect))
low_predisposition_individual <- which.min(rowSums(gen_effect))

# 1. Population-Level Topic Activity
print(plot_population(df))

# 2. Individual-Level Topic Activity Across All Topics
print(plot_individual(df, 1))

# 3. Detailed View for an Individual with High Predispositions
print(plot_high_predisposition(df, high_predisposition_individual))

# 4. Detailed View for an Individual with Low Predispositions
print(plot_low_predisposition(df, low_predisposition_individual))

3.1 Plot theta

mf=melt(Theta_individual)
colnames(mf)=c("Individual","Topic","Time","Value")
ggplot(mf[mf$Individual%in%sample(D,1),],aes(x=Time,y=Value,group=as.factor(Topic),fill=as.factor(Topic)))+geom_area()+theme_classic()+labs(y="Theta",x="Age")+scale_fill_nejm()

average=apply(Theta_individual,c(2,3),mean)
m=melt(average)
colnames(m)=c("Topic","Time","Value")
ggplot(m,aes(x=Time,y=Value,group=Topic,fill=as.factor(Topic)))+geom_area()+theme_classic()+labs(y="Average Theta",x="Age")+scale_fill_nejm()