Chapter 4 Simulation of Disease Loadings

For the disease loadings, we use different mean functions to simulate the progression of disease probabilities over time. We assume that diseases tend to be topic-specific and heavily loaded on a few topics.

cov_function <- function(t1, t2, lengthscale, variance) {
    exp(-0.5 * (t1 - t2)^2 / lengthscale^2) * variance
}

# Store disease-specific covariances
Sigma <- array(dim = c(K, V, T, T)) # K topics, V diseases, T x T covariance matrix each


eta <- array(dim = c(K, V, T))


# Bias factor (adjust as needed)


# Generate covariances (adjust as needed)
for (k in 1:K) {
  for (v in 1:V) {
    # Sample a single lengthscale and variance for each topic-disease pair
      cov_matrix_kv <- matrix(0, nrow = T, ncol = T)
      #lengthscale_kv <- runif(1,min=5,max=20) ## choose random lengthscale for each topic and disease
      #variance_kv <- runif(1,0.1,0.5)## choose random variance for each topic and disease
      variance_kv=0.01 ## smaller variance means it changes less from mu funciotn
      
      lengthscale_kv=10
    for (i in 1:T) {
for (j in 1:T) {
        cov_matrix_kv[i, j] <- cov_function(i, j, lengthscale_kv, variance_kv)
      }
    }

    Sigma[k, v, , ] <- cov_matrix_kv
  }
}

4.1 Sample from the Gaussian Process

The probability of a disease within a topic over time has some covariance structure. We also might imagine that the disease probabilities also have some shape over time:

linear_trend <- function(time,
                         slope = 0.1,
                         intercept = 0) {
  return(slope * time + intercept)
}


logistic_growth <-
  function(t, carrying_capacity, growth_rate, t_mid) {
    carrying_capacity / (1 + exp(-growth_rate * (t - t_mid)))
  }


exponential_decay <- function(t, initial_value, decay_rate) {
  initial_value * exp(-decay_rate * t)
}

gaussian_peak <- function(t, mean, variance) {
  exp(-(t - mean) ^ 2 / (2 * variance))
}


polynomial_trend <- function(t, coefficients) {
  result <- sapply(t, function(time_point) {
    sum(sapply(1:length(coefficients), function(i) coefficients[i] * time_point^(i - 1)))
  })
  return(result)
}

sinusoidal_pattern <- function(t, amplitude, period, phase) {
  amplitude * sin((2 * pi / period) * t + phase)
}

cov_function <- function(t1, t2, lengthscale, variance) {
  return(variance * exp(-0.5 * (t1 - t2) ^ 2 / lengthscale ^ 2))
}

We might wish that each disease within a topic ihas it’s unique mean function:

# Modify eta sampling to use the correct covariance
# Covariance matrix Sigma (you should define this according to your model specifics)
# Parameters

D <- 20  # Number of individuals
K <- 5   # Number of topics
T <- 100  # Number of time points
time_points <- 1:T

# Initialize the array
topic_word_mean <- array(NA, dim = c(K, V))


mean_functions <- list(
  linear_trend = linear_trend,
  logistic_growth = logistic_growth,
  #exponential_decay = exponential_decay,
  gaussian_peak = gaussian_peak
  #polynomial_trend = polynomial_trend
  #sinusoidal_pattern = sinusoidal_pattern
)

# Define a list of names for the mean functions
mean_function_names <- names(mean_functions)


# Simulate mean functions for each disease
mu_vectors <- list()
mean_functions_called = list()


for (k in 1:K) {
  mu_vectors[[k]] <- list()
  mean_functions_called[[k]] = list()
  for (v in 1:V) {
    # Choose a mean function randomly
    # Choose a mean function by name randomly
    chosen_function_name <- sample(mean_function_names, 1)
    # Retrieve the function from the list using the name
    mean_function <- mean_functions[[chosen_function_name]]
    # Sample parameter ranges
 params <- list(
      slope = runif(1, -0.05, 0.05),
      intercept = runif(1, -1, 1),
      carrying_capacity = runif(1, 0.02, 0.05),
      growth_rate = runif(1, 0.01, 0.05),
      t_mid = sample(time_points, 1),
      mean = sample(time_points, 1),
      variance = runif(1, 10, 50),
      coefficients = runif(3, -0.1, 0.1)
    )
    
    
    
    # Use the appropriate parameters based on the chosen mean function
    if (chosen_function_name  == "logistic_growth") {
      function_specific_params <- list(
        t = time_points,
        carrying_capacity = params$carrying_capacity,
        growth_rate = params$growth_rate,
        t_mid = params$t_mid
      )
    } else if (chosen_function_name  == "linear_trend") {
      function_specific_params <- list(
        t = time_points,
        slope = params$slope,
        intercept = params$intercept
      )
    } else if (chosen_function_name  == "exponential_decay") {
      function_specific_params <- list(
        t = time_points,
        initial_value = params$initial_value,
        decay_rate = params$decay_rate
      )
    } else if (chosen_function_name  == "gaussian_peak") {
      function_specific_params <- list(t = time_points,
                                       mean = params$mean,
                                       variance = params$variance)
    } else if (chosen_function_name == "polynomial_trend") {
      function_specific_params <- list(t = time_points, coefficients = params$coefficients)
    } else if (chosen_function_name  == "sinusoidal_pattern") {
      function_specific_params = list(
        t = time_points,
        amplitude = params$amplitude,
        period = params$period,
        phase = params$phase
      )
    }
    
    
    # Generate the time series for the mean function using do.call
    mu_vectors[[k]][[v]] <-
      do.call(mean_function, function_specific_params)
    mean_functions_called[[k]][[v]] = list("function_name" = chosen_function_name, "params" = function_specific_params)
    
    eta[k, v, ] <-
      mvrnorm(1, mu = mu_vectors[[k]][[v]], Sigma = Sigma[k, v, , ])
  }
}



rho = matrix(runif(K*D,min = 0.5,max = 2),nrow=D,ncol=K) 
#rho=matrix(1,nrow=D,ncol=K)
# Normalize using logistic function and a scaling factor
adjusted_logistic_function <- function(x, scale) {
  return(scale / (1 + exp(-x)))
}


inv_time_warping <- function(t, rho, T) {
return(min(T, max(1, floor((t / T) ^ (1 / rho) * T))))
}


# Initialize inverse warped time array
inv_warped_t_array <- array(0, dim = c(D, K, T))

for (d in 1:D) {
  for (k in 1:K) {
    for (t in 1:T) {
      inv_warped_t_array[d, k, t] <- inv_time_warping(t, rho[d, k], T)
    }
  }
}

4.2 Topic specificity

We simiulate so that disease tend to be somewhat topic specific, that is, heavilty loaded on a few topics

# Determine the number of topics in which each disease will be primarily active
#num_topics_per_disease <- sample(1:2, V, replace = TRUE)  # Each disease is active in 1 or 2 topics
num_topics_per_disease = rep(1,V)
# Initialize the list to store the primary topics for each disease
disease_primary_topics <- vector("list", length = V)

# Initialize the disease-topic matrix with zeros indicating inactivity
disease_topic_matrix <- matrix(0, nrow = V, ncol = K)

# Assign primary topics to each disease
for (disease in 1:V) {
  primary_topics <- sample(1:K, num_topics_per_disease[disease])
  disease_primary_topics[[disease]] <- primary_topics
  
  # Set the bias value for the disease in its primary topics, can vary
  #bias_values <- runif(num_topics_per_disease[disease], 0, 1)  # Variable bias between 0 and 1
  bias_values = rep(1,num_topics_per_disease[disease])
  # Fill in the disease-topic matrix with bias values for the primary topics
  disease_topic_matrix[disease, primary_topics] <- bias_values
}


image(t(disease_topic_matrix),xlab="Topic",ylab="Disease")

image((disease_topic_matrix),ylab="Topic",xlab="Disease")

topic_specific_diseases=list()
for(i in 1:K){
  topic_specific_diseases[[i]]=which(disease_topic_matrix[,i]==1)}

4.3 Now do on population for different warping

beta <- array(0, dim(eta))
# Normalize using logistic function and a scaling factor
adjusted_logistic_function <- function(x, scale) {
  return(scale / (1 + exp(-x)))
}

scaling_factor <- 0.02  #

bias_value=max(eta)
for (topic_idx in 1:K) {
    # Retrieve the diseases and their risk periods for this topic
    #specific_diseases_periods <- topic_disease_risk_periods[[topic_idx]]
    bias_increase <- disease_topic_matrix[,topic_idx]*bias_value
    for (time_idx in 1:T) {

      disease_values <- eta[topic_idx, , time_idx]
      
      # Increase the values for active diseases at this time before normalization
      dv <- disease_values+bias_increase
      
      # Apply softmax normalization
      normalized_disease <- adjusted_logistic_function(dv,scale = scaling_factor)
            #normalized_disease <- softmax_normalize(dv)

      beta[topic_idx, , time_idx] <- normalized_disease
    }
}

Show that disease probabilities for two people of different \(\rho\):

v=sample(topic_specific_diseases[[topic_idx]],1)
plot(beta[topic_idx,v,],pch=19,col="grey",ylab="Disease Probability over Time conditional on topic activity",xlab="Time")
t_high=inv_warped_t_array[which.max(rho[,topic_idx]),topic_idx,]
points(seq(1:T),beta[topic_idx,v,t_high],col="red",pch=19)
t_low=inv_warped_t_array[which.min(rho[,topic_idx]),topic_idx,]
points(seq(1:T),beta[topic_idx,v,t_low],col="blue",pch=19)

Expand beta_prime array so that it stores the right value for each individual:

beta_prime=array(NA,dim=c(D,K,V,T))
for(d in 1:D){
  for(k in 1:K){
    warped_times = inv_warped_t_array[d,k,]
    beta_prime[d,k,,]=beta[k,,warped_times]
  }
}

betaold=beta