Chapter 5 However this assumes that an individual stays within a topic

Assumption: that the change in disease distribtuion for a given topic captures all the heterogeneity of disease coding
We know that is not the case from our previous work: an individual can switch to a new pattern based on genetics and environmental factors, and we should have an ability to ‘update’ an individuals’ topic distribtuion
Furthermore, genetics has a declining importance over time, and the new diagnoses carry more weight. The rate of advancement through a topic (i.e., the shape of the spline) also may be affected by these intrinsic factors
Finally, occupancy in one topic necessarily may prevent other topics

5.1 Example function to update theta based on new diagnosis

Let’s start by modeling an individual’s transition to a new topic distribtuion as a funciton of genetics.

That is, \(p(K=k|D)=p(D|K)*p(K)/p(D)\)

Here, we will have the genetics influence change by a decay factor, and the updated theta be proportional to the likelihood of the new diagnosis:

\(\theta_{t+1}=p(D|\theta_{t})*p(\tilde{\theta})\)

Here, \(p(\tilde{\theta}\) represents the genetics-adapted prior distribution

Genetics keeps us in the prior distribtuion more at early ages than later, so the strength of genetics influence on the prior will decline over time

library(rBeta2009)
update_theta <- function(current_theta, new_diagnosis, phi_time_dependent, genetics_influence, current_time, max_time) {
  # Ensure valid inputs
  if (any(is.na(current_theta)) || any(is.na(phi_time_dependent))) {
    #print("isna")
    return(current_theta)
  }

  # Decay factor for genetics influence
  decay_factor <- ((max_time - current_time) / max_time)^3
  
  
  adjusted_genetics_influence <- genetics_influence * decay_factor
  #print(adjusted_genetics_influence)

  # Select phi for the current time
  phi_current_time <- phi_time_dependent[,,current_time]

  # Adjusted prior as a vector, where genetics impacts less with tim
  adjusted_prior <- current_theta * adjusted_genetics_influence

  # Likelihood as a vector for the specific diagnosis
  likelihood_vector <- phi_current_time[, new_diagnosis]

  # Bayesian update: Element-wise multiplication of adjusted_prior and likelihood_vector
  updated_theta <- adjusted_prior * likelihood_vector

  # Check for zero likelihood and revert to original theta for those cases
  if (all(likelihood_vector == 0)) {
    return(current_theta)
  }

  # Normalize the updated theta
  total <- sum(updated_theta)
  if (total == 0 || is.na(total)) {
    return(current_theta)
  }
  
  return(updated_theta / total)
}

Simulate data, this time let’s let the topic distribtuion for an individual be roughly uniform on the simplex. Recall that if all are equal and greater than 1 (e.g., 100 for topic-specific diseases and 100 for non-topic-specific diseases), the distribution is centered around the vector (1/K, 1/K, …, 1/K). The higher the α values, the more peaked or concentrated the distribution is around this central vector.

M <- 400          # Number of patients
K <- 5            # Number of topics
V <- 300          # Number of diseases
T <- 100          # Number of time steps
max_time <- T     # Assuming T is the maximum time

# Initial topic distributions (theta) for each patient, we want this initially quite monotone at 1/K
initial_theta <- rdirichlet(M, shape = rep(100, K))


# Time-varying disease distributions within topics (phi)
# Assuming phi is a 3D array (topics x diseases x time)
# This needs to be defined based on your model

# Genetics influence (for simplicity, assuming a constant vector for each patient), how much they want to stay in a topic, but since we simulate topics randomly maybe think a bit more carefully about this
genetics_influence <- matrix(runif(M * K, 0.5, 1.5), nrow = M, ncol = K)

# Structure to hold theta values over time for each patient
theta_over_time <- array(dim = c(T, M, K))
theta_over_time[1, , ] <- initial_theta

## check to make sure all 1
all.equal(rowSums(theta_over_time[1,,]),rep(1,nrow(theta_over_time[1,,])))

## [1] TRUE

# Function to simulate a new diagnosis (placeholder)
simulate_new_diagnosis <- function() {
  # Randomly simulate a new diagnosis
  return(sample(as.vector(topic_spec_disease), 1))
}


find_topic_for_diagnosis <- function(diagnosis, topic_spec_disease) {
  for (topic in 1:nrow(topic_spec_disease)) {
    if (diagnosis %in% topic_spec_disease[topic, ]) {
      return(topic)
    }
  }
  return(NA)  # Return NA if the diagnosis doesn't match any topic
}

for (time in 2:T) {
  for (patient in 1:M) {
    # Simulate a new diagnosis for this patient at this time
    new_diagnosis <- simulate_new_diagnosis()
    # print(new_diagnosis)
    # 
    # print(paste("Time:", time, "Patient:", patient, "Diagnosis:", new_diagnosis))
    # 
    # # Find the topic that the new diagnosis belongs to
    # topic <- find_topic_for_diagnosis(new_diagnosis, topic_spec_disease)
    # print(paste("Topic for the diagnosis:", topic))

    # Update the patient's topic distribution based on the new diagnosis
    theta_over_time[time, patient, ] <- update_theta(
      current_theta = theta_over_time[time - 1, patient, ],
      new_diagnosis = new_diagnosis,
      phi_time_dependent = phi,  # Assuming phi is defined as before
      genetics_influence = genetics_influence[patient, ],
      current_time = time,
      max_time = max_time
    )
  }
}

5.2 Topic Distribution per patient

Now let’s select for a sample patient and view his topic distribution over time.

Now, we might want to select a few patients for visualization and watch their disease codes over time.

num_patients=5
num_time_points=100
phi_array=phi

library(ggplot2)

# Function to simulate diagnoses
simulate_diagnoses <- function(theta, phi, num_patients, num_diseases, num_time_points) {
  diagnosis_record <- matrix(NA, nrow = num_patients, ncol = num_time_points)

  for (i in 1:num_patients) {
    for (t in 1:num_time_points) {
      # Aggregate disease probabilities across topics based on patient's theta
      disease_probabilities <- colSums(sweep(phi[, , t], 1, theta[i, , t], '*'))
      # Normalize probabilities
      disease_probabilities <- disease_probabilities / sum(disease_probabilities)
      # Simulate diagnosis
      diagnosed_disease <- sample(1:num_diseases, 1, prob = disease_probabilities)
      diagnosis_record[i, t] <- diagnosed_disease
    }
  }
  return(diagnosis_record)
}

# Assuming theta is indexed as [N, K, T]

library(ggplot2)
library(reshape2)

# Function to simulate diagnoses
simulate_diagnoses <- function(theta, phi, num_patients, num_diseases, num_time_points) {
  diagnosis_record <- matrix(NA, nrow = num_patients, ncol = num_time_points)

  for (t in 1:num_time_points) {
    for (i in 1:num_patients) {
      # Aggregate disease probabilities across topics based on patient's theta at time t
      disease_probabilities <- colSums(sweep(phi[, , t], 1, theta[t, i, ], '*'))
      # Normalize probabilities
      disease_probabilities <- disease_probabilities / sum(disease_probabilities)
      # Simulate diagnosis
      diagnosed_disease <- sample(1:num_diseases, 1, prob = disease_probabilities)
      diagnosis_record[i, t] <- diagnosed_disease
    }
  }
  return(diagnosis_record)
}

# Parameters
num_patients <- 5

# Simulate diagnoses
diagnosis_record <- simulate_diagnoses(theta_over_time, phi, num_patients, V, T)

# Convert to long format for plotting
diagnosis_data_long <- melt(diagnosis_record, varnames = c("Patient", "Time"), value.name = "Disease")

# Plotting
ggplot(diagnosis_data_long, aes(x = Time, y = Patient, color = as.factor(Disease))) +
  geom_point() +
  scale_color_viridis_d() +
  labs(title = "Diagnoses Over Time for Each Patient", x = "Time", y = "Patient", color = "Disease") +
  theme_minimal()

#apply(theta_over_time[,c(1:num_patients),],c(1,2),which.max)