Chapter 9 Diagnoses

We can now sumamrise the diagnoses

diagnosis_count_array <- array(0, dim = c(D, T, V))
topic_count_array <- array(0, dim = c(D, T, K))
disease_in_topic_array=array(0, dim = c(D, T, V))

# Iterate over each patient
for (d in 1:D) {
  # Iterate over each time point
  for (t in 1:length(diags[[d]])) {
    # Check if there are any diagnoses at this time point
    if (!is.null(diags[[d]][[t]])) {
      # Iterate over each diagnosis for this patient at this time
      for (diag in diags[[d]][[t]]) {
        # Increment the count for this diagnosis
        # Ensure diag is an integer and valid index; adjust if necessary based on your diagnosis ID system
        diagnosis_count_array[d, t, diag] <- diagnosis_count_array[d, t, diag] + 1
        disease_in_topic_array[d, t, diag] <- diagnosis_count_array[d, t, diag] + 1
      }
    }
  }
}



# Iterate over each patient
for (d in 1:D) {
  # Iterate over each time point
  for (t in 1:length(diags[[d]])) {
    # Check if there are any diagnoses at this time point
    if (!is.null(diags[[d]][[t]])) {
      # Iterate over each diagnosis for this patient at this time
      for (topic in topics[[d]][[t]]) {
        # Increment the count for this diagnosis
        # Ensure diag is an integer and valid index; adjust if necessary based on your diagnosis ID system
        topic_count_array[d, t, topic] <- topic_count_array[d, t, topic] + 1
        
      }
    }
  }
}

Now let’s make some plots;

D <- dim(diagnosis_count_array)[1]  # Number of patients
T <- dim(diagnosis_count_array)[2]  # Number of time points (years)
V <- dim(diagnosis_count_array)[3]  # Total number of diagnoses

start_year=2
first_occurrences <- vector("list", D)
names(first_occurrences) <- paste0("Patient_", 1:D)

# Iterate through each patient and diagnosis to find the first occurrence
for (d in 1:D) {
  for (v in 1:V) {
    # Find the first time point (if any) this diagnosis occurs for patient d
    time_points <- which(diagnosis_count_array[d, , v] > 0)
    if (length(time_points) > 0) {
      first_occurrence_year <- start_year + time_points[1] - 1
      # Store this info
      first_occurrences[[d]] <- rbind(first_occurrences[[d]], cbind(DiagnosisID = v, Year = first_occurrence_year))
    }
  }
  # Convert each patient's data to a dataframe for easier handling later
  first_occurrences[[d]] <- as.data.frame(first_occurrences[[d]])
}

# At this point, `first_occurrences` is a list of dataframes, each containing the first occurrence years of diagnoses for each patient

# Filter for specific patients (if you're only interested in a subset)
specific_patients <- sample(D, size = 2)  # Example: Randomly selecting 2 patients
df_specific <- lapply(specific_patients, function(d) {
  data.frame(PatientID = d, first_occurrences[[d]])
})
df_specific <- do.call(rbind, df_specific)

# Convert DiagnosisID and PatientID back to factors if needed
df_specific$DiagnosisID <- factor(df_specific$DiagnosisID)
df_specific$PatientID <- factor(df_specific$PatientID)


ggplot(df_specific, aes(x = Year, y = DiagnosisID)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(title = "First Occurrence of Diagnoses Over Time",
       x = "Year", y = "Diagnosis ID") +
  facet_wrap(~PatientID, scales = "free_y") +
  theme_minimal()

We can also plot total counts:

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
## 
##     smiths
# Convert the 3D array to a long format data frame
df <- arrayInd(which(diagnosis_count_array >= 0), dim(diagnosis_count_array)) %>%
  as_tibble() %>%
  rename(PatientID = V1, TimePoint = V2, DiagnosisID = V3) %>%
  mutate(Year = start_year + TimePoint - 1,
         Count = apply(., 1, function(x) diagnosis_count_array[x[1], x[2], x[3]]))


# Filter out zero counts to reduce plot clutter

total_diagnoses_per_year <- df %>%
  group_by(PatientID, Year) %>%
  summarise(TotalDiagnoses = sum(Count))
## `summarise()` has grouped output by 'PatientID'. You can override using the
## `.groups` argument.
library(viridis)
## Loading required package: viridisLite
patients=sample(D,size = 2)
# Plotting
ggplot(total_diagnoses_per_year[total_diagnoses_per_year$PatientID%in%patients,], aes(x = Year, y = TotalDiagnoses, fill = as.factor(PatientID))) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_viridis(discrete = TRUE, option = "viridis", direction = -1) +  # Using viridis
  labs(title = "Total Number of Diagnoses per Year for Each Patient",
       x = "Year", y = "Total Number of Diagnoses") +
  facet_wrap(~PatientID, scales = "free_y", ncol = 4) +  # Adjust ncol for layout
  theme_minimal()

Show that most diseases are in topic:

library(ggplot2)
library(dplyr)
library(tidyr)

# Step 1: Aggregate Diagnoses
# Calculate the proportion of topic-specific diagnoses for each patient at each time interval
# Assuming disease_in_topic is a list of lists with TRUE/FALSE for topic-specific diagnoses
disease_proportion_df <- do.call(rbind, lapply(1:D, function(d) {
  do.call(rbind, lapply(2:length(disease_in_topic[[d]]), function(t) {
    data.frame(
      Patient = d,
      Time = t,
      TopicSpecific = sum(unlist(disease_in_topic[[d]][[t]]), na.rm = TRUE),
      TotalDiagnoses = length(disease_in_topic[[d]][[t]]),
      ProportionTopicSpecific = ifelse(length(disease_in_topic[[d]][[t]]) > 0,
                                       sum(unlist(disease_in_topic[[d]][[t]]), na.rm = TRUE) / length(disease_in_topic[[d]][[t]]),
                                       0)
    )
  }))
}))

# Step 2: Summarize Across Patients
# Calculate the average proportion of topic-specific diagnoses at each time point across all patients
summary_proportion_df <- disease_proportion_df %>%
  group_by(Time) %>%
  summarise(
    AvgPropTopicSpecific = mean(ProportionTopicSpecific, na.rm = TRUE),
    ProportionWithTopicSpecific = sum(TopicSpecific > 0, na.rm = TRUE) / n()
  )

# Step 3: Plot Summary
# Plot the average proportion of patients with topic-specific diagnoses over time
# Calculate proportions for bar chart
bar_chart_df <- disease_proportion_df %>%
  group_by(Time) %>%
  summarise(
    PropTopicSpecific = mean(ProportionTopicSpecific, na.rm = TRUE),
    PropNotTopicSpecific = mean(1 - ProportionTopicSpecific, na.rm = TRUE)
  ) %>%
  gather(key = "Type", value = "Proportion", -Time)

# Create the bar chart
ggplot(bar_chart_df, aes(x = Time, y = Proportion, fill = Type)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("PropTopicSpecific" = "blue", "PropNotTopicSpecific" = "red")) +
  labs(title = "Proportion of Topic-Specific vs Non-Specific Diagnoses Over Time",
       x = "Time Interval", y = "Proportion") +
  theme_minimal()

9.1 Test:

For a sample patient, let’s make sure that their chosen disease topic assignemnets correpsonded to heavy loadings.

pt=sample(D,size = 1)
time=sample(T,1)
wt=Theta[pt,,time]
zs=topics[[pt]][time][[1]]
print(zs)
diseases=diags[[pt]][time][[1]]
print(diseases)

sapply(z[[1]],function(x){
  topic_specific_diseases[[x]]})

a=vector()
for(d in 1:D){
  pt=d
  for(t in 1:T){
    time=t
    wt=Theta[pt,,time]
zs=topics[[pt]][time][[1]]
diseases=diags[[pt]][time][[1]]
for(i in 1:length(zs)){
  topicx=zs[i]
  diseasex=diseases[i]
  newwt=beta[pt,topicx,diseasex,time]
  a=c(newwt,a)
  
  }}}