# Week 7 Model evaluation II

There are multiple ways to evaluate models. What we have learned so far is Information Criteria (e.g., AIC and BIC). Instead of using NLL (which quantifies goodness-of-fit), we also punish model complexity given number of parameters.
Today, we will learn three more methods to evaluate models:
1. Ex post simulation
2. model recovery (using two or two more models)
3. parameter recovery (focusing on one model)

# Week 7: Model evaluation II
# First and Last name
# date
• To start the new project, clear working environment with rm(list=ls()).
# clear working environment
rm(list=ls())
• Load the package
# Open "rtdists" package
library(rtdists)
# Load data from the "rtdists" package
data(rr98)
# Double-click rr98 in your Environment panel (right-top window)
# In the dataset, there are 3 participants, who completed ~ 4000 trials in each condition (speed/accuracy).
• Preprocess data to make sure the data and data structure are suitable for follow-up model fitting.
# Preprocess data
rr98 <- rr98[!rr98$outlier,] # remove outliers, as in original paper rr98[,'RT'] <- rr98$rt
rr98[,'accuracy'] = 0 # add a new column labeled as accuracy, where incorrect responses are coded as 0
rr98[rr98$correct, 'accuracy'] = 1 # and correct responses are coded as 1 rr98$choice = rr98$response rr98$response = as.character(rr98$response) rr98[rr98$correct == 1, "response"] = "upper"
DDMsimu.accuracy$instruction <- "accuracy" ### 7.2.3 Simulation and model fitting What if we conduct the same experiment with 200 synthetic subjects and with two hypothetical models? Simple! We just need to repeat the same simulation procedure 200 times for each model. This repetition can be addressed by the for function. In order to mimic the individual difference on information processing, we use different sets of parameter values for each synthetic subject in each model. The simulation for each subject is then followed by a model fitting. This exercise will take ~10 min. You will see the current progress on the Console window. ## Simulation and model fitting # predetermine nObservation <-200 # number of samples nParam.oDDM <-3; # number of parameters in oDDM nParam.eDDM <-4; # number of parameters in eDDM # pre-allocation AIC.oDDM<- matrix(data=NA,nrow=nObservation,ncol=2) AIC.eDDM<- matrix(data=NA,nrow=nObservation,ncol=2) BIC.oDDM<- matrix(data=NA,nrow=nObservation,ncol=2) BIC.eDDM<- matrix(data=NA,nrow=nObservation,ncol=2) # for loop to run the experiment N times. Each time we simulate data with both oDDM and eDDM. for (kSim in 1:nObservation) { print(kSim/nObservation) # update current progress in the Console window # ---------------------- # Simulate data with original DDM ParameterSet <-c(runif(n=1, min=0.5, max=1), runif(n=1, min=0.5, max=2), runif(n=1, min=0.1, max=0.3)) # pre-allocation DDMsimu.speed <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.speed)), c("rt", "response")) DDMsimu.accuracy <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.accuracy)), c("rt", "response")) # simulation DDMsimu.speed <- rdiffusion(nTrial.speed, v = ParameterSet[1], a = ParameterSet[2], t0 = ParameterSet[3]) DDMsimu.accuracy <- rdiffusion(nTrial.accuracy, v = ParameterSet[1], a = ParameterSet[2], t0 = ParameterSet[3]) DDMsimu.speed$instruction <- "speed"
DDMsimu.accuracy$instruction <- "accuracy" oDDMsimu.All <- rbind(DDMsimu.speed,DDMsimu.accuracy) # ---------------------- # Simulate data with extended function eParameterSet <-c(runif(n=1, min=0.5, max=1), runif(n=1, min=0.5, max=2), runif(n=1, min=0.5, max=2), runif(n=1, min=0.1, max=0.3)) # pre-allocation DDMsimu.speed <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.speed)), c("rt", "response")) DDMsimu.accuracy <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.accuracy)), c("rt", "response")) # simulation DDMsimu.speed <- rdiffusion(nTrial.speed, v = eParameterSet[1], a = eParameterSet[2], t0 = eParameterSet[4]) DDMsimu.accuracy <- rdiffusion(nTrial.accuracy, v = eParameterSet[1], a = eParameterSet[3], t0 = eParameterSet[4]) DDMsimu.speed$instruction <- "speed"
DDMsimu.accuracy$instruction <- "accuracy" eDDMsimu.All <- rbind(DDMsimu.speed,DDMsimu.accuracy) # ---------------------- # model fitting (original DDM) for (kDDMdata in 1:2) { if (kDDMdata ==1){ data <- oDDMsimu.All } else { data<- eDDMsimu.All } # 1. set up lower and upper boundary for each parameter # Also, check if the same range is used for the same parameter lb <-c(-5,0.5,0.1) # Lower Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time ub <-c(5,5,0.5) # Upper Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time # 2. Create optimization function with an additional input: number of repetition. # (The code is adapted from Mikhail Spektor's). results.oDDM<-setNames(data.frame(matrix(ncol = 5, nrow = 0)), c("nRepetition","v", "a","t0 (ms)", "nll")) run <- 1 optimization_runs <-3 # Run the parameter estimation for several times (this will take few seconds/minutes to complete) while (run <= optimization_runs){ starting_values <- c(runif(n=1, min=-5, max=5), runif(n=1, min=0.5, max=5), runif(n=1, min=0, max=0.1)) while (oDDM(starting_values,data) == Inf) {starting_values <- c(runif(n=1, min=-5, max=5), runif(n=1, min=0.5, max=5), runif(n=1, min=0, max=0.1))} # parameter estimation procedure using nlminb fitDDM <- nlminb(start=starting_values, objective=oDDM, lower = lb, upper= ub, dat=data)# the data is the one we simulated above deviance <- fitDDM[["objective"]] params <- fitDDM[["par"]] # store results for each iterate results.oDDM[run,] <- c(run, round(params[1],2), round(params[2],2), round(params[3],2), deviance) run <- run + 1} # ---------------------- # model fitting (extended DDM) # 1. set up lower and upper boundary for each parameter # Also, check if the same range is used for the same parameter lb <-c(-5,0.5,0.5,0.1) # Lower Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time ub <-c(5,5,5,0.5) # Upper Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time # 2. Create optimization function with an additional input: number of repetition. # (The code is adapted from Mikhail Spektor's). results.eDDM<-setNames(data.frame(matrix(ncol = 6, nrow = 0)), c("nRepetition","v","a_speed","a_accuracy", "t0 (ms)", "nll")) run <- 1 optimization_runs <-3 # Run the parameter estimation for several times (this will take few seconds/minutes to complete) while (run <= optimization_runs){ starting_values <- c(runif(n=1, min=-5, max=5), runif(n=1, min=0.5, max=5), # selected from the same range for the same parameter runif(n=1, min=0.5, max=5), # selected from the same range for the same parameter runif(n=1, min=0, max=0.1)) while (eDDM(starting_values,data) == Inf) {starting_values <- c(runif(n=1, min=-5, max=5), runif(n=1, min=0.5, max=5), runif(n=1, min=0.5, max=5), runif(n=1, min=0, max=0.1))} # parameter estimation procedure using nlminb fiteDDM <- nlminb(start=starting_values, objective=eDDM, lower = lb, upper= ub, dat=data) deviance <- fiteDDM[["objective"]] params <- fiteDDM[["par"]] # store results for each iterate results.eDDM[run,] <- c(run, round(params[1],4), round(params[2],4), round(params[3],4), round(params[4],4), deviance) run <- run + 1 } # identify the lowest nLL and extract the parameter and compute AIC and BIC nLLlowes.oDDM<- min(results.oDDM[,5]) nLLlowes.eDDM<- min(results.eDDM[,6]) AIC.oDDM[kSim, kDDMdata] <- nLLlowes.oDDM+2*nParam.oDDM # AIC values when true model is oDDM and fitting oDDM AIC.eDDM[kSim, kDDMdata] <- nLLlowes.eDDM+2*nParam.eDDM # AIC values when true model is oDDM and fitting eDDM BIC.oDDM[kSim, kDDMdata] <- nLLlowes.oDDM+nParam.oDDM*log(length(oDDMsimu.All[,1])) BIC.eDDM[kSim, kDDMdata] <- nLLlowes.eDDM+nParam.eDDM*log(length(eDDMsimu.All[,1])) # AIC.oDDM[, 1] are AIC values when true model is oDDM and fitting oDDM # AIC.eDDM[, 1] are AIC values when true model is oDDM and fitting eDDM # AIC.oDDM[, 2] are AIC values when true model is eDDM and fitting oDDM # AIC.eDDM[, 2] are AIC values when true model is eDDM and fitting eDDM } } ## Summarize model recovery with the confusion matrix # plot the confusion matrix library(yardstick) # Install it if you cannot find the package in your R library(ggplot2) # Install it if you cannot find the package in your R # Create targets and predictions data frame TP<- data.frame( "simulated" = c(rep(1,kSim),rep(2,kSim)), # suppose you have two models, 1= oDDM, 2=eDDM "estimated" = c(apply(cbind(AIC.oDDM[,1],AIC.eDDM[,1]),MARGIN = 1, FUN = which.min), #identify the minimum AIC for each synthetic subject when the true model is oDDM. apply(cbind(AIC.oDDM[,2],AIC.eDDM[,2]),MARGIN = 1, FUN = which.min)),#identify the minimum AIC for each synthetic subject when the true model is eDDM stringsAsFactors = FALSE ) # transform values to factors TP$simulated <- as.factor(TP$simulated) TP$estimated <- as.factor(TP$estimated) # The confusion matrix cm <- conf_mat(TP, simulated, estimated) autoplot(cm, type = "heatmap")+ theme(legend.position = "right")+ labs(fill="Frequency") ## 7.3 Parameter recovery To evaluate a single model and to test if there is an unique mapping between possible data pattern and a corresponding set of parameter estimates, the parameter recovery test is needed. Unlike model recovery, we focus on one model and correlation between true parameter values and estimated parameter values. # predetermine nObservation <-200 nParam.eDDM <-4; # pre-allocation eParameterSet<- setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("v","a_speed","a_accuracy", "t0")) Recovery.eDDM <-setNames(data.frame(matrix(ncol = 4, nrow = 0)), c("v","a_speed","a_accuracy", "t0")) for (kSim in 1:nObservation){ # ---------------------- # Simulate data with extended function eParameterSet[kSim,] <-c(runif(n=1, min=0.5, max=1), runif(n=1, min=0.5, max=2), runif(n=1, min=0.5, max=2), runif(n=1, min=0.1, max=0.3)) # pre-allocation DDMsimu.speed <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.speed)), c("rt", "response")) DDMsimu.accuracy <-setNames(data.frame(matrix(ncol = 2, nrow = nTrial.accuracy)), c("rt", "response")) # simulation DDMsimu.speed <- rdiffusion(nTrial.speed, v = eParameterSet[kSim,1], a = eParameterSet[kSim,2], t0 = eParameterSet[kSim,4]) DDMsimu.accuracy <- rdiffusion(nTrial.accuracy, v = eParameterSet[kSim,1], a = eParameterSet[kSim,3], t0 = eParameterSet[kSim,4]) DDMsimu.speed$instruction <- "speed"
DDMsimu.accuracy\$instruction <- "accuracy"
eDDMsimu.All <- rbind(DDMsimu.speed,DDMsimu.accuracy)

# ----------------------
# model fitting (extended DDM)
# 1. set up lower and upper boundary for each parameter
# Also, check if the same range is used for the same parameter
lb <-c(-5,0.5,0.5,0.1) # Lower Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time
ub <-c(5,5,5,0.5) # Upper Boundary for each parameter. Order is important. Here, drift rate, threshold and non-decision time

# 2. Create optimization function with an additional input: number of repetition.
# (The code is adapted from Mikhail Spektor's).
results.eDDM<-setNames(data.frame(matrix(ncol = 6, nrow = 0)), c("nRepetition","v","a_speed","a_accuracy", "t0 (ms)", "nll"))
run <- 1
optimization_runs <-3

# Run the parameter estimation for several times (this will take few seconds/minutes to complete)
while (run <= optimization_runs){
starting_values <- c(runif(n=1, min=-5,  max=5),
runif(n=1, min=0.5,  max=5), # selected from the same range for the same parameter
runif(n=1, min=0.5,  max=5), # selected from the same range for the same parameter
runif(n=1, min=0,  max=0.1))
while (eDDM(starting_values,data) == Inf) {starting_values <- c(runif(n=1, min=-5,  max=5),
runif(n=1, min=0.5,  max=5),
runif(n=1, min=0.5,  max=5),
runif(n=1, min=0,  max=0.1))}
# parameter estimation procedure using nlminb
fiteDDM <- nlminb(start=starting_values, objective=eDDM,
lower = lb,
upper= ub,
dat=eDDMsimu.All)
deviance <- fiteDDM[["objective"]]
params <- fiteDDM[["par"]]

# save results for each iterate
results.eDDM[run,] <- c(run,
round(params[1],4),
round(params[2],4),
round(params[3],4),
round(params[4]*1000),
deviance)
run <- run + 1
}

# identify the lowest nLL and extract the parameter and compute AIC and BIC
id<- which.min(results.eDDM[,5])
Recovery.eDDM[kSim,] <- results.eDDM[id,2:5]
}

Next, we can run the correlation analysis

#the correlation between true and recovered/estimated parameters
round(cor(Recovery.eDDM,eParameterSet), 2)