elegant code

get the average Sepal.Length per Species

■ Consider a better alternative:

# Naive version: compute the mean Sepal.Length of each species with an
# explicit loop, storing each result under the species name.
groups <- levels(iris$Species)
averages <- c()                  # starts empty; one named element is added per pass
for(g in groups) averages[g] <- 
                 mean(iris$Sepal.Length[iris$Species==g])
rm(g, groups)                    # drop the loop helpers from the workspace
averages
then click here to see the elegant approach:
# Split Sepal.Length by Species and take the mean of each group in one call;
# with() lets the column names be used directly.
with(iris, tapply(Sepal.Length, Species, mean))


conditionally set invalid records to NA

■ Consider a better alternative:

# Naive version: walks the data frame row by row and tests column1 one
# value at a time.
# Note: 1:nrow(DF) yields c(1, 0) when DF has no rows, so the loop body
# would still run.
for(i in c(1:nrow(DF))){
  if(DF[i, "column1"] != "valid"){  # Where column1 is not
   DF[i, "column2"] <- NA           # 'valid', set
  }                                 # column2 to NA
}
then click here to see the elegant approach:
# One vectorised step: every column2 value whose column1 is not "valid"
# is replaced by NA.
DF$column2 <- replace(DF$column2, DF$column1 != "valid", NA)


collect results

■ Consider a better alternative:

# Naive version: start with an empty vector and append one result per
# iteration. Each c() call builds a new, longer vector, so the data is
# copied again on every pass.
results <- c()
for(i in 1:10000) {
  results <- c(results, some_calculation(i))
}
then click here to see the elegant approach:
# Apply some_calculation() to every index in a single call and collect the
# results. The range is 1..10000, the same as the loop version, so both
# approaches run the same number of calculations.
# sapply() simplifies its output when it can; if some_calculation() always
# returns a single number, vapply(1:10000, some_calculation, numeric(1)) is
# the type-stable alternative.
results <- sapply(1:10000, some_calculation)


scale numeric columns

■ Consider a better alternative:

# Naive version: standardise every numeric column by hand, one cell at a
# time.
for(col in names(DF)) {
  if(is.numeric(DF[ ,col])) {
    # Column mean and standard deviation, ignoring missing values.
    mean_val <- mean(DF[ ,col], na.rm=TRUE)
    sd_val <- sd(DF[ ,col], na.rm=TRUE)
    # Subtract the mean and divide by the standard deviation, row by row.
    for(i in 1:nrow(DF)) {
      DF[i, col] <- (DF[i, col] - mean_val) / sd_val
    }
  }
}
then click here to see the elegant approach:
# Flag the numeric columns. vapply() always returns a logical vector (even
# for a data frame with no columns), whereas sapply() may return a list.
numcols <- vapply(DF, is.numeric, logical(1))
# scale() centres each selected column on its mean and divides by its
# standard deviation, ignoring missing values.
DF[, numcols] <- scale(DF[, numcols])
rm(numcols)  # drop the helper from the workspace


read multiple csv files

■ Consider a better alternative:

# Naive version: one hand-written read.csv() call and one variable per
# file, followed by an rbind() that has to list every variable again.
file1 <- read.csv("data1.csv")
file2 <- read.csv("data2.csv")
file3 <- read.csv("data3.csv")
# ... repeat for 50 files
combined <- rbind(file1, file2, file3) # ... and so on
then click here to see the elegant approach:
# list.files() interprets `pattern` as a regular expression, not a shell
# glob: "\\.csv$" matches only names that end in a literal ".csv"
# (so e.g. "notes.csv.bak" is not picked up).
files <- list.files(pattern = "\\.csv$", full.names = TRUE)
# Read every file, then stack the resulting data frames with one rbind().
combined <- do.call(rbind, lapply(files, read.csv))


count occurrences

■ Consider a better alternative:

# Naive version: set up one zero-initialised counter per distinct category,
# then increment the matching counter for every row.
categories <- unique(DF$category)
counts <- numeric(length(categories))
names(counts) <- categories
# NOTE(review): if DF$category is a factor, counts[cat] indexes by the
# factor's integer code rather than by its label -- confirm the column is
# character.
for(i in 1:nrow(DF)) {
  cat <- DF$category[i]          # local variable that shares its name with base::cat()
  counts[cat] <- counts[cat] + 1
}
then click here to see the elegant approach:
# table() tallies how often each distinct value of DF$category occurs.
table(DF$category)


create age groups

■ Consider a better alternative:

# Naive version: classify each row's age with a chain of if / else if
# tests. Each band includes its lower bound and excludes its upper bound;
# ages that match no band (e.g. negative values) are set to NA.
# Note: the conditions use the element-wise `&` on single values, and an
# NA age makes the if() condition NA, which R reports as an error.
DF$age_group <- NA
for(i in 1:nrow(DF)) {
       if(DF$age[i] >=  0 & DF$age[i] < 18) DF$age_group[i] <- "0-17"
  else if(DF$age[i] >= 18 & DF$age[i] < 30) DF$age_group[i] <- "18-29"
  else if(DF$age[i] >= 30 & DF$age[i] < 50) DF$age_group[i] <- "30-49"
  else if(DF$age[i] >= 50 & DF$age[i] < 65) DF$age_group[i] <- "50-64"
  else if(DF$age[i] >= 65)                  DF$age_group[i] <- "65+"
  else                                      DF$age_group[i] <- NA
}
then click here to see the elegant approach:
# cut() bins the whole age column in one call and returns a factor.
# right = FALSE makes each interval closed on the left and open on the
# right, i.e. [0, 18), [18, 30), [30, 50), [50, 65), [65, Inf); values
# outside every interval become NA.
DF$age_group <- cut(
  DF$age,
  breaks = c(0, 18, 30, 50, 65, Inf),
  labels = c("0-17", "18-29", "30-49", "50-64", "65+"),
  right = FALSE
)


More examples are shown in the full R course tutorial slides.