Chapter 6 Advanced Data Structures

Lander's chapter 5 - Advanced Data Structures

Data come in many types and structures, which can pose a problem for some analysis environments, but R handles them with aplomb. The most common data structure is the one-dimensional vector, which forms the basis of everything in R. The most powerful structure is the data.frame--something special in R that most other languages do not have--which handles mixed data types in a spreadsheet-like format. Lists are useful for storing collections of items, like a hash in Perl.

The main difference between an array and a matrix is that matrices are restricted to two dimensions while arrays can have an arbitrary number.

####################################
#chapter 5 advanced data structures#
####################################

# data.frame
# Build a data.frame from three equal-length vectors; column names are
# assigned directly in the data.frame() call.
x <- 10:1
y <- -4:5  # the integers -4..5 (spaced so `<-` followed by `-` is not misread)
q <- c("hockey", "football", "basketball", "curling", "rugby",
       "lacrosse", "basketball", "tennis", "cricket", "soccer")
# stringsAsFactors is pinned to FALSE so the string column stays character
# regardless of the R version's default, matching the output recorded below.
(theDF <- data.frame(First = x, second = y, sport = q,
                     stringsAsFactors = FALSE))
##    First second      sport
## 1     10     -4     hockey
## 2      9     -3   football
## 3      8     -2 basketball
## 4      7     -1    curling
## 5      6      0      rugby
## 6      5      1   lacrosse
## 7      4      2 basketball
## 8      3      3     tennis
## 9      2      4    cricket
## 10     1      5     soccer
# Dimensions and names of the data.frame.
nrow(theDF)
## [1] 10
ncol(theDF)
## [1] 3
dim(theDF)
## [1] 10  3
names(theDF)
## [1] "First"  "second" "sport"
rownames(theDF)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
# First n rows; tail() without n shows the last six rows.
head(theDF, n = 6)
##   First second      sport
## 1    10     -4     hockey
## 2     9     -3   football
## 3     8     -2 basketball
## 4     7     -1    curling
## 5     6      0      rugby
## 6     5      1   lacrosse
tail(theDF)
##    First second      sport
## 5      6      0      rugby
## 6      5      1   lacrosse
## 7      4      2 basketball
## 8      3      3     tennis
## 9      2      4    cricket
## 10     1      5     soccer
# Rebuild the data.frame with capitalised column names.
theDF <- data.frame(First = x, Second = y, Sport = q,
                    stringsAsFactors = FALSE)
theDF[, "Sport"]
##  [1] "hockey"     "football"   "basketball" "curling"    "rugby"      "lacrosse"   "basketball"
##  [8] "tennis"     "cricket"    "soccer"
# Selecting a single column drops to a plain vector (character here).
class(theDF[, "Sport"])
## [1] "character"
# drop = FALSE keeps the one-column result as a data.frame.
class(theDF[, "Sport", drop = FALSE])
## [1] "data.frame"
# model.matrix() turns a factor into a set of indicator (dummy) variables.
# The `levels` argument fixes the level order (red, blue, green) instead of
# the default sorted order; `ordered = TRUE` additionally marks the factor
# as ordinal.
(newFactor <- factor(c("red", "blue", "green", "green"),
                     levels = c("red", "blue", "green"),
                     ordered = TRUE))
## [1] red   blue  green green
## Levels: red < blue < green
# `- 1` removes the intercept, so every level gets its own 0/1 column.
# Columns appear in the order of the factor's levels.
model.matrix(~ newFactor - 1)
##   newFactorred newFactorblue newFactorgreen
## 1            1             0              0
## 2            0             1              0
## 3            0             0              1
## 4            0             0              1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$newFactor
## [1] "contr.poly"
# list
# A list can hold any number of items, and each item may be of any type.
# Here: a two-element list of integer vectors, created and shown in one step.
(list3 <- list(1:3, 3:7))
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] 3 4 5 6 7
# Items of different kinds can sit side by side in one list: a data.frame,
# an integer vector, and another list.
list5 <- list(theDF, 1:10, list3)
list5
## [[1]]
##    First Second      Sport
## 1     10     -4     hockey
## 2      9     -3   football
## 3      8     -2 basketball
## 4      7     -1    curling
## 5      6      0      rugby
## 6      5      1   lacrosse
## 7      4      2 basketball
## 8      3      3     tennis
## 9      2      4    cricket
## 10     1      5     soccer
## 
## [[2]]
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## [[3]]
## [[3]][[1]]
## [1] 1 2 3
## 
## [[3]][[2]]
## [1] 3 4 5 6 7
# Pre-allocate a list of a given length; every slot starts out as NULL.
(emptyList <- vector("list", 4))
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
# Single brackets keep the container: the result is a one-element list.
list3[2]
## [[1]]
## [1] 3 4 5 6 7
# Double brackets extract the element itself: here, the integer vector.
list3[[2]]
## [1] 3 4 5 6 7
# Number of top-level elements in the list.
length(list3)
## [1] 2
# matrix
# Matrices are filled column by column; only the row count is given and the
# column count follows from the data length.
A <- matrix(1:10, nrow = 5)
A
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
B <- matrix(21:30, nrow = 5)
B
##      [,1] [,2]
## [1,]   21   26
## [2,]   22   27
## [3,]   23   28
## [4,]   24   29
## [5,]   25   30
C <- matrix(21:40, nrow = 2)
C
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]   21   23   25   27   29   31   33   35   37    39
## [2,]   22   24   26   28   30   32   34   36   38    40
# A %*% B   (left commented out: A and B are both 5 x 2, so they cannot be
#            multiplied as they are)
# Transposing B makes the shapes conformable: (5 x 2) %*% (2 x 5) -> 5 x 5.
A %*% t(B)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]  177  184  191  198  205
## [2,]  224  233  242  251  260
## [3,]  271  282  293  304  315
## [4,]  318  331  344  357  370
## [5,]  365  380  395  410  425
# No dimnames were supplied, so both accessors return NULL.
rownames(A)
## NULL
colnames(B)
## NULL
# array: a multidimensional vector whose elements all share one type.
# dim = c(2, 3, 2) gives 2 rows x 3 columns x 2 slices, filled in order.
(theArray <- array(1:12, dim = c(2, 3, 2)))
## , , 1
## 
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6
## 
## , , 2
## 
##      [,1] [,2] [,3]
## [1,]    7    9   11
## [2,]    8   10   12