Chapter 13 Basic Statistics
Lander's chapter 14 Basic Statistics
#t test, ANOVA
x <- sample(x=4, size = 10, replace = T)
x
## [1] 1 4 3 1 4 3 1 4 3 2
x <- sample(x=1:200, size = 100, replace = T)
x
## [1] 107 77 37 123 161 135 14 4 91 198 1 136 1 188 32 122 86 8 13 198 128 172
## [23] 39 194 168 19 65 83 13 115 98 105 37 194 52 5 150 134 24 150 148 56 29 129
## [45] 43 159 92 170 178 132 146 165 166 104 137 161 57 178 8 32 155 50 46 139 93 183
## [67] 83 99 65 76 164 99 160 40 185 98 166 37 66 107 9 185 87 142 110 14 43 30
## [89] 107 51 60 15 78 69 31 64 57 83 150 3
x <- sample(x=1:100, size = 100, replace = T)
mean(x)
## [1] 50.33
y<- x
y[sample(x=1:100, size = 20, replace = F)]<- NA
y
## [1] 1 51 100 22 NA 15 NA NA 12 18 94 26 55 NA 51 66 59 55 NA 44 95 10
## [23] NA 16 12 100 71 22 20 NA 7 58 NA 80 79 25 58 NA 22 33 39 1 12 26
## [45] 97 NA 54 77 36 97 88 62 52 NA 84 80 NA NA 100 92 NA NA 98 21 25 27
## [67] 45 56 33 38 18 32 24 34 62 34 98 1 44 NA 74 88 95 64 99 87 7 NA
## [89] NA 4 57 14 3 90 NA 10 72 69 NA 82
mean(y)
## [1] NA
mean(y, na.rm = T)
## [1] 49.7375
grades <- c(95, 72, 87, 66)
weights <- c(1/2, 1/4, 1/8, 1/8)
tem <- grades*weights
tem
## [1] 47.500 18.000 10.875 8.250
sum(tem)
## [1] 84.625
weighted.mean(x=grades, w=weights)
## [1] 84.625
mean(x)
## [1] 50.33
var(x)
## [1] 977.1526
sum((x-mean(x))^2)/(length(x)-1)
## [1] 977.1526
sd(x)
## [1] 31.25944
sqrt(var(x))
## [1] 31.25944
#sd(y)
sd(y, na.rm = T)
## [1] 31.66308
min(x)
## [1] 1
max(x)
## [1] 100
median(x)
## [1] 53.5
#median(y)
median(y, na.rm = T)
## [1] 51
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 22.00 51.00 49.74 79.25 100.00 20
quantile(x, probs = c(0.25, 0.75))
## 25% 75%
## 23.5 79.0
quantile(x)[c(2,4)]
## 25% 75%
## 23.5 79.0
#quantile(y, probs = c(0.25, 0.75))
quantile(y, probs = c(0.25, 0.75), na.rm = T)
## 25% 75%
## 22.00 79.25
quantile(x, probs = c(0.1, 0.25, 0.5, 0.75, 0.99))
## 10% 25% 50% 75% 99%
## 9.7 23.5 53.5 79.0 100.0
require(ggplot2)
head(economics)
## # A tibble: 6 x 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
cor(economics$pce, economics$psavert)
## [1] -0.7928546
#cor(economics$psavert, economics$pce)
xPart <- economics$pce-mean(economics$pce)
yPart <- economics$psavert-mean(economics$psavert)
nMinusOne <- nrow(economics)-1
xSD <- sd(economics$pce)
ySD <- sd(economics$psavert)
sum(xPart * yPart)/(nMinusOne * xSD * ySD)
## [1] -0.7928546
cor(economics[, c(2, 4:6)])
## pce psavert uempmed unemploy
## pce 1.0000000 -0.7928546 0.7269616 0.6145176
## psavert -0.7928546 1.0000000 -0.3251377 -0.3093769
## uempmed 0.7269616 -0.3251377 1.0000000 0.8693097
## unemploy 0.6145176 -0.3093769 0.8693097 1.0000000
GGally::ggpairs(economics[, c(2, 4:6)])
## plot: [1,1] [===>--------------------------------------------------------------] 6% est: 0s
## plot: [1,2] [=======>----------------------------------------------------------] 12% est: 0s
## plot: [1,3] [===========>------------------------------------------------------] 19% est: 0s
## plot: [1,4] [===============>--------------------------------------------------] 25% est: 0s
## plot: [2,1] [====================>---------------------------------------------] 31% est: 0s
## plot: [2,2] [========================>-----------------------------------------] 38% est: 0s
## plot: [2,3] [============================>-------------------------------------] 44% est: 0s
## plot: [2,4] [================================>---------------------------------] 50% est: 0s
## plot: [3,1] [====================================>-----------------------------] 56% est: 0s
## plot: [3,2] [========================================>-------------------------] 62% est: 0s
## plot: [3,3] [============================================>---------------------] 69% est: 0s
## plot: [3,4] [=================================================>----------------] 75% est: 0s
## plot: [4,1] [=====================================================>------------] 81% est: 0s
## plot: [4,2] [=========================================================>--------] 88% est: 0s
## plot: [4,3] [=============================================================>----] 94% est: 0s
## plot: [4,4] [==================================================================]100% est: 0s
?ggpairs
rm(list = ls())
require(reshape2)
require(scales)
econCor <- cor(economics[, c(2, 4:6)])
econCor
## pce psavert uempmed unemploy
## pce 1.0000000 -0.7928546 0.7269616 0.6145176
## psavert -0.7928546 1.0000000 -0.3251377 -0.3093769
## uempmed 0.7269616 -0.3251377 1.0000000 0.8693097
## unemploy 0.6145176 -0.3093769 0.8693097 1.0000000
#note the difference between the uses of variable.name and varnames
#id.vars does not give column names
ecomMelt <- melt(econCor, varnames=c("x", "y"), value.name = "Correlation")
ecomMelt <- melt(econCor, id.vars=c("x", "y"), value.name = "Correlation")
names(ecomMelt)
## [1] "Var1" "Var2" "Correlation"
#Decreasing=F by default
ecomMelt <- ecomMelt[order(ecomMelt$Correlation),]
ecomMelt
## Var1 Var2 Correlation
## 2 psavert pce -0.7928546
## 5 pce psavert -0.7928546
## 7 uempmed psavert -0.3251377
## 10 psavert uempmed -0.3251377
## 8 unemploy psavert -0.3093769
## 14 psavert unemploy -0.3093769
## 4 unemploy pce 0.6145176
## 13 pce unemploy 0.6145176
## 3 uempmed pce 0.7269616
## 9 pce uempmed 0.7269616
## 12 unemploy uempmed 0.8693097
## 15 uempmed unemploy 0.8693097
## 1 pce pce 1.0000000
## 6 psavert psavert 1.0000000
## 11 uempmed uempmed 1.0000000
## 16 unemploy unemploy 1.0000000
m <- c(9,9,NA,3,NA,5,8,1,10,4)
n <- c(2,NA,1,6,6,4, 1,1,6,7)
p <- c(8,4,3,9,10,NA,3,NA,9,9)
q <- c(10,10,7,8,4,2,8,5,5,2)
r <- c(1,9,7,6,5,6,2,7,9,10)
theMat <- cbind(m,n,p,q,r)
dim(theMat)
## [1] 10 5
cor(theMat, use = "everything") #entirety of all columns must be free of NAs
## m n p q r
## m 1 NA NA NA NA
## n NA 1 NA NA NA
## p NA NA 1 NA NA
## q NA NA NA 1.0000000 -0.4242958
## r NA NA NA -0.4242958 1.0000000
#cor(theMat, use = "all.obs")
cor(theMat, use = "complete.obs")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
#keep only rows where every entry is not NA
cor(theMat, use = "complete.obs")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
cor(theMat, use = "na.or.complete")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
cor(theMat[c(1,4,7,9,10),])
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
identical(cor(theMat, use = "complete.obs"), cor(theMat[c(1,4,7,9,10),]))
## [1] TRUE
cor(theMat, use = "complete.obs")
## m n p q r
## m 1.0000000 -0.5228840 -0.2893527 0.2974398 -0.3459470
## n -0.5228840 1.0000000 0.8090195 -0.7448453 0.9350718
## p -0.2893527 0.8090195 1.0000000 -0.3613720 0.6221470
## q 0.2974398 -0.7448453 -0.3613720 1.0000000 -0.9059384
## r -0.3459470 0.9350718 0.6221470 -0.9059384 1.0000000
cor(theMat[,c("m","n")], use = "complete.obs")
## m n
## m 1.00000000 -0.02511812
## n -0.02511812 1.00000000
data(tips, package = "reshape2")
head(tips)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4
GGally::ggpairs(tips)
##
plot: [1,1] [>-----------------------------------------------------------------] 2% est: 0s
plot: [1,2] [==>---------------------------------------------------------------] 4% est: 1s
plot: [1,3] [===>--------------------------------------------------------------] 6% est: 2s
plot: [1,4] [====>-------------------------------------------------------------] 8% est: 2s
plot: [1,5] [======>-----------------------------------------------------------] 10% est: 2s
plot: [1,6] [=======>----------------------------------------------------------] 12% est: 2s
plot: [1,7] [========>---------------------------------------------------------] 14% est: 2s
plot: [2,1] [==========>-------------------------------------------------------] 16% est: 2s
plot: [2,2] [===========>------------------------------------------------------] 18% est: 2s
plot: [2,3] [============>-----------------------------------------------------] 20% est: 2s
plot: [2,4] [==============>---------------------------------------------------] 22% est: 2s
plot: [2,5] [===============>--------------------------------------------------] 24% est: 2s
plot: [2,6] [=================>------------------------------------------------] 27% est: 2s
plot: [2,7] [==================>-----------------------------------------------] 29% est: 2s
plot: [3,1] [===================>----------------------------------------------] 31% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [3,2] [=====================>--------------------------------------------] 33% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [3,3] [======================>-------------------------------------------] 35% est: 2s
plot: [3,4] [=======================>------------------------------------------] 37% est: 2s
plot: [3,5] [=========================>----------------------------------------] 39% est: 2s
plot: [3,6] [==========================>---------------------------------------] 41% est: 2s
plot: [3,7] [===========================>--------------------------------------] 43% est: 2s
plot: [4,1] [=============================>------------------------------------] 45% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [4,2] [==============================>-----------------------------------] 47% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [4,3] [===============================>----------------------------------] 49% est: 2s
plot: [4,4] [=================================>--------------------------------] 51% est: 2s
plot: [4,5] [==================================>-------------------------------] 53% est: 2s
plot: [4,6] [===================================>------------------------------] 55% est: 2s
plot: [4,7] [=====================================>----------------------------] 57% est: 1s
plot: [5,1] [======================================>---------------------------] 59% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [5,2] [=======================================>--------------------------] 61% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [5,3] [=========================================>------------------------] 63% est: 1s
plot: [5,4] [==========================================>-----------------------] 65% est: 1s
plot: [5,5] [===========================================>----------------------] 67% est: 1s
plot: [5,6] [=============================================>--------------------] 69% est: 1s
plot: [5,7] [==============================================>-------------------] 71% est: 1s
plot: [6,1] [===============================================>------------------] 73% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [6,2] [=================================================>----------------] 76% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [6,3] [==================================================>---------------] 78% est: 1s
plot: [6,4] [====================================================>-------------] 80% est: 1s
plot: [6,5] [=====================================================>------------] 82% est: 1s
plot: [6,6] [======================================================>-----------] 84% est: 1s
plot: [6,7] [========================================================>---------] 86% est: 1s
plot: [7,1] [=========================================================>--------] 88% est: 1s
plot: [7,2] [==========================================================>-------] 90% est: 0s
plot: [7,3] [============================================================>-----] 92% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [7,4] [=============================================================>----] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [7,5] [==============================================================>---] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [7,6] [================================================================>-] 98% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
plot: [7,7] [==================================================================]100% est: 0s
#covariance
cov(economics$pce, economics$psavert)
## [1] -8359.069
cov(economics[, c(2, 4:6)])
## pce psavert uempmed unemploy
## pce 12650851.944 -8359.069071 10618.386190 5774578.978
## psavert -8359.069 8.786360 -3.957847 -2422.805
## uempmed 10618.386 -3.957847 16.864531 9431.652
## unemploy 5774578.978 -2422.805358 9431.652268 6979948.309
identical(cov(economics$pce, economics$psavert),
cor(economics$pce, economics$psavert)*
sd(economics$pce)*sd(economics$psavert))
## [1] TRUE
head(tips)
## total_bill tip sex smoker day time size
## 1 16.99 1.01 Female No Sun Dinner 2
## 2 10.34 1.66 Male No Sun Dinner 3
## 3 21.01 3.50 Male No Sun Dinner 3
## 4 23.68 3.31 Male No Sun Dinner 2
## 5 24.59 3.61 Female No Sun Dinner 4
## 6 25.29 4.71 Male No Sun Dinner 4
unique(tips$sex)
## [1] Female Male
## Levels: Female Male
unique(tips$day)
## [1] Sun Sat Thur Fri
## Levels: Fri Sat Sun Thur
#h0: the average tip is equal to 2.5
mean(tips$tip)
## [1] 2.998279
#leave question
t.test(tips$tip, alternative = "two.sided", mu = 2.5)
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
## 2.823799 3.172758
## sample estimates:
## mean of x
## 2.998279
dim(tips)
## [1] 244 7
z <- (mean(tips$tip)-2.5)/(sd(tips$tip)/sqrt(length(tips$tip)))
(1-pnorm(z, 0, 1))*2
## [1] 1.851998e-08
randT <- rt(30000, df=nrow(tips)-1)
NROW(tips)
## [1] 244
nrow(tips)
## [1] 244
tipTTest <- t.test(tips$tip, alternative = "two.sided", mu=2.5)
ggplot(data.frame(x=randT))+
geom_density(aes(x=x), fill="grey", color="grey")+
geom_vline(xintercept = tipTTest$statistic)+
geom_vline(xintercept = mean(randT)+c(-2,2)*sd(randT), linetype=2)
names(tipTTest)
## [1] "statistic" "parameter" "p.value" "conf.int" "estimate" "null.value"
## [7] "stderr" "alternative" "method" "data.name"
tipTTest$statistic
## t
## 5.625287
#H0: t is greater than 2.5
t.test(tips$tip, alternative="greater", mu=2.5)
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 2.54e-08
## alternative hypothesis: true mean is greater than 2.5
## 95 percent confidence interval:
## 2.852023 Inf
## sample estimates:
## mean of x
## 2.998279
#two-sample t-text
#a traditional t-text requires same variance
#Welch two-sample t-test can handle differing variances
aggregate(tip ~ sex, data = tips, var)
## sex tip
## 1 Female 1.344428
## 2 Male 2.217424
#test for normality
shapiro.test(tips$tip)
##
## Shapiro-Wilk normality test
##
## data: tips$tip
## W = 0.89781, p-value = 8.2e-12
shapiro.test(tips$tip[tips$sex=="Male"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Male"]
## W = 0.87587, p-value = 3.708e-10
shapiro.test(tips$tip[tips$sex=="Female"])
##
## Shapiro-Wilk normality test
##
## data: tips$tip[tips$sex == "Female"]
## W = 0.95678, p-value = 0.005448
hist(tips$tip)
#all the tests fail so inspect visually
#H0: normality
ggplot(tips, aes(x=tip, fill=sex))+
geom_histogram(binwidth=0.5, alpha=0.5) #alpha: transparency
#neither the standard F-test (var.text) nor Bartlett test (bartlett.text) will suffice
#nonparametric Ansari-Bradley to examine equality of variances
ansari.test(tip~sex, tips)
##
## Ansari-Bradley test
##
## data: tip by sex
## AB = 5582.5, p-value = 0.376
## alternative hypothesis: true ratio of scales is not equal to 1
#variances are equal
#so can apply standard two-sample t-test
t.test(tip~sex, data = tips, var.equal=T)
##
## Two Sample t-test
##
## data: tip by sex
## t = -1.3879, df = 242, p-value = 0.1665
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6197558 0.1074167
## sample estimates:
## mean in group Female mean in group Male
## 2.833448 3.089618
#female and male are tipped roughly equally
#var.equal=FALSE (the default) would run the Welch test
names(tips)
## [1] "total_bill" "tip" "sex" "smoker" "day" "time" "size"
require(plyr)
tipSummary <-
ddply(tips, "sex", summarize,
tip.mean=mean(tip), tip.sd=sd(tip),
Lower=tip.mean-2*tip.sd/sqrt(NROW(tip)),
Upper=tip.mean+2*tip.sd/sqrt(NROW(tip)))
str(tips)
## 'data.frame': 244 obs. of 7 variables:
## $ total_bill: num 17 10.3 21 23.7 24.6 ...
## $ tip : num 1.01 1.66 3.5 3.31 3.61 4.71 2 3.12 1.96 3.23 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 2 2 2 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ day : Factor w/ 4 levels "Fri","Sat","Sun",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ time : Factor w/ 2 levels "Dinner","Lunch": 1 1 1 1 1 1 1 1 1 1 ...
## $ size : int 2 3 3 2 4 4 2 4 2 2 ...
nrow(tips)
## [1] 244
nrow(tips$tip)
## NULL
NROW(tips$tip)#TREAT a vector as 1-column matrix
## [1] 244
?NROW
ggplot(tipSummary, aes(x=tip.mean, y=sex))+
geom_point()+
geom_errorbarh(aes(xmin=Lower, xmax=Upper),
height=.2)
names(tipSummary)
## [1] "sex" "tip.mean" "tip.sd" "Lower" "Upper"
#geom_errorbar vs. geom_errorbarh: be careful with x,y limits
#paired two-sample t-test: observations in pairs are correlated
#vs. independent two-sample t-test