Chapter 12 Probability Distributions
Lander's chapter 14 Probability Distributions
#####################################
#chapter14 probability distributions#
#####################################
library(tidyverse)
library(reshape2)
#normal distribution
rnorm(n=10)
## [1] 0.10742845 1.47572316 0.87780092 1.06053657 0.73220495 3.55942400 0.16369035 0.08238296
## [9] 0.02358518 0.53158209
rnorm(n=10, mean = 100, sd=20)
## [1] 92.02657 126.03525 117.80587 92.62149 109.18156 88.88660 105.99909 114.34756 61.99287
## [10] 65.69723
randNorm10 <- rnorm(10)
randNorm10
## [1] 0.8488954 -0.4783605 -1.5271801 0.2886871 -1.7713282 1.5400277 0.5953859 -0.3749823
## [9] -1.6476411 -0.4338485
dnorm(randNorm10)
## [1] 0.27824584 0.35581194 0.12429742 0.38265990 0.08309753 0.12187234 0.33414485 0.37185756
## [9] 0.10266345 0.36310951
#pdf of -1; mean=0, sd=1
#mathematically impossible
dnorm(-1, 0, 1)
## [1] 0.2419707
dnorm(c(-1, 0, 1))
## [1] 0.2419707 0.3989423 0.2419707
randNorm <- rnorm(30000)
randDensity <- dnorm(randNorm)
require(ggplot2)
ggplot(data.frame(x=randNorm, y=randDensity))+
aes(x=x, y=y)+
geom_point()+
labs(x="Random Normal Variables", y="Density")
max(randNorm)
## [1] 3.974804
dnorm(4.5)
## [1] 1.598374e-05
1-pnorm(4.5)
## [1] 3.397673e-06
randNorm10
## [1] 0.8488954 -0.4783605 -1.5271801 0.2886871 -1.7713282 1.5400277 0.5953859 -0.3749823
## [9] -1.6476411 -0.4338485
pnorm(randNorm10)
## [1] 0.80203026 0.31619681 0.06335812 0.61358960 0.03825307 0.93822320 0.72420722 0.35383681
## [9] 0.04971317 0.33219921
pnorm
## function (q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
## .Call(C_pnorm, q, mean, sd, lower.tail, log.p)
## <bytecode: 0x10f85fe90>
## <environment: namespace:stats>
pnorm(c(-3,0,3))
## [1] 0.001349898 0.500000000 0.998650102
pnorm(-1)
## [1] 0.1586553
#left-tailed by default
pnorm(1)-pnorm(0)
## [1] 0.3413447
pnorm(0)-pnorm(-1)
## [1] 0.3413447
pnorm(1)-pnorm(-1)
## [1] 0.6826895
#plot pnorm(-1)
p <- ggplot(data.frame(x=randNorm, y=randDensity))+
aes(x=x, y=y)+
geom_line()+
labs(x="x", y="Density")
p
neg1Seq <- seq(from=min(randNorm), to=-1, by=0.1)
lessThanNeg1 <- data.frame(x=neg1Seq, y=dnorm(neg1Seq))
head(lessThanNeg1)
## x y
## 1 -4.218679 5.448609e-05
## 2 -4.118679 8.266641e-05
## 3 -4.018679 1.241737e-04
## 4 -3.918679 1.846661e-04
## 5 -3.818679 2.718953e-04
## 6 -3.718679 3.963450e-04
lessThanNeg1 <- rbind(c(min(randNorm), 0),
lessThanNeg1,
c(max(lessThanNeg1$x),0))
p+geom_polygon(data = lessThanNeg1, aes(x=x,y=y))
#plot pnorm(1)-pnorm(-1)
neg1Pos1Seq <- seq(from=-1, to=1, by=0.1)
neg1To1 <- data.frame(x=neg1Pos1Seq, y=dnorm(neg1Pos1Seq))
head(neg1To1)
## x y
## 1 -1.0 0.2419707
## 2 -0.9 0.2660852
## 3 -0.8 0.2896916
## 4 -0.7 0.3122539
## 5 -0.6 0.3332246
## 6 -0.5 0.3520653
neg1To1 <- rbind(c(min(neg1To1$x),0),
neg1To1,
c(max(neg1To1$x),0))
#try the following codes:
#geom_polygon connects points in order:
#neg1To1 <- rbind(neg1To1,
# c(min(neg1To1$x),0),
# c(max(neg1To1$x),0))
p+geom_polygon(data=neg1To1, aes(x=x,y=y))
randProb <- pnorm(randNorm)
ggplot(data.frame(x=randNorm, y=randProb))+
aes(x=x, y=y)+
geom_point()+
labs(x="Random Normal Variables", y="Probability")
#note the position of aes()
ggplot(data.frame(x=randNorm, y=randProb), aes(x=x, y=y))+
geom_point()+
labs(x="Random Normal Variables", y="Probability")
qnorm(pnorm(randNorm10))
## [1] 0.8488954 -0.4783605 -1.5271801 0.2886871 -1.7713282 1.5400277 0.5953859 -0.3749823
## [9] -1.6476411 -0.4338485
all.equal(randNorm10, qnorm(pnorm(randNorm10)))
## [1] TRUE
#binomial distribution
rbinom(n=1, size = 10, prob = 0.4)
## [1] 2
?rbinom
#n>1, R generates the number of successes for each of the n sets of size trails
#n: number of observations/repeats; size: number of trials(sample size)
rbinom(5, 10, 0.4)
## [1] 5 3 5 5 6
rbinom(10, 10, 0.4)
## [1] 1 4 5 5 5 6 3 2 5 5
#Bernoulli
rbinom(n=1, size=1, prob = .4)
## [1] 0
rbinom(5, 1, .4)
## [1] 0 0 0 0 0
rbinom(10, 1, .4)
## [1] 1 0 1 1 0 1 1 1 1 1
#logit regression
#outcome = beta * indicator
#log(P(outcome=1)/1-P(outcome=1)) = beta*indicator
#exp(beta)/1+exp(beta)
binomData <- data.frame(Successes=rbinom(10000, 10, 0.3))
head(binomData)
## Successes
## 1 5
## 2 0
## 3 0
## 4 4
## 5 6
## 6 3
ggplot(binomData, aes(x=Successes))+
geom_histogram(binwidth = 1)
binom5 <- data.frame(Successes=rbinom(10000, 5, .3), size=5)
dim(binom5)
## [1] 10000 2
head(binom5)
## Successes size
## 1 2 5
## 2 3 5
## 3 3 5
## 4 2 5
## 5 1 5
## 6 2 5
binom10 <- data.frame(Successes=rbinom(10000, 10, .3), size=10)
head(binom10)
## Successes size
## 1 1 10
## 2 4 10
## 3 3 10
## 4 3 10
## 5 1 10
## 6 6 10
binom100 <- data.frame(Successes=rbinom(10000, size = 100, prob = 0.3), size=100)
binom1000 <- data.frame(Successes=rbinom(10000, 1000, .3), size=1000)
binomAll <- rbind(binom5, binom10, binom100, binom1000)
dim(binomAll)
## [1] 40000 2
tail(binomAll)
## Successes size
## 39995 310 1000
## 39996 298 1000
## 39997 297 1000
## 39998 295 1000
## 39999 313 1000
## 40000 298 1000
ggplot(binomAll, aes(x=Successes))+
geom_histogram()+
facet_wrap(~size, scales = "free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dbinom(3, 10, .3)
## [1] 0.2668279
pbinom(3, 10, .3)
## [1] 0.6496107
dbinom(1:10, 10, .3)
## [1] 0.1210608210 0.2334744405 0.2668279320 0.2001209490 0.1029193452 0.0367569090 0.0090016920
## [8] 0.0014467005 0.0001377810 0.0000059049
pbinom(1:10, 10, .3)
## [1] 0.1493083 0.3827828 0.6496107 0.8497317 0.9526510 0.9894079 0.9984096 0.9998563 0.9999941
## [10] 1.0000000
qbinom(p=0.2668279, 10, 0.3)
## [1] 2
dbinom(3, 10, 0.3)
## [1] 0.2668279
qbinom(p=c(.3, .35, .4, .5, .6), 10, .3)
## [1] 2 2 3 3 3
qnorm(0.975, 0, 1)
## [1] 1.959964
#poisson model
# E(dependent variable)= beta * IV
#Poisson distribution
pois1 <- rpois(10000, 1)
hist(pois1)
pois2 <- rpois(10000, 2)
max(pois2)
## [1] 9
pois5 <- rpois(10000,5)
pois10 <- rpois(10000,10)
pois20 <- rpois(10000,20)
pois <- data.frame(Lambda.1=pois1, Lambda.2=pois2, Lambda.5=pois5,
Lambda.10=pois10, Lambda.20=pois20)
head(pois)
## Lambda.1 Lambda.2 Lambda.5 Lambda.10 Lambda.20
## 1 1 2 4 12 22
## 2 1 2 5 5 19
## 3 2 3 8 10 20
## 4 2 1 4 8 29
## 5 0 2 6 8 14
## 6 3 4 6 7 13
dim(pois)
## [1] 10000 5
library(reshape2)
pois <- melt(data = pois, variable.name = "Lambda", value.name = "x")
## No id variables; using all as measure variables
head(pois)
## Lambda x
## 1 Lambda.1 1
## 2 Lambda.1 1
## 3 Lambda.1 2
## 4 Lambda.1 2
## 5 Lambda.1 0
## 6 Lambda.1 3
tail(pois)
## Lambda x
## 49995 Lambda.20 15
## 49996 Lambda.20 22
## 49997 Lambda.20 21
## 49998 Lambda.20 21
## 49999 Lambda.20 15
## 50000 Lambda.20 19
dim(pois)
## [1] 50000 2
#"+" keeps all numbers
pois$Lambda <- as.factor(as.numeric(str_extract(string = pois$Lambda,
pattern = "\\d+")))
head(pois)
## Lambda x
## 1 1 1
## 2 1 1
## 3 1 2
## 4 1 2
## 5 1 0
## 6 1 3
tail(pois)
## Lambda x
## 49995 20 15
## 49996 20 22
## 49997 20 21
## 49998 20 21
## 49999 20 15
## 50000 20 19
ggplot(pois, aes(x=x))+
geom_histogram(binwidth = 1)+
facet_wrap(~Lambda)+
ggtitle("Probability Mass Function")
ggplot(pois, aes(x=x))+
geom_density(aes(group=Lambda, color=Lambda, fill=Lambda),
adjust=4, alpha=.5)+
#scale_color_discrete()+
#scale_fill_discrete()+
ggtitle("Probability Mass Function")
?geom_density
#alpha adjusts transparency
#adjust changes the bandwidch