CTRL + R
CTRL + ENTER
help.start()
or in the menuadd_count()
function
help.search("add_count")
add_count()
function (Explain
help file structure!)
?add_count
and help("add_count")
getwd()
: Display working directorysetwd()
: Set working directorydir()
: Display files in working directory\
\\
instead or /
in
directory paths (see below)#
: Start comment that is not evaluated# Example
getwd()
## [1] "C:/Users/Paul/Google Drive/2-Teaching/2022_R_Workshop"
a <- 10
or a = 10
: Assign something to
an objecta
: Type name to display content of object
a
rm()
: Delete object from workspacesave(a, b, file = "myobjects.RData")
: Save single
objects a
and b
as .RData
fileload("myobjects.RData")
: Load saved objectsls()
: Display workspace content# Comments in the script start with #
# Everything after # in the line is ignored by R
# Send code with CTRL + R
5+5
getwd() # Display working directory
# Generate a new folder on your hardrive called "2016_03_R_course_EUI"
# Use this folder as your working directory, i.e. to store the downloaded files in there
# If you need the path to your folder, copy it from the path field in the explorer (in windows)
# The course files are available here: https://drive.google.com/folderview?id=0Bwer5wQoreiuMTRueFBGdDllNEk&usp=sharing
# Set the your working directory to your new folder
setwd("C:/Users/paul/Google Drive/Teaching/2016_03_R_course_EUI")
dir() # Display content of working directory
a <- 50 # Generate object that contains the number 50
a # Show content of object
a = 70
a
ls() # Display objects in workspace
# Create a more complicated object
thesenseoflife <- c("worshipping god","having fun","no idea")
interesting <- thesenseoflife # define another object
interesting
# Q: How can I store the object "interesting" in the object "boring"?
# Objectnames do not have blanks
# Names should make sense and be systematic
# e.g. for dummy variables: d.male.voter
# e.g. variance of variables: var.income
a <- 1
b <- 2
# Save objects a and b in the file "objects-a-and-b.RData"
# in the working directory
save(a,b,file="objects-a-and-b.RData")
# Delete objects a and b
rm(a,b)
rm(list=ls())
ls() # display all objects
load("objects-a-and-b.RData") # load objects from wd
# Check if they were loaded
a
b
getwd()
hundert <- 100
xyz <- 250
hundert
ls()
save(hundert,xyz,file="workspace.RData")
rm(hundert,xyz)
load("workspace.RData")
ls()
+ - * / ^
& | == != > < >= <=
?function
)
exp(x)=e^x log(x) log10(x) sin(x) cos(x) tan(x)
abs(x) sqrt(x) ceiling(x) floor(x) trunc(x) round(x, digits=n)
ceiling()
function?# Calculations
Ergebnis <- (23+24)*11/(18+15)*5
Ergebnis
# Functions
log(2) # exp(1) = 2.718282^0.6931472
x <- seq(1,4,0.3)
ceiling(x)
floor(x)
##########################
# Comparisons.. examples #
##########################
x <- -3:3
x
# Equals
x == 0
# is smaller than
x < 0
# is bigger than or equal to
x >= 0
# unequal
x != 0
# unequal(equal to 0)
!(x == 0)
# bigger than -1 and smaller than 1
x > -1 & x < 1
# bigger than 1 and smaller than -1
x > 1 & x < -1
# bigger than 1 or smaller than -1
x > 1 | x < -1
mean()
function it is possible to use and
additional argument called trim
. Find out what this
argument does by checking the help file of the mean()
function and write it into your script.# 2.
((3+4-5)-9)^2
-99/33+42
log(1)
(sqrt(2))^2
# 3.
5==7
5*5>=6*4
sqrt(3)!=cos(17)
# 4.
# help(mean)
# ?mean
# the fraction (0 to 0.5) of observations to be trimmed from each end of x
# before the mean is computed.
Q: What is your experience with looking at data analysis code you have written 2 years earlier?
Comment your code
Use meaningful names!
trst
vs. trust
# Data import ####
styler
addin if you wantclass()
: Query object type"tbl_df"
): Newer type of dataframe (see
?
tbl_df-class``)as.numeric()
## [1] "integer"
## [1] 1 2 3 4 5 6 7 8 9
## [1] "numeric"
## [1] 1.3 2.4 3.5
## [1] "logical"
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [1] "character"
## [1] "a" "b" "c" "d" "f"
## [1] "matrix" "array"
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 11 12 13
## [1] "array"
## , , 1
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 6 11 16 21
## [2,] 2 7 12 17 22
## [3,] 3 8 13 18 23
## [4,] 4 9 14 19 24
## [5,] 5 10 15 20 25
##
## , , 2
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 26 31 36 41 46
## [2,] 27 32 37 42 47
## [3,] 28 33 38 43 48
## [4,] 29 34 39 44 49
## [5,] 30 35 40 45 50
## [1] "list"
## $a
## [1] 4 5 6 7 8
##
## $b
## [1] 1 2 3
##
## $c
## [1] "Ger" "FR" "It" "SE"
## [1] "data.frame"
## Fertility Agriculture Examination
## Courtelary 80.2 17.0 15
## Delemont 83.1 45.1 6
## Franches-Mnt 92.5 39.7 5
## Moutier 85.8 36.5 12
## Neuveville 76.9 43.5 17
## Porrentruy 76.1 35.3 9
## Broye 83.8 70.2 16
## Glane 92.4 67.8 14
## Gruyere 82.4 53.3 12
## Sarine 82.9 45.2 16
## [1] "tbl_df" "tbl" "data.frame"
## # A tibble: 47 x 6
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 80.2 17 15 12 9.96 22.2
## 2 83.1 45.1 6 9 84.8 22.2
## 3 92.5 39.7 5 5 93.4 20.2
## 4 85.8 36.5 12 7 33.8 20.3
## 5 76.9 43.5 17 15 5.16 20.6
## 6 76.1 35.3 9 7 90.6 26.6
## 7 83.8 70.2 16 7 92.8 23.6
## 8 92.4 67.8 14 8 97.2 24.9
## 9 82.4 53.3 12 7 97.7 21
## 10 82.9 45.2 16 13 91.4 24.4
## # ... with 37 more rows
## tibble [47 x 6] (S3: tbl_df/tbl/data.frame)
## $ Fertility : num [1:47] 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num [1:47] 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int [1:47] 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int [1:47] 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num [1:47] 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num [1:47] 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
## [1] "factor"
## [1] 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5
## Levels: 1 2 3 4 5
## [1] "ordered" "factor"
## [1] low medium high low medium high low medium high low
## [11] medium high low medium high low medium high low medium
## [21] high low medium high low medium high low medium high
## Levels: low < medium < high
## [1] "function"
## function(){x^2}
## [1] "function"
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x000002c2a8b706d0>
## <environment: namespace:base>
"integer"
, "numeric"
,
"logical"
, "character"
TRUE
and FALSE
c(1,2,3) >= 2
results in
FALSE TRUE TRUE
c("Markus", "Matthias", "David", "Till")
gives the
vector "Markus" "Matthias" "David" "Till"
names(object) <- charaktervektor
: Name more complex
dataclasses, e.g. a dataframec()
: “concatenate”,
e.g. c(1.2,"Hans",5.0,6.7)
length()
: Get vector length:
: indicates from/to for numbersrep("Peter",2)
: Repeat "Peter"
two
timesseq(5,8)
: Sequence from 5 to 8vector[positions]
[1] 1.20 3.50 5.00 6.70 8.00 10.00 13.55
[1]
= Position of the first element displayed in that
line in the vector (show it!)Inf
and -Inf
: Positive and negative
infinityNaN
: “Not a number”, e.g. calculate
0/0
NA
: Missing valuerbind()
: Combine vectors line-by-linecbind()
: Combine vectors column-by-column# Generate two vectors
x <- 10:19
x
y <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
y
# seq(0,18,2)
z <- c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
z
# Access vector elements
x[1] # 1st element of x
x[1:5] # First 5 elements of x
x[-(2:6)] # All elements but not positions 2 to 6
z[4] # ?
x[y]
x[x< 16 | x >=12]
# Add names to elements of a vector
names(x)
names(x) <- z
# or like this
another.vector <- c(a=1,b=2)
x
names(x)
# Combine vectors rowwise and columnwise
rbind(x,y)
cbind(y,z)
# Q: What did R do to the vectors? What class do they have?
# Access with names
names(x)
x[c("a","c")]
x
of length 404 in which the numbers
199 to 400 are repeated two times. Generate a new vector y
,
that contains the elements of x
on the positions 53, 78 and
99.Freunde
that contains the
names of your three best friends.# 2.
x <- rep(1:5,10)
x
# 3.
x <- rep(199:400,2)
x[x>=240]
length(x)
y <- x[c(53, 78, 99)]
y
# 4.
freunde <- c("Platon", "Aristoteles", "Corbyn")
x
of length 101, that starts with the
number 33 and ends with the number 133.y
. Extract the elements on position 1 to 25 and
save them in a new vector z
. Join the two vectors both
column by column and subsequently line by line (rows!) and save the
results in two new objects colyz
and colyz
.
What class do the last two objects that you created posses?x
) that are smaller than 57
or greater/equal than 83 and save them in a new object
subgroup
.# Vectors
# 1
x <- 33:133
# 2.
y <- x[26:50]
z <- x[1:25]
yz <- cbind(y,z)
class(yz)
zy <- rbind(y,z)
zy
class(zy)
# cbind(y,z)[1:3,1:2]
# rbind(y,z)[1:2,1:5] # Gleiche Anzahl!
# class(cbind(y,z))
# 3.
subgroup <- x[x<57 | x>=83]
subgroup
"factor"
factor(..., levels = c(...))
: Create an unordered
factor with levels ...
factor(..., ordered = TRUE)
: Create an ordered factor
levels=c(...), labels=c(...)
: Specify
levels/labelslevels()
: Display categoriesas.numeric()
: Convert factor to numeric vector"list"
list()
: Create a list (can also be dataframe!)list$switzerland
: Access element
switzerland
of the list list
list[2]
: Access second element of the list
list
list[[2]]
: Access content of second element of the list
list
# Q: How do i get help for the function factor()?
# Create factor (nominal variable)
x <- factor(rep(1:2,10),levels=c(1,2), labels=c("SPD", "CDU"))
# Q: How do I go about to understand what happens in the function?
x
levels(x)
as.numeric(x)
# Create factor (ordinal variable)
y <- factor(rep(1:3,10),levels=c(1,2,3), labels=c("low", "medium", "high"))
y
as.numeric(y)
as.character(y)
# Create a list
participants <- list(Teacher= "Rudi",
Women = c("Daniela","Johanna"),
Men = c("Simon", "Peter", "usw."))
participants
length(participants)
# Access elements or subsets of that list
participants$Teacher
participants[1]
participants[[1]]
participants[["Teacher"]]
# Q: How can I access the list element "Women"?
# Q: How can i access the Johanna who is in the element "Women"?
mylist
with two elements. The first
element first
contains the numbers 5 to 105. The second
element second
contains the numbers -1 to -50. Create
another vector x
that contains the 70th value of the first
element of mylist
and the 30th element of the second
element of mylist
.anotherlist
with four elements: A, B, C,
D. A contains the number 2. B contains a vector with the number 1 to 10.
C contains a character vector with the names “Bernd” “Julia” “Peter”. D
contains a vector with the numbers 1 to 100.names
.D
of the list and save them in an object names
xyz
.test <- rep(1:10,10)
. Convert test
to an
ordered factor. Check which categories the factor has and whether it’s
ordered.# 1.
mylist <- list(first=c(5:105),second =c(-1:-50))
mylist
x <- c(mylist$first[70], mylist$second[30])
x
# 2.
A <- 2
anotherlist <- list(A=2,B=1:10,C=c("Bernd", "Julia", "Peter"),D=1:100)
# 3
names <- anotherlist[[3]]
names <- anotherlist$C
# 4
xyz <- anotherlist[[4]][25:35]
xyz <- anotherlist$D[25:35]
anotherlist$D[anotherlist$D>=25 & anotherlist$D<=35]
# Was ist der Unterschied zwischen den beiden?!!!?
# Wie w?rde man auf die Elemente 25 und 35 zugreifen?
# 5.
test <- rep(1:10,10)
ordered(test)
install.packages("packagename")
: Install packagelibrary(packagename)
: Load an installed packagedetach("package:packagename")
: Unload a packageremove.packages("packagename")
: Uninstall packagelibrary()
: Display all installed packageslibrary(help=packagename)
: Describe a packagesearch()
: Display all loaded packagesls("package:packagename")
: Display all objects within a
packagepackagename::bar
: Load just use one object in a package
(e.g. a function)pacman
package
pacman::p_load()
: Checks whether package is installed,
if not installs & loads package
p_load(ggplot2, tidyverse)
# Create a matrix
M <- diag(5)
M[1:20]<- c(1:20)
M
ginv(M) # Funkcion ginv()
# ginv calculates the Moore-Penrose-Inverse of the matrix M
# But it doesn not work here.. why?
help.search("ginv") # function is in the package MASS
install.packages("MASS") # Install package
MASS::ginv(M) # Call function without loading the whole package
library(MASS) # Load whole package
ginv(M)
ls("package:MASS") # Display content of the package MASS
detach("package:MASS") # Unload the package
ginv(M)
search()
library()
remove.packages("MASS")
library() # Package is not installed anymore
ggplot2
.ggplot2
.ggplot2
.ggplot2
.ggplot2
.install.packages("ggplot2")
library(ggplot2)
ls("package:ggplot2")
detach("package:ggplot2")
remove.packages("ggplot2")
"lists"
of vectors of the same length under the hood"data.frame"
(classic) and tibble
"tbl_df"
object$var1
: Access variable var1
in data
frame object
as.numeric(object$var1)
: Convert class of variable into
numericNA
: What was that?!?!data.frame()
: Create a data frame
tibble()
: Create tibble dataframeas.data.frame()
: Convert into a data frame
(cf. as_tibble()
)summary()
und str()
: Get oversight of a
data frame’s contenthead()
and tail()
: Inspect the dataView()
: Show data frame (formerly fix()
)
fix()
is open!names()
: Display variablenames (and use to rename)na.omit()
: Delete missings “listwise”, i.e. rows that
contain at least one missingis.na()
: Generate logical vector indicating the
missingsattach()
: When you attach a data frame you can use
variable names directly without referring to the dataframe (R
understands that)$
to access variableslm(...., data=yourdataframe)
getwd() # Get working directory
library(foreign) # Load package "foreign"
ls("package:foreign") # Check package content
?swiss # Check out the object
# What is this about?
swiss2 <- swiss # Load data set
# attach() function
View(swiss2)
attach(swiss2)
names(swiss2)
Education
detach(swiss2)
Education
swiss2$Education
# Get info on data set
str(swiss2)
summary(swiss2)
head(swiss2)
fix(swiss2)
tail(swiss2)
# Create a data frame
data <- data.frame(id=1:3, # !
weight=c(20,27,24),
size=c("small", "large", "medium"))
data
dataframe[rows,columns]
# Q: What does the following code do?
swiss[2:4, c(1,2,4)] # when do we need c() instead of ":"?
swiss[swiss$Fertility > 75 & swiss$Agriculture > 75, c(1:3)]
subset(swiss, Fertility > 75 & Agriculture > 75)[, c(1:3)]
swiss[, c("Fertility", "Agriculture")]
# We'll learn a more convenient function later on!
A “new” package dplyr written by Hadley Wickham/Romain Francois replaces many old functions for data management
Functions in dplyr
are highly performant (big data!)
and consistent
See this page for an excellent overview and the Data Wrangling Cheat Sheet
What could the following functions be used for?
filter()
, arrange()
,
select()
, distinct()
, mutate()
,
summarise()
, group_by()
,
recode()
select()
: Selects columns (can be used for renaming,
see rename()
)slice()
: Select subset of rowsfilter()
: Filter a subset of the rowsarrange()
: Reorders the rows of a data framedistinct()
: Returns unique values/rowsmutate()
: Add new columns to existing columns (see also
transmute()
)summarise()
: Collapses a data frame to a single row
(e.g. aggregation)group_by()
: Break data set into groups (of rows), to
apply above functions to each grouprecode()
: Recode variablesdplyr
it is possible to use the
%>%
operator
x %>% f(y)
turns into f(x, y)
:
x
(e.g. a data set) is inserted into the function on the
right of %>%
%>%
: Forces >
to be a function which
does the aboveswiss %>% select(Catholic) %>% summarise(mean(Catholic))
# install.packages("dplyr")
library(dplyr)
# ?swiss # Check out the data set
# fix(swiss)
# Filter
swiss %>% filter(Agriculture >= 60 & Fertility >= 70)
swiss[swiss$Agriculture >= 60 & swiss$Fertility >= 70, ] # Classic approach
# Q: What if I want all observations/rows with Catholic <= 50 ?
# Q: What if I want all observations/rows with Catholic <= 50 OR Catholic Catholic >= 80?
# Q: What if I want all observations/rows with Catholic <= 50 AND Catholic Catholic >= 80?
# Slice
swiss %>% slice(3:7)
swiss[3:7,] # Classic approach
# Q: What if I want the rows number 11 to 15 AND 18 to 20?
# Normally, names of observations (e.g. countrys) are not saved as row.names
# but simply in a variable
row.names(swiss)
order()
library(dplyr)
swiss %>% arrange(Education, Examination, Agriculture)
# Q: What if I want to sort according to examination first?
swiss %>% arrange(desc(Education), Examination)
desc(swiss$Education)
# Q: What if I want to sort Examination in descending order instead of Education?
# CLASSIC WAY
swiss[order(swiss$Education, swiss$Examination, swiss$Agriculture), ]
swiss[order(desc(swiss$Education)), ] # type ?order
names(dataframe) <- charactervector
rename()
function in different packages
(e.g. package reshape
)dplyr
# install.packages("dplyr")
library(dplyr)
names(swiss)
swiss %>% select(Fertility, Agriculture, Education) # Select columns by name
swiss %>% select(Agriculture:Education) # Variables from:to
swiss %>% select(-(Agriculture:Education)) # Variables without (from:to)
swiss %>% select(Geburtsrate = Fertility) # Select and rename (same as assigning arrow <-!)
swiss %>% rename(Geburtsrate = Fertility) # Alternative because select drops non-mentioned variables!
swiss %>% select(1,2)
# Q: How can I save that in a new data frame?
swiss %>% select(Agriculture:Education)
# Q: What do I have to type to extract the variables Catholic and Infant.Mortality
# from the swiss dataset and save them in a new object?
library(dplyr)
swiss %>%
select(Education) %>%
distinct(Education) # Ignore the rownames here (they are not meaningful)
swiss$Education
swiss %>%
select(Education, Examination) %>%
distinct(Education, Examination)
# Q: How many observations do I get if I extract the distinct values of swiss$Catholic?
# ????
# This is useful when pulling out the names of all present countries in a datafile
# CLASSIC WAY
unique(swiss$Education)
dplyr
contains the functions
mutate(), transform(), transmute()
swiss %>%
mutate(Examination10 = Examination*10,
FertAgri = Fertility - Agriculture)
# Q: What does the above function do?
# Don't forget to assign the result to a new object if you want to save it!
# mutate() allows you to refer to variable you just created, transform() doesnt
swiss %>%
mutate(Examination10 = Examination*10,
NewExi = Examination10 - 10)
swiss %>%
transform(Examination10 = Examination*10,
NewExi = Examination10 - 10)
# Use transmute if you only wan't to keep new variables
swiss %>%
transmute(Examination10 = Examination*10,
NewExi = Examination10 - 10)
# Q: What if I want to get a new data set only with the variable Catholic but divided by 10? What do I have to write?
swiss2 <- cbind(swiss, row.names(swiss))
. Compare
swiss
and swiss2
with View()
and
explain what the code does!swiss
(?swiss
) for all the
exercises below. First, extract columns 2 and 3 and save them in a new
object.swiss
(?swiss
) that have values on the variable
Agriculture
that are smaller or equal than 20 and bigger or
equal than 80 (Agriculture <= 20
or
>= 80
). Save the results in a new data frame, that
comprises the first 3 variables/columns of the old data set.Infant.Mortality
. Which province has the highest
level of Infant.Mortality
?Infant.Mortality.squared
that contains the squared values
of the variable Infant.Mortality
. Then rename the variable
Infant.Mortality.squared
into
Infant.Mortality.sq
.getwd()
library(dplyr)
# 2
swiss3 <- swiss %>%
select(2,3)
# 3
swiss3 <- swiss %>%
select(1,4) %>%
slice(2:6)
# 4
swiss3 <- swiss %>%
select(2,4,6) %>%
slice(c(1,3,6)) # Or slice(1,3,6)
# 5
df_new <- swiss %>%
filter(Agriculture <= 20 | Agriculture >= 80) %>%
select(1,2,3)
# 6
swiss %>%
arrange(Infant.Mortality) # Porrentruy
# 7
swiss %>%
mutate(Infant.Mortality.squared = Infant.Mortality^2) %>%
rename(Infant.Mortality.sq = Infant.Mortality.squared)
mapvalues()
: Recode a categorical vector
(plyr
package)cut()
: Recode a continuous variable into a categorical
one (plyr
package)if_else(condition, value if TRUE, value if FALSE)
:
Recode with ifelse condition (dplyr
package)table(variable1, variablevar2, useNA = "always")
:
Contingency table for the two variablesstr()
and summary()
: Check whether
variables in the data set have expected distributions and beware of
missings!# install.packages("plyr")
swiss2 <- swiss # Make a copy of the data set
names(swiss2) # Display variables
str(swiss2)
summary(swiss2)
# Create a dummy for "Catholic"
# For recoding character variables simply refer to text with ""
library(plyr)
# if_else
swiss2 <- swiss2 %>%
mutate(catholic.dummy3 = if_else(Catholic > 50, 1, 0))
table(swiss2$catholic.dummy3, swiss2$Catholic,
useNA = "always") # check recoding
# mapvalues()
swiss2 <- swiss2 %>%
mutate(Examination2 = mapvalues(Examination,
from = c(3, 37),
to = c(NA, NA)))
# cut()
swiss2 <- swiss2 %>%
mutate(Examination3 = cut(Examination2,
breaks=c(-Inf, 12, 22, Inf),
labels=c("low","medium","high"))) # greater than or equal to
# CLASSIC WAY
# MANUEL CLASSIC WAY
swiss2$catholic.dummy <- NA # generate new variable in dataset
View(swiss2)
swiss2$catholic.dummy[swiss2$Catholic <= 50] <- 0 # replace values conditional on Catholic
swiss2$catholic.dummy[swiss2$Catholic > 50] <- 1 # replace values conditional on Catholic
table(swiss2$catholic.dummy, swiss2$Catholic,
useNA = "always") # check recoding
names(swiss2) # show variable names
names(swiss2)[7] <- "catholic.dummy" # Rename
swiss2
.Infant.Mortality
in your new data
set swiss2
so that values <= 18
are coded
as 0
, 18 < values <= 20
as
1
, 20 < values <= 21
as 2
and 21 < values <= 27
as 3
. Do this
using the cut()
function and name the respective variable
inf.mort.cut
. Check if your coding worked and check the
class of the new variable/object.Infant.Mortality
using
the if_else()
function called
Infant.Mortality.dummy
that is 0 for values below 20 and 1
for values equal or above 20.# 1.
swiss2 <- swiss
# 2.
install.packages("plyr")
library(plyr)
swiss2 <- swiss2 %>%
mutate(inf.mort.cut = cut(Infant.Mortality,
breaks=c(-Inf, 18, 20, 21, Inf)))
table(swiss2$inf.mort.cut, swiss2$Infant.Mortality)
class(swiss2$inf.mort.cut)
fix(swiss2)
# We should label the factor variable by adding the argument
# labels=c("lowest", "low","high","highest")
# 3.
swiss2 <- swiss2 %>%
mutate(Infant.Mortality.dummy = if_else(Infant.Mortality >= 20, 1, 0))
group_by()
: function to group a dataframe (break it
into groups)dplyr
functions recognize when data frame is
groupedgroup_by()
can also be used for aggregating data (e.g.,
mean within groups)# See http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html for this example
library(foreign)
essdata <- read.dta("./Material/ESS4e04_de.dta", convert.factors = F)
View(essdata)
nrow(essdata) # Check number of rows
names(essdata)
# edulvl measures education levels
data_grouped <- essdata %>%
group_by(edulvl_str) # convert the data frame into a
# grouped data frame and save in object
# Character variable to aggregate
data_grouped # we can see the group variable and the dimensions
class(data_grouped) # we can see the new class,
data_agg <- data_grouped %>%
summarise( # summarise collapses data frame
n = n(), # Count rows (ignores missings!)
age.m = mean(age, na.rm = TRUE), # Variable containing mean
hheinkommen.m = mean(hheinkommen, na.rm = TRUE),
NA.age = sum(is.na(age)), # Count missing on age variable
N.age = sum(!is.na(age)), # Count non-missings on age variable
) # Variable containing mean
View(data_agg)
# Classically we used aggregate()
# Sometimes we want to aggregate the data (e.g. calculate the average) but don't want to collapse the data
# Below we add a variable with group means but don't collapse the dataframe
# Group, calculate means but do not collapse
data_grouped2 <- data_grouped %>%
mutate(hheinkommen.m = mean(hheinkommen, na.rm = TRUE)) %>%
arrange(edulvl_str, hheinkommen) %>%
select(edulvl_str, hheinkommen, hheinkommen.m)
library(foreign)
and
essdata <- read.dta("./Material/ESS4e04_de.dta", convert.factors=F)
.
Adapt your file path!religion_str
contains the religious
affiliation of respondents. Aggregate the data set - using functions
from dplyr
package - so that you obtain averages for
subgroups of religious affiliations for the variables
polinteresse
and trustparties
- as well as a
variable with the number of rows across the groups.library(foreign)
data <- read.dta("./Material/ESS4e04_de.dta", convert.factors=F)
library(dplyr)
data_agg <- data %>%
group_by(religion_str) %>%
summarise(
count = n(), # Add variable with the number of observations in group
mean.polinteresse = mean(polinteresse, na.rm = TRUE), # Variable containing mean of distance
mean.trustparties = mean(trustparties, na.rm = TRUE)) # Variable containing mean of delay
View(data_agg)
data_agg <- data.frame(data_agg)
class(data_agg)
merge()
(see Quick
R)dplyr
is much fasterinner_join(x, y, by = NULL, copy = FALSE, ...) # all intersecting observations
left_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x
semi_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x
anti_join(x, y, by = NULL, copy = FALSE, ...) # all in x that are not in y
x
= first data set, y
= second data
setby = "var"
oder by = c(var1, var2)
?join
: Check out the corresponding helpfile# An example for merging
# Create dataset and convert rownames to variable
nrow(swiss)
swiss2 <- swiss %>%
mutate(region = rownames(swiss)) %>%
remove_rownames() %>%
select(region, everything())
# Generate 2 data frames each possess parts of the observations and the regions
# variable
# Q: What new datasets do I generate below?
swiss.a <- swiss2 %>%
slice(1:8) %>%
select(1:3)
View(swiss.a)
swiss.b <- swiss2 %>%
slice(c(1, 6:7, 12)) %>%
select(c(1,4:5))
View(swiss.b)
intersect(swiss.a$region, swiss.b$region) # check intersection, i.e.
# which regions appear in both data sets?
# MERGING OF THE TWO DATA FRAMES
library(dplyr) # is the package installed?
?inner_join
swiss.inner <- inner_join(swiss.a, swiss.b, by ="region")
View(swiss.a)
View(swiss.inner) # data set with observations that intersect across both data sets
swiss.left <- left_join(swiss.a, swiss.b, by ="region")
View(swiss.left) # all observations in x = swiss.a
# Now try semi_join(), anti_join() FOR YOURSELF!
swiss2
and the objects swiss.a
and swiss.b
subsequently.swiss.c
that include the first 10
countries in the swiss data set and their values on the variables
region
, Catholic
and
Infant.Mortality
.swiss.c
with swiss.a
so that
the merged dataset contains all the countries that are contained in at
least one of the two data sets.getwd()
swiss2 <- cbind(row.names(swiss), swiss)
names(swiss2)[1] <- "region"
# Generiere 2 Datens?tze die beide eine L?nderspalte besitzen
swiss.a <- swiss2 %>%
slice(5:36) %>% # zeilen 1-4 fehlen
select(1:3)
View(swiss.a)
swiss.c <- swiss2 %>%
slice(1:10) %>%
select("region", "Catholic", "Infant.Mortality")
View(swiss.c)
intersect(swiss.a$region, swiss.b$region) # check intersection
library(dplyr) # installiert?
?join
swiss.full <- full_join(swiss.a,
swiss.c,
by ="region")
View(swiss.full) # data set with observations that intersect across both data sets
data %>% group_by(...) %>% nest()
: Nest dataframe
by variable in group_by(...)
unnest(data, cols = "...")
: Unnest variables specified
in cols = "..."
# Usually call it "data" or "df"
# Create a dataframe
data1 <- data.frame(name = c("Peter", "Paul", "Mary"),
income = c(550, 640, 710))
data1
data2 <- data.frame(name= c("Julia", "Hans", "Tina"),
income = c(640, 320, 400))
data2
# Create a nested dataframe
data_nested <- tibble(groups = c("group1", "group2"),
data = list(data1, data2)) # Use list() for nested parts
data_nested
data_nested$data
data_nested$data[[1]]
data_nested$data[[2]]
# Unnest dataframe
data_unnested <- unnest(data_nested, cols = "data")
# Renest dataframe
data_nested <- data_unnested %>%
group_by(groups) %>%
nest()
df <- data.frame(name = c("Peter", "Paul", "Mary", "Julia", "Hans"),
gender = c("male", "female", "male", "female", "male"),
discipline = c("Sociology", "Sociology", "Polscience",
"Psychology", "Polscience"),
income = c(610, 640, 550, 710, 505))
discipline
. What
does the dataframe and the nested ones look in terms of dimensions?gender
. What does
the dataframe and the nested ones look in terms of dimensions?gender
and
discipline
. What does the dataframe and the nested ones
look in terms of dimensions?# 2.
df %>% group_by(discipline) %>% nest()
# 3.
df %>% group_by(gender) %>% nest()
# 4.
df %>% group_by(gender, discipline) %>% nest()
# 5.
df_nested <- df %>% group_by(gender, discipline) %>% nest()
df_nested$data[[1]]
mean(x)
, sd(x)
, var(x)
,
median(x)
, min(x)
, max(x)
,
cov()
, cor()
,
cor(x,y,method="kendall")
, table(x)
,
table(x,y)
, prop.table(table(x))
,
100*prop.table(table(x))
mean(x)
: Meansd(x)
: Standarddeviationvar(x)
: Variancemedian(x)
: Medianmin(x)
: Minimummax(x)
: Maximumcov()
: Covariancecor()
: Correlationcor(x,y,method="kendall")
# Kendall’s 0?cor
: Check helpfiletable(x)
: Onedimensional contingency tabletable(x,y)
: Twodimensional contingency tableprop.table(table(x))
: Onedimensional contingency table
with proportions100*prop.table(table(x))
: Onedimensional contingency
table with proportions in percentlibrary(foreign)
fix(swiss)
View(swiss)
# Contingency table for objects
table(Catholic)
table(swiss$Catholic, swiss$Education)
# Table with percentages
100*prop.table(table(swiss$Catholic))
100*prop.table(table(swiss$Catholic, swiss$Education))
round(100*prop.table(table(swiss$Catholic, swiss$Education)), 2) # rounded values
# Mean
mean(swiss$Catholic)
# Median
median(swiss$Catholic) # "middle value"
sort(swiss$Catholic)
# Save several statistics in one vector
x <- swiss$Catholic
c(mean=mean(x), median=median(x), stddev=sd(x), min=min(x), max=max(x))
# or
summary(x)
# Correlations between vectors
cor(swiss$Catholic,swiss$Education, method="spearman")
cov(swiss$Catholic,swiss$Education)
Assault
and
UrbanPop
(dataframe USArrests
) in two vectors
x
and y
. Calculate for both vectors
x
and y
the mean, the maximum, the minimum and
the variance and save all the results in two objects
statistics.x
and statistics.y
. Name the
elements of these two objects “Mean”, “Max” etc. Save
statistics.x
and statistics.y
as list elements
in one list. Finally, check wether the two vectors x
and
y
strongly correlate with each other.### 1.
x <- 1:8
x
1*2*3*4*5*6*7*8
cumprod(x)
### 2.
?USArrests # Informationen zum Objekt
x <- USArrests$Assault
y <- USArrests$UrbanPop
statistics.x <- c(mean(x), max(x), min(x), var(x))
statistics.x
statistics.y <- c(mean(x), max(x), min(x), var(x))
statistics.y
liste <- list("Mean"= mean(x), "Max" = max(x), "Min"= min(x), "Var"= var(x))
liste
x
y
cor(x,y)
read.table()
to import most common
files (.txt, .csv
)readr
package provides better logic/funtionalities
read_csv()
: Import csv filesread_table()
: Import textual data where each column is
separated by one (or more) columns of space.csv
or .txt
and
then import them (e.g. EXCEL files)foreign
: Classic package with functions for data
importhaven
: Package by Hadley Wickham
ls("package:haven")
.csv
or
.txt
filereadr
also contains write functions
write_csv(swiss, "swiss.csv")
: Store swiss
as csv filewrite_delim(swiss, "swiss.txt", delim = " ")
: Store as
delimited file with 1 white space as separator# CSV or TXT files with read.table()
getwd()
setwd("C:/Users/Paul/Google Drive/2-Teaching/2022_R_Workshop/Material")
# EXPORT DATA
write_csv(swiss, "swiss.csv")
write_delim(swiss,
file = "swiss.txt",
delim = " ")
write.table(swiss, "swiss.txt", sep=",") # classic old function
# IMPORT DATA
data <- read_csv("swiss.csv") # csv datei importieren
data
# If the import goes wrong make sure that you picked the right delimiter
data[1:5, 1:3] # Make sure the separator is the right one, i.e. open data in text edior to check that
# STATA with haven()
install.packages("haven")
library(haven)
data <- read_dta("ESS4e04_de.dta")
View(data) # preferred function to visualize dataset
# Import SPSS/SAS with: read_por(), read_sas(), read_sav(), read_spss()
# Important: read_dta() directly creates a tibble
# Sometimes you might need to convert that back to a dataframe (for older functions, because they don't work with tibbles)
# Export as dta
write_dta(data, "data_stata.dta")
# install.packages("WDI") # install package to use WDI API
library(WDI) # load the package
# WDIsearch(string="gdp", field="name", cache=NULL)
names.indicator <- WDIsearch()
View(names.indicator) # names in first column
DF <- WDI(country="all", indicator="NY.GDP.MKTP.PP.KD", start=2005, end=2005)
DF[1:3, 1:4]
install.packages("XML")
library(XML)
# BADACH
url <- "http://www.badac.ch/db/db.php?abs=canton_x&code=Csi11.51&annee=2009&arg=&lang=De"
unemployment <- data.frame(readHTMLTable(url)$table_data)
names(unemployment) <- c("nr", "abbr", "name", "unemp")
.csv
filesif(argument1){argument2}else{argument3}
argument1
is TRUE then execute
argument2
, if not execute argument3
if(10>5){1}else{0}
dplyr
package:
if_else(10 > 5, 1, 0)
for(i in Sequence){argument1}
i
is the placeholder across which the loop runsSequence
is a vector of valuesSequence
the argument1
is executedprint()
: Print objects within loop consecutivelywhile(argument1){argument2}
argument2
is executed as long as argument1
is TRUErepeat{argument1; argument2; if(argument3) break}
argument1
and argument2
are repeated until
argument3
is TRUE# What will the result be? Your call!
# If Argument
x <- -5
x
if(x <= 0) {y <- 1} else {y <- 0}
y
# If Argument
x <- 1
if(x == 0 | x == 1) {y <- 1} else {y <- 0}
y
# If Argument
x <- 10
if(x<= -10 & x>= 10) {y <- 1} else {y <- 0}
y
# Warum ist y hier gleich 0?!? BUG?
# Used in complicated loops, e.g. to give names
# conditional on some variable having some name!
# For Loop
for (x in 1:10) print(sqrt(x)) # is ok because of single line
# For Loop
x <- 0
for(i in 1:10){
x <- x+i
print(x)
} # x steigt mit an
swiss2 <- swiss
rownames(swiss2) <- NULL
vars <- names(swiss2)
for(i in vars){
# x <- x+i
print(i)
print(class(swiss2[,i]))
swiss2[,i][1] <- NA
} # x steigt mit an
# skip
# While Loop
x <- 0
while(x<13) {
x <- x+1
print(x)
} # solange x<13=TRUE
#Repeat Loop
x <- 0
repeat{x <- x+1; print(x); if(x>=10) break} # repeat that until x>=10 is TRUE
# Process data with loops
for (column in 2:6) { # this loop runs through 2 to 6
print(names(swiss)[column])
print(mean(swiss[,column]))
}
# Could be all sorts of object classes, e.g. you could loop over several data sets
for (dataset in c(data1, data2, data3)) {
# Do something, e.g. data cleaning, calculations, model estimations etc.
}
# But: Most loops are avoidable and there are more efficient ways to deal with your specific problem. E.g. with functions from dplyr, sapply(), ave() or specialized functions like rowMeans(), colMeans(), colSums() etc.
# Think about your problem and look for better solutions. If loops are necessary: Pull as much code out of the loop as possible. The larger the loop the harder it is to understand it.
y
takes the value 10, if
x
is smaller than -50 or bigger than 50. If these
conditions are not fulfilled y
should take the value 0.
Test this If-Argument by choosing different values for
x
.a = 10
and write a loop that
sequentially adds the numbers 2,4,6,8,10 to this scalar.Fertility
with missings (NA
).# 1.
x <- 51
if(x <=- 50 | x>=50) {y <- 10} else {y <- 0}
y
# 2.
a <- 10
for (i in seq(2,10,2)){
print(a)
a <- a + i
}
# NOT like this!
a <- 10
for (i in seq(2,10,2)){
print(a+i)
}
# 3.
length(swiss$Fertility)
for (i in seq(1,47,2)){swiss[i,1] <- NA}
summary(swiss$Fertility)
+
or -
)print()
-function is calledfunction(object){Action of the function}
: Define a
functioncurve(expr, from = NULL, to = NULL)
: Plot a
mathematical functionprint()
,
plot()
) and react differently for different classes of
objectsmethods(functionname)
: List all available methods for a
S3 and S4 generic function, or all methods for an S3 or S4 class.
methods(summary)
and then
summary.lm
# Examples
cos(pi)
cos
print("Hallo")
print
mean(c(1,2,3))
# Example for defining a function
# e.g. squareroot
sqrt2 <- function(x) {
x^0.5
}
sqrt2(4)
# Enter name of function to get the content
sqrt2
# Another example
funktion.bsp <- function(x) {
(x+10)^2
}
funktion.bsp(2)
# Plot the function with curve()
curve(funktion.bsp, 1, 200) # Enters number 1 to 200 into the function and plots
# the result
# Delete a function
rm(funktion.bsp) # like any other object
# Another exampe: calculate the mean of a vector
mittelwert <- function(x) {
sum(x)/length(x)
}
x <- rep(1:70,5)
mittelwert(x)
x
and adds 33 to
each element in x
.x
and
y
and returns the weighted mean (weighted by y) according
to the formula: \(\frac{\sum_{i=1}^{N}y_{i}x_{i}}{\sum_{i=1}^{N}y_{i}}\).
The vector x
contains the values 1, 2, 3, 4 and 5. The
vector y
contains the values 2, 4, 5, 6 and 7 (Tips:
?sum
; Functions with two inputs
e.g. function(a,b){}
).# 1.
plus33 <- function(x) x+33
x <- 20:25
plus33(x)
# 2.
plus2hoch2 <- function(x) (x+2)^2
x <- 1:10
plus2hoch2 (x)
curve(plus2hoch2, 1, 10)
# 3.
y <- c(2,4,5,6,7)
x <- c(1,2,3,4,5)
gew.mittelwert <- function(x,y) sum(y*x)/sum(y)
gew.mittelwert(x,y)
ggplot2
package: For static graphs (I switched
completely)plotly
package: For interactive graphsshiny
package: For interactive applicationsplot()
windows(8,8)
and
quartz(8,8)
plot(x)
savePlot(filename="./figures/test.pdf", type="pdf")
type = c("wmf", "emf", "png", "jpg", "jpeg", "bmp", "tif", "tiff", "ps", "eps", "pdf"
dev.off()
pdf()
(and dev.off()
after)getwd()
# Open window
windows(8,8) # DINA4 page has a width of 8,27 inches
# MAC?
# Create histogram
plot(rnorm(n=101,sd=.8))
hist(rnorm(n=101,sd=.8))
# Save
savePlot(filename="test.jpg", type="jpg")
# Close the graphics window
dev.off()
# If you work on a server where it's not possible to open windows
# open device
pdf("test2.pdf")
# create a histogram
hist(rnorm(n=101,sd=.8))
# close device
dev.off()
points(xcoords,ycoords)
: Adds pointslines(xcoords,ycoords)
: Adds linesabline()
: Adds a straight line (e.g. regression line,
?abline
)
abline(h=10)
or abline(v=5)
h
= horizontal, v
= verticaltext()
: Adds text
text(xcoords,ycoords, labels="Enter Text here")
Main=""
; xlab=""
; ylab=""
:
Add titlesxlim=c(min,max)
; ylim=c(min,max)
: Adapt
axes scaleslwd
: Linewidthlty
: Linetype (solid, dashed etc. )pch
: Plot Symbols (circles, crosses, points, etc.)col
: Color for points, lines etc.bg
: Backgroundcolor for plot and certain symbolspar()
: Set global parameters (?par
),
e.g. par(mfrow=c(2,2))
mfrow
: Several plots in one windowoma
: Set outer margins for plotmar
: Set inner margins for plotgetwd()
# Step for step: Example 1
windows(8,8)
x <- seq(from=0,to=2*pi,length=101)
y <- sin(x) + rnorm(n=101,sd=.5)
plot(x,y, xlim=c(-1,11), ylim=c(-3,3)) #
lines(x,sin(x),col="red")
lines(c(1,5),c(-1.5,0.5)) # xcoords in first argument, y coords in second
abline(lm(y~x),lty=3)
abline(h=0)
abline(a=mean(y),b=0.1,lty=2) # a = intercept, b = slope
# labs <- paste(c("x"), 1:101, sep="")
labs <- paste(1:101) # create a character vector with labes
text(x,y, labels=labs, pos=1, cex=0.7)
# Global parameters
windows()
par(mfrow=c(2,2), oma=c(1,1,1,1), mar=c(4,4,0,0)) # try different numbers
plot(x,y)
plot(x,y)
plot(x,y)
plot(x,y)
# Try what happens if you change the global parameters
dev.off()
# skip
# Step for step: Example 2
x <- seq(from=0,to=2*pi,length=101)
y <- sin(x) + rnorm(n=101,sd=.5)
#
windows()
plot.default(x,y,type="n", xlab="Indep. var: x", ylab="Dep. var.: y")
# plot.default() is the default scatterplot function
lines(x,sin(x),lwd=2)
segments(x0=x,x1=x,y0=sin(x),y1=y)
points(x,y,pch=21,bg="green")
text(4,1, labels="LALALA")
dev.off()
barplot(table(data$variable))
: Barplot for absolute
frequenciesbarplot(prop.table(table(data$variable)))
: Barplot for
relative frequenciesbarplot(table(data$variable), horiz=TRUE, las=1)
:
Barplotbarplot(table(data$variable1, data$variable2))
: Stacked
barplotbarplot(table(data$variable1, data$variable2), beside=TRUE)
:
Grouped barplothist(data$variable)
: Histogrammboxplot(x)
: One variableboxplot(data$variable1 , data$variable2, data$variable3)
:
Several variablesboxplot(data$variable1 data$variable2)
: Groupedplot(independentVariable, outcomeVariable)
abline(h=mean(outcomeVariable, na.rm=TRUE)
: Add
straight line for the meanabline(lm(outcomeVariable ~ independentVariable))
: Add
regression line# Graphs for categorical variables
# Generate a categorical variable in the dataset swiss
swiss$dummy.catholic[swiss$Catholic>=70] <- 1
swiss$dummy.catholic[swiss$Catholic<=70] <- 0
swiss$cat.fertility[swiss$Fertility<=64] <- 1
swiss$cat.fertility[swiss$Fertility>64 & swiss$Fertility<=70] <- 2
swiss$cat.fertility[swiss$Fertility>70 & swiss$Fertility<=78] <- 3
swiss$cat.fertility[swiss$Fertility>78] <- 4
swiss <- swiss[order(swiss$Fertility),]
swiss
# Barplot with absolute frequencies
barplot(table(swiss$dummy.catholic))
# Barplot with relative frequencies
barplot(prop.table(table(swiss$dummy.catholic)))
# barplot
barplot(table(swiss$dummy.catholic), horiz=TRUE, las=1)
# Stacked barplot
table(swiss$dummy.catholic, swiss$cat.fertility)
windows()
barplot(table(swiss$dummy.catholic, swiss$cat.fertility),
names.arg = c("x <= 64","64 < x <= 70","70 < x <= 78","78 < x"))
# Grouped barplot
barplot(table(swiss$dummy.catholic, swiss$cat.fertility), beside=TRUE)
# Four histograms
windows()
par(mfrow=c(2,2))
for (i in c(.8, .2, 2, 1)){
hist(rnorm(n=101,sd=i))
}
# Plot a function and a legend
windows()
curve(x^2,col="blue", xlim=c(0,10), ylim=c(0,10))
curve(x^3,add=TRUE,col="red")
curve(x^5,add=TRUE,col="green")
legends <- c("xhoch2", "xhoch3", "xhoch5")
legend(8,10, c("xhoch2", "xhoch3", "xhoch5"), pch = 20,
col=c("blue","red","green"))
abline(h=5)
swiss
contains data for 47 French speaking
provinces in Switzerland in the year 1888 (see ?swiss
).
Inspect the data set with the usual functions (How many variables are
there? What class do these variables have? etc.).Education
(x-Achse) and Fertility
(y-Achse)
InfantMortality
.
getwd()
# 1.
summary(swiss)
str(swiss)
# 2.
plot(swiss$Education,swiss$Fertility,
main="Der Zusammenhang zwischen Bildung und Fortpflanzung",
xlab="Bildung", ylab="Fruchtbarkeit", xlim=c(10,90), ylim=c(10,90))
# 3.
windows(6,6)
x <- swiss$Infant.Mortality
hist(x, breaks=10, main="Infant Mortality in Swiss Provinces", ylim=c(0,20),
xlim=c(10,30), xlab="Proportion of births that live less than a year",
ylab="Frequency" )
mittelwert <- mean(x, na.rm=TRUE)
sd <- sd(x, na.rm=TRUE)
curve(dnorm(x, mittelwert, sd)*length(x), add=TRUE, pch=1,col=2)
#curve(dnorm(x, mittelwert, sd), add=TRUE, pch=1,col=2)
legends <- c("normalcurve", "frequence")
legend(10,20, c("normalcurve"), pch = 15, col = 2)
arrows(18,16,20,14)
savePlot(filename="hist1.pdf", type="pdf")
ESS4e04_de.dta
and inspect it using
different functions. Generate a barplot for the absolute frequencies of
the variable wahlentscheidung.str
.geschlecht.str
) and wahlentscheidung
(wahlentscheidung.str
). Color the single bars according to
the parties. Add a legend that indicates which bar belongs to which
party (Tip: ?barplot
to check for col
).age
). Change
the number of bars so that about 15 bars a displayed. In addition the
x-axis should cover the range from 0 to 100.age
(x-axis) and
geschlecht.str
(y-axis).age
(x-axis) and
hheinkommen
(y-axis).polinteresse.str
(y- Axis) separately for women and men
(use geschlecht.str
, x-axis).# 1.
getwd()
library(foreign)
ess <- read.dta("ESS4e04_de.dta", convert.factors = FALSE,
convert.underscore = TRUE)
str(ess)
summary(ess)
fix(ess)
barplot(table(ess$wahlentscheidung))
barplot(table(ess$wahlentscheidung.str))
# 2.
barplot(table(ess$wahlentscheidung.str, ess$geschlecht.str),
beside=TRUE, col=c("black","yellow","green","red","magenta"))
legend(3,350, c("CDU", "FDP", "GRUN", "Linke","SPD"), pch = 20,
col=c("black","yellow","green","magenta","red"))
# 3.
hist(ess$age)
hist(ess$age, breaks = 15, xlim=c(0,100))
hist(ess$age, breaks=50,prob=TRUE)
mittelwert <- mean(ess$age, na.rm=TRUE)
sd <- sd(ess$age, na.rm=TRUE)
?curve
# Old: curve(dnorm(x, mittelwert, sd)*length(ess$age), add =TRUE) #
# New:
curve(dnorm(x, mittelwert, sd)*length(ess$age), add =TRUE) #
# 4.
# 4.1
windows()
par(mfrow=c(2,2))
boxplot(ess$age ~ ess$geschlecht.str)
# 4.2
plot(ess$age, ess$hheinkommen)
abline(h=mean(ess$hheinkommen, na.rm=TRUE))
abline(lm(ess$hheinkommen ~ ess$age))
# 4.3
# install.packages("gplots")
# library(gplots)
# plotmeans(ess$age ~ ess$geschlecht) # ylim=c(1,100)
# 4.4
barplot(tapply(ess$polinteresse, ess$geschlecht, mean, na.rm=TRUE))
# 4.5
savePlot(filename ="3in1", type = "pdf")
Hadley Wickhams ggplot2
Package developed into a
powerful alternative to the default plot()
function. Its
goal is to simplify complex plots (e.g. take care of legends, grids,
labels for multi-layered graphics). It is very much inspired by the
technically outstanding lattice
package and its
xyplot
function. However ggplot2 tries to simplify the
often fiddly syntax of complex graphs and gives them a more modern look
and feel. Furthermore its documentation is top-notch and well worth a
read: http://docs.ggplot2.org/current/
lm()
: R function to estimate a linear model
(lm
stands for “linear model”“)lm(df$y ~ df$x)
: Use outcome variable y
and explanatory variable x
as inputlm(y ~ x,data = df)
: If both variables are located in
the data frame “df”lm(y~x1 + x2 + x3,data = df)
: Several independent
variableslm(y ~ x1 + x2 + x1:x2,data = df)
: Interaction
effectslm(y ~ x1 * x2,data = df)
: Same as abovefit <- lm(y ~ x)
: Save the resultsfit
or summary(fit)
"lm"
contains different components
coef(fit)
: The coefficientsresiduals(fit)
: The residualsna.action==na.omit
omits missings
listwisebroom
package
tidy()
: Extract coefficients + other stats from
models# Example: Fertility, education and religion (Catholic)
?swiss
summary(swiss$Catholic)
summary(swiss$Fertility)
summary(swiss$Education)
fit <- lm(Fertility ~ Education + Catholic, data=swiss) # na.action==na.omit
# summary() is a generic function. If you input an lm() object it guesses you want a summary of regression statistics
summary(fit)
# Number of observations: nrow(swiss) = 47
# DF (degrees of freedem): 47 observations - 3 parameters (to estimate)
# 95% percent confidence intervall:
# 74.23 (point estimate) +- 2.35 (Standard error) * 1.96 (use 2 as a rule of thumb)
74.23 + 2.35*1.96
# t value = corresponding value of 74 in the t-distribution
# Pr(>|t|).. p-Value, i.e. the probability to observing t-value or greater
# given that beta = 0
# Components of the lm object
ls(fit)
names(fit)
class(fit)
# Access coefficents and other parts
fit$fitted.values # ^y = y hat = predicted values
fit$residuals # residals = Predicted - Observed
coef(fit) # Coefficients beta1, beta2, beta3
fit[1]
fit[1]$coefficients[1] # Access subelements of object
# Easier using tidy()
tidy(fit)
# Scatterplot
library(ggplot2)
library(ggrepel)
ggplot(swiss,
aes(x = Education,
y = Fertility,
label = rownames(swiss))) +
geom_point() +
geom_smooth(method='lm', formula= y~x) +
geom_text_repel()
# 3d Scatterplot
library(plotly)
plot_ly(swiss,
x = ~Education,
y = ~Catholic,
z = ~Fertility,
color = ~Examination) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education'),
yaxis = list(title = 'Catholic'),
zaxis = list(title = 'Fertility')))
# Coefficient plot
data_estimates <- tidy(fit, conf.int = TRUE) %>%
filter(term != "(Intercept)")
ggplot(data_estimates,
aes(estimate, term,
xmin = conf.low,
xmax = conf.high,
height = 0)) +
geom_point() +
geom_vline(xintercept = 0, lty = 4) +
geom_errorbarh()
car
UN
(?UN
)infantMortality
) on GDP (ppgdp
) and keep the
estimation results.car
and inspect the data set
Quartet
Quartet
to estimate several
regressions and compare the coefficients of the following regression
models: y1 on x (Model 1), y2 on x (Model 2), y3 on x (Model 3) and y4
on x4 (Model 4)!# Tip: You can use the patchwork package to draw plots side by side
library(patchwork)
# p1 + p2 + p3 + p4
# 1. Child mortality and GDP
# 1.1 Install and load package car
install.packages("car")
library(car)
# 1.2 Inspect UN data
?UN
fix(UN)
str(UN)
# 1.3 Fit a linear model for mortality dependent on GDP.
fit <- lm(infant.mortality ~ gdp, data=UN)
summary(fit)
summary(lm(infant.mortality ~ gdp, data=UN))
# 1.4 Draw the corresponding scatterplot
plot(UN$gdp, UN$infant.mortality)
abline(lm(infant.mortality ~ gdp, data=UN))
# 2. Anscombe's Quartet
# 2.1 Load package and inspect data
sum(Quartet)
str(Quartet)
# 2.2 Fit the models
# (a) y1 by x (Data: Quartet)
fit1 <- lm(y1 ~ x, data=Quartet)
summary(fit1)
# (b) y2 by x (Data: Quartet)
fit2 <- lm(y2 ~ x, data=Quartet)
summary(fit2)
# (c) y3 by x (Data: Quartet)
fit3 <- lm(y3 ~ x, data=Quartet)
summary(fit3)
# (d) y4 by x (Data: Quartet)
fit4 <- lm(y4 ~ x4, data=Quartet)
summary(fit4)
# and compare all coefficients
mycoefs <- c(fit1$coefficients[2], fit2$coefficients[2],
fit3$coefficients[2], fit4$coefficients[2])
mycoefs
Koeff <- rbind(coef(fit1),coef(fit2),coef(fit3),coef(fit4))
names(mycoefs) <- c("coeff.fit1", "coeff.fit2",
"coeff.fit3", "coeff.fit4")
# 2.3 Erstelle vier Scatterplots die jeder einen Regressionslinie enthalten:
# (a) von x und y1 (Datensatz: Quartet)
p1 <- ggplot(Quartet,
aes(x = x,
y = y1)) +
geom_point() +
geom_smooth(method='lm', formula= y~x)
# (b) von x und y2 (Datensatz: Quartet)
p2 <- ggplot(Quartet,
aes(x = x,
y = y2)) +
geom_point() +
geom_smooth(method='lm', formula= y~x)
# (c) von x und y3 (Datensatz: Quartet)
p3 <- ggplot(Quartet,
aes(x = x,
y = y3)) +
geom_point() +
geom_smooth(method='lm', formula= y~x)
# (d) von x4 und y4 (Datensatz: Quartet)
p4 <- ggplot(Quartet,
aes(x = x,
y = y4)) +
geom_point() +
geom_smooth(method='lm', formula= y~x)
p1 + p2 + p3 + p4
## TESTING ASSUMPTIONS OF THE LINEAR REGRESSION MODEL (e.g. Gelman & Hill 2007, 45-47)
# Checking robustness and assumptions
# Good overview: http://www.stat.uni-muenchen.de/~helmut/limo09/limo_09_kap6.pdf)
fit <- lm(Fertility ~ Education + Catholic, data=swiss)
# Linearity
# Cause: Non-linear relationship between X and Y
# Problem: Linear model does not produce good fit
# Diagnose: e.g. partial residual plot for every variable
crPlots(fit) # partial residual plot
# Formula: http://en.wikipedia.org/wiki/Partial_residual_plot
# Residuals + beta_i*X_i (y-axis) vs. X_i (x-axis)
# beta_i is estimated and plotted while controlling for all other variables
ceresPlots(fit)
# Generalization of component+residual (partial residual)
# Independence or errors
# Causes: autocorrelated time series or clustered data
# Problem: wrong t-values
# Diagnose: among others: Durban Watson Test http://de.wikipedia.org/wiki/Durbin-Watson-Test
durbinWatsonTest(fit)
# Constant variance of errors (Homoskedasticity)
# e_i dependent of i
# Causes: errors often increase with x
# Problem: wrong t-values -> problem for hypothesis testing
# Diagnose: plot e_i vs fitted values
spreadLevelPlot(fit)
summary(fit$fitted.values)
# "Solution": calculate robust standard errors (library(sandwich))
# Normality of errors
# Cause: wrong distribution assumption / wrong model
# Problem: wrong t-values, bad predictions
# Diagnose: Plot residuals (are they normally distributed)
# qq plot for studentized resid vs. theoretic quantiles
qqPlot(fit, main="QQ Plot")
# Outliers
# Cause: special cases, data errors
# Problem: Outliers may biase coefficients
# Diagnose: e.g. Cook's D plot - http://en.wikipedia.org/wiki/Cook's_distance
# measures effect of deleting observation
# identify D values > 4/(n-k-1)
cutoff <- 4/((nrow(swiss)-length(fit$coefficients)-2))
plot(fit, which=4, cook.levels=cutoff)
# Added variable plots
library(car)
avPlots(fit)
glm()
: R function to estimate a generalized linear
modelglm(formula, family = familytype(link = linkfunction), data = )
family =
: Specify a choice of variance and link
functions
glm(outcome ~ x1 + x2 + x3, data = somedata, family = binomial())
:
Logistic modelglm(outcome ~ x1 + x2 + x3, data = somedata, family=poisson())
:
Poisson modelswiss2 <- swiss
library(plyr)
swiss2$d.Catholic <- cut(swiss2$Catholic,
breaks=c(-Inf, 50, Inf),
labels=c("low","high"))
class(swiss2$d.Catholic) # Factor!
swiss2$d.Catholic
# Logistic Regression
# where F is a binary factor and
# x1-x3 are continuous predictors
fit <- glm(d.Catholic ~ Agriculture + Education,data = swiss2, family = binomial())
# Display results
summary(fit)
# Coefficients give the change in the log odds of the outcome for a one unit increase in the predictor variable
# e.g. For every one unit change in Agriculture, the log odds of Catholic (vs. non-Catholic) increases by 0.6173.
confint(fit) # CIs using profiled log-likelihood
# confidence intervals are based on the profiled log-likelihood function
confint.default(fit) # CIs using standard error
exp(coef(fit)) # exponentiated coefficients
exp(confint(fit)) # 95% CI for exponentiated coefficients
predict(fit, type="response") # predicted values
residuals(fit, type="deviance") # residuals
# Poisson Regression
# where count is a count and
# x1-x3 are continuous predictors
fit <- glm(count ~ x1+x2+x3, data=df, family=poisson())
summary(fit) #display results
Gelman/Hill 2007, Chapter 11-13: Data Analysis Using Regression and Multilevel/Hierarchical Models
The lmer function
fit <- lmer(y ~ 1 + (1 | county))
: Varying-intercept
model with no predictors
county
: Grouping variable`fit <- lmer(y ~ ilevel.var + (1 | county))
:
Varying-intercept model with an individual-level predictordisplay(fit)
: Summarize estimation resultscoef(fit)
: Display coefficientsfixef(fit)
: Averageranef(fit)
: Group-level errorsse.fixef(fit)
and se.ranef(fit)
: Pull out
standard errorslmer(y ~ ilevel.var + glevel.var + (1 | county))
:
Adding a group level predictorlmer(y ~ ilevel.var + (1 + ilevel.var | county))
:
Varying intercept, varying slopelmer(formula = y ~ ilevel.var + glevel.var + ilevel.var:glevel.var + (1 + ilevel.var | county))
:
Varying intercept, varying slope + group level predictor includedFormerly, LateX was the go-to language for many scientists (or word)
Currently, either do everything in R & [R Markdown]
Or write collaboratively in Google Docs or Microsoft Online and include graphs manually (annoying but great for writing)
In my view: No future for LaTeX
To import into Google Docs/Word choose route over html
Below the R version and package version that were used when the script was generated.
sessionInfo()
## R version 4.2.0 (2022-04-22 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17763)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.14 knitr_1.40 magrittr_2.0.3 tidyselect_1.1.2
## [5] R6_2.5.1 rlang_1.0.2 fastmap_1.1.0 fansi_1.0.3
## [9] stringr_1.4.1 dplyr_1.0.10 tools_4.2.0 xfun_0.30
## [13] utf8_1.2.2 DBI_1.1.3 cli_3.3.0 jquerylib_0.1.4
## [17] ellipsis_0.3.2 htmltools_0.5.2 assertthat_0.2.1 yaml_2.3.5
## [21] digest_0.6.29 tibble_3.1.6 lifecycle_1.0.1 purrr_0.3.4
## [25] sass_0.4.2 vctrs_0.4.1 glue_1.6.2 cachem_1.0.6
## [29] evaluate_0.16 rmarkdown_2.16 stringi_1.7.6 compiler_4.2.0
## [33] bslib_0.4.0 pillar_1.8.1 generics_0.1.3 jsonlite_1.8.0
## [37] pkgconfig_2.0.3
A script by Paul C. Bauer
mail@paulcbauer.de