CTRL + R
CTRL + ENTER
help.start()
or in the menuadd_count()
function
help.search("add_count")
add_count()
function (Explain
help file structure!)
?add_count
and help("add_count")
getwd()
: Display working directorysetwd()
: Set working directorydir()
: Display files in working directory\
\\
instead or /
in
directory paths (see below)#
: Start comment that is not evaluated# Example
getwd()
## [1] "C:/Users/Paul/Google Drive/2-Teaching/2022_R_Workshop"
a <- 10
or a = 10
: Assign something to
an objecta
: Type name to display content of object
a
rm()
: Delete object from workspacesave(a, b, file = "myobjects.RData")
: Save single
objects a
and b
as .RData
fileload("myobjects.RData")
: Load saved objectsls()
: Display workspace content# Comments in the script start with #
# Everything after # in the line is ignored by R
# Send code with CTRL + R
5+5
getwd() # Display working directory
# Create a new folder on your hard drive called "2016_03_R_course_EUI"
# Use this folder as your working directory, i.e. to store the downloaded files in there
# If you need the path to your folder, copy it from the path field in the explorer (in windows)
# The course files are available here: https://drive.google.com/folderview?id=0Bwer5wQoreiuMTRueFBGdDllNEk&usp=sharing
# Set your working directory to your new folder
setwd("C:/Users/paul/Google Drive/Teaching/2016_03_R_course_EUI")
dir() # Display content of working directory
a <- 50 # Generate object that contains the number 50
a # Show content of object
a = 70
a
ls() # Display objects in workspace
# Create a more complicated object
thesenseoflife <- c("worshipping god","having fun","no idea")
interesting <- thesenseoflife # define another object
interesting
# Q: How can I store the object "interesting" in the object "boring"?
# Objectnames do not have blanks
# Names should make sense and be systematic
# e.g. for dummy variables: d.male.voter
# e.g. variance of variables: var.income
a <- 1
b <- 2
# Save objects a and b in the file "objects-a-and-b.RData"
# in the working directory
save(a,b,file="objects-a-and-b.RData")
# Delete objects a and b
rm(a,b)
rm(list=ls())
ls() # display all objects
load("objects-a-and-b.RData") # load objects from wd
# Check if they were loaded
a
b
getwd()
hundert <- 100
xyz <- 250
hundert
ls()
save(hundert,xyz,file="workspace.RData")
rm(hundert,xyz)
load("workspace.RData")
ls()
+ - * / ^
& | == != > < >= <=
?function
)
exp(x)=e^x log(x) log10(x) sin(x) cos(x) tan(x)
abs(x) sqrt(x) ceiling(x) floor(x) trunc(x) round(x, digits=n)
ceiling()
function?# Calculations
Ergebnis <- (23+24)*11/(18+15)*5
Ergebnis
# Functions
log(2) # exp(1) = 2.718282^0.6931472
x <- seq(1,4,0.3)
ceiling(x)
floor(x)
##########################
# Comparisons.. examples #
##########################
x <- -3:3
x
# Equals
x == 0
# is smaller than
x < 0
# is bigger than or equal to
x >= 0
# unequal
x != 0
# not (equal to 0), i.e. the negation of x == 0
!(x == 0)
# bigger than -1 and smaller than 1
x > -1 & x < 1
# bigger than 1 and smaller than -1
x > 1 & x < -1
# bigger than 1 or smaller than -1
x > 1 | x < -1
mean()
function it is possible to use an
additional argument called trim
. Find out what this
argument does by checking the help file of the mean()
function and write it into your script.# 2.
((3+4-5)-9)^2
-99/33+42
log(1)
(sqrt(2))^2
# 3.
5==7
5*5>=6*4
sqrt(3)!=cos(17)
# 4.
# help(mean)
# ?mean
# the fraction (0 to 0.5) of observations to be trimmed from each end of x
# before the mean is computed.
Q: What is your experience with looking at data analysis code you have written 2 years earlier?
Comment your code
Use meaningful names!
trst
vs. trust
# Data import ####
styler
addin if you wantclass()
: Query object type"tbl_df"
): Newer type of dataframe (see
?
tbl_df-class``)as.numeric()
## [1] "integer"
## [1] 1 2 3 4 5 6 7 8 9
## [1] "numeric"
## [1] 1.3 2.4 3.5
## [1] "logical"
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [1] "character"
## [1] "a" "b" "c" "d" "f"
## [1] "matrix" "array"
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 11 12 13
## [1] "array"
## , , 1
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 6 11 16 21
## [2,] 2 7 12 17 22
## [3,] 3 8 13 18 23
## [4,] 4 9 14 19 24
## [5,] 5 10 15 20 25
##
## , , 2
##
## [,1] [,2] [,3] [,4] [,5]
## [1,] 26 31 36 41 46
## [2,] 27 32 37 42 47
## [3,] 28 33 38 43 48
## [4,] 29 34 39 44 49
## [5,] 30 35 40 45 50
## [1] "list"
## $a
## [1] 4 5 6 7 8
##
## $b
## [1] 1 2 3
##
## $c
## [1] "Ger" "FR" "It" "SE"
## [1] "data.frame"
## Fertility Agriculture Examination
## Courtelary 80.2 17.0 15
## Delemont 83.1 45.1 6
## Franches-Mnt 92.5 39.7 5
## Moutier 85.8 36.5 12
## Neuveville 76.9 43.5 17
## Porrentruy 76.1 35.3 9
## Broye 83.8 70.2 16
## Glane 92.4 67.8 14
## Gruyere 82.4 53.3 12
## Sarine 82.9 45.2 16
## [1] "tbl_df" "tbl" "data.frame"
## # A tibble: 47 x 6
## Fertility Agriculture Examination Education Catholic Infant.Mortality
## <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 80.2 17 15 12 9.96 22.2
## 2 83.1 45.1 6 9 84.8 22.2
## 3 92.5 39.7 5 5 93.4 20.2
## 4 85.8 36.5 12 7 33.8 20.3
## 5 76.9 43.5 17 15 5.16 20.6
## 6 76.1 35.3 9 7 90.6 26.6
## 7 83.8 70.2 16 7 92.8 23.6
## 8 92.4 67.8 14 8 97.2 24.9
## 9 82.4 53.3 12 7 97.7 21
## 10 82.9 45.2 16 13 91.4 24.4
## # ... with 37 more rows
## tibble [47 x 6] (S3: tbl_df/tbl/data.frame)
## $ Fertility : num [1:47] 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num [1:47] 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int [1:47] 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int [1:47] 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num [1:47] 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num [1:47] 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
## [1] "factor"
## [1] 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5
## Levels: 1 2 3 4 5
## [1] "ordered" "factor"
## [1] low medium high low medium high low medium high low
## [11] medium high low medium high low medium high low medium
## [21] high low medium high low medium high low medium high
## Levels: low < medium < high
## [1] "function"
## function(){x^2}
## [1] "function"
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x000002c2a8b706d0>
## <environment: namespace:base>
"integer"
, "numeric"
,
"logical"
, "character"
TRUE
and FALSE
c(1,2,3) >= 2
results in
FALSE TRUE TRUE
c("Markus", "Matthias", "David", "Till")
gives the
vector "Markus" "Matthias" "David" "Till"
names(object) <- charaktervektor
: Name more complex
dataclasses, e.g. a dataframec()
: “concatenate”,
e.g. c(1.2,"Hans",5.0,6.7)
length()
: Get vector length:
: indicates from/to for numbersrep("Peter",2)
: Repeat "Peter"
two
timesseq(5,8)
: Sequence from 5 to 8vector[positions]
[1] 1.20 3.50 5.00 6.70 8.00 10.00 13.55
[1]
= Position of the first element displayed in that
line in the vector (show it!)Inf
and -Inf
: Positive and negative
infinityNaN
: “Not a number”, e.g. calculate
0/0
NA
: Missing valuerbind()
: Combine vectors line-by-linecbind()
: Combine vectors column-by-column# Generate two vectors
# Integer vector holding the numbers 10 to 19
x <- 10:19
x
# Logical vector of the same length as x, alternating TRUE/FALSE
y <- c(TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE)
y
# seq(0,18,2)
# Character vector with the letters "a" to "j" (same length as x)
z <- c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
z
# Access vector elements
x[1] # 1st element of x
x[1:5] # First 5 elements of x
x[-(2:6)] # All elements but not positions 2 to 6
z[4] # ?
x[y] # Logical indexing: keeps the elements of x where y is TRUE
# NOTE(review): every value in 10:19 is < 16 or >= 12, so this condition is
# TRUE everywhere and all of x is returned; `&` may have been intended to
# select the values 12 to 15 -- confirm whether this is a deliberate example.
x[x< 16 | x >=12]
# Add names to elements of a vector
names(x) # NULL as long as x has no names
names(x) <- z # use the letters in z as element names
# or like this
another.vector <- c(a=1,b=2)
x
names(x)
# Combine vectors rowwise and columnwise
rbind(x,y) # each vector becomes one row
cbind(y,z) # each vector becomes one column
# Q: What did R do to the vectors? What class do they have?
# Access with names
names(x)
x[c("a","c")] # select the elements named "a" and "c"
x
of length 404 in which the numbers
199 to 400 are repeated two times. Generate a new vector y
,
that contains the elements of x
on the positions 53, 78 and
99.Freunde
that contains the
names of your three best friends.# 2.
x <- rep(1:5,10)
x
# 3.
x <- rep(199:400,2)
x[x>=240]
length(x)
y <- x[c(53, 78, 99)]
y
# 4.
freunde <- c("Platon", "Aristoteles", "Corbyn")
x
of length 101, that starts with the
number 33 and ends with the number 133.y
. Extract the elements on position 1 to 25 and
save them in a new vector z
. Join the two vectors both
column by column and subsequently line by line (rows!) and save the
results in two new objects colyz
and rowyz
.
What class do the last two objects that you created possess?x
) that are smaller than 57
or greater/equal than 83 and save them in a new object
subgroup
.# Vectors
# 1
# Integer sequence 33, 34, ..., 133 (101 elements)
x <- seq.int(from = 33L, to = 133L)
# 2.
# Two consecutive slices of 25 elements each: positions 1-25 and 26-50
z <- head(x, n = 25L)
y <- x[seq.int(from = 26L, to = 50L)]
# Column-wise join: 25 rows, 2 columns
yz <- cbind(y, z)
class(yz)
# Row-wise join: 2 rows, 25 columns
zy <- rbind(y, z)
zy
class(zy)
# cbind(y,z)[1:3,1:2]
# rbind(y,z)[1:2,1:5] # Both vectors must have the same number of elements!
# class(cbind(y,z))
# 3.
# Keep everything outside the interval [57, 83), i.e. x < 57 or x >= 83
subgroup <- x[!(x >= 57 & x < 83)]
subgroup
"factor"
factor(..., levels = c(...))
: Create an unordered
factor with levels ...
factor(..., ordered = TRUE)
: Create an ordered factor
levels=c(...), labels=c(...)
: Specify
levels/labelslevels()
: Display categoriesas.numeric()
: Convert factor to numeric vector"list"
list()
: Create a list (can also be dataframe!)list$switzerland
: Access element
switzerland
of the list list
list[2]
: Access second element of the list
list
list[[2]]
: Access content of second element of the list
list
# Q: How do I get help for the function factor()?
# Create factor (nominal variable): codes 1/2 are labelled "SPD"/"CDU"
x <- factor(rep(1:2, 10), levels = c(1, 2), labels = c("SPD", "CDU"))
# Q: How do I go about to understand what happens in the function?
x
levels(x)
as.numeric(x) # underlying integer codes of the factor
# Create factor (ordinal variable)
# ordered = TRUE is required to get an ordered factor (low < medium < high);
# without it factor() returns an unordered (nominal) factor.
y <- factor(rep(1:3, 10),
  levels = c(1, 2, 3),
  labels = c("low", "medium", "high"),
  ordered = TRUE
)
y
as.numeric(y) # underlying integer codes
as.character(y) # labels as a character vector
# Create a list with three named elements of different lengths
participants <- list(
  Teacher = "Rudi",
  Women = c("Daniela", "Johanna"),
  Men = c("Simon", "Peter", "usw.")
)
participants
length(participants) # number of list elements, not of names inside them
# Access elements or subsets of that list
participants$Teacher # content of the element "Teacher"
participants[1] # sub-list that contains only the first element
participants[[1]] # content of the first element
participants[["Teacher"]] # content of the element, accessed by name
# Q: How can I access the list element "Women"?
# Q: How can I access the Johanna who is in the element "Women"?
mylist
with two elements. The first
element first
contains the numbers 5 to 105. The second
element second
contains the numbers -1 to -50. Create
another vector x
that contains the 70th value of the first
element of mylist
and the 30th element of the second
element of mylist
.anotherlist
with four elements: A, B, C,
D. A contains the number 2. B contains a vector with the number 1 to 10.
C contains a character vector with the names “Bernd” “Julia” “Peter”. D
contains a vector with the numbers 1 to 100.names
.D
of the list and save them in an object names
xyz
.test <- rep(1:10,10)
. Convert test
to an
ordered factor. Check which categories the factor has and whether it’s
ordered.# 1.
# List with two named integer vectors: 5..105 and -1..-50
mylist <- list(first = 5:105, second = -1:-50)
mylist
# 70th value of `first` and 30th value of `second`
x <- c(mylist$first[70], mylist$second[30])
x
# 2.
anotherlist <- list(
  A = 2,
  B = 1:10,
  C = c("Bernd", "Julia", "Peter"),
  D = 1:100
)
# 3
# Two equivalent ways to extract element C: by position or by name
names <- anotherlist[[3]]
names <- anotherlist$C
# 4
# Elements 25 to 35 of D, selected by position ...
xyz <- anotherlist[[4]][25:35]
xyz <- anotherlist$D[25:35]
# ... and selected by value (same result here because D is 1:100)
anotherlist$D[anotherlist$D >= 25 & anotherlist$D <= 35]
# Q: What is the difference between the two approaches?
# Q: How would you access only the elements 25 and 35?
# 5.
test <- rep(1:10, 10)
# Store the converted factor, then check its categories and that it is ordered
test <- ordered(test)
levels(test)
is.ordered(test)
install.packages("packagename")
: Install packagelibrary(packagename)
: Load an installed packagedetach("package:packagename")
: Unload a packageremove.packages("packagename")
: Uninstall packagelibrary()
: Display all installed packageslibrary(help=packagename)
: Describe a packagesearch()
: Display all loaded packagesls("package:packagename")
: Display all objects within a
packagepackagename::bar
: Use just one object from a package without loading the whole package
(e.g. a function)pacman
package
pacman::p_load()
: Checks whether package is installed,
if not installs & loads package
p_load(ggplot2, tidyverse)
# Create a matrix
M <- diag(5)
M[1:20]<- c(1:20)
M
ginv(M) # Funkcion ginv()
# ginv calculates the Moore-Penrose-Inverse of the matrix M
# But it does not work here.. why?
help.search("ginv") # function is in the package MASS
install.packages("MASS") # Install package
MASS::ginv(M) # Call function without loading the whole package
library(MASS) # Load whole package
ginv(M)
ls("package:MASS") # Display content of the package MASS
detach("package:MASS") # Unload the package
ginv(M)
search()
library()
remove.packages("MASS")
library() # Package is not installed anymore
ggplot2
.ggplot2
.ggplot2
.ggplot2
.ggplot2
.install.packages("ggplot2")
library(ggplot2)
ls("package:ggplot2")
detach("package:ggplot2")
remove.packages("ggplot2")
"lists"
of vectors of the same length under the hood"data.frame"
(classic) and tibble
"tbl_df"
object$var1
: Access variable var1
in data
frame object
as.numeric(object$var1)
: Convert class of variable into
numericNA
: What was that?!?!data.frame()
: Create a data frame
tibble()
: Create tibble dataframeas.data.frame()
: Convert into a data frame
(cf. as_tibble()
)summary()
and str()
: Get oversight of a
data frame’s contenthead()
and tail()
: Inspect the dataView()
: Show data frame (formerly fix()
)
fix()
is open!names()
: Display variablenames (and use to rename)na.omit()
: Delete missings “listwise”, i.e. rows that
contain at least one missingis.na()
: Generate logical vector indicating the
missingsattach()
: When you attach a data frame you can use
variable names directly without referring to the dataframe (R
understands that)$
to access variableslm(...., data=yourdataframe)
getwd() # Get working directory
library(foreign) # Load package "foreign"
ls("package:foreign") # Check package content
?swiss # Check out the object
# What is this about?
swiss2 <- swiss # Load data set
# attach() function
View(swiss2)
attach(swiss2)
names(swiss2)
Education
detach(swiss2)
Education
swiss2$Education
# Get info on data set
str(swiss2)
summary(swiss2)
head(swiss2)
fix(swiss2)
tail(swiss2)
# Create a data frame
data <- data.frame(id=1:3, # !
weight=c(20,27,24),
size=c("small", "large", "medium"))
data
dataframe[rows,columns]
# Q: What does the following code do?
swiss[2:4, c(1,2,4)] # when do we need c() instead of ":"?
swiss[swiss$Fertility > 75 & swiss$Agriculture > 75, c(1:3)]
subset(swiss, Fertility > 75 & Agriculture > 75)[, c(1:3)]
swiss[, c("Fertility", "Agriculture")]
# We'll learn a more convenient function later on!
A “new” package dplyr written by Hadley Wickham/Romain Francois replaces many old functions for data management
Functions in dplyr
are highly performant (big data!)
and consistent
See this page for an excellent overview and the Data Wrangling Cheat Sheet
What could the following functions be used for?
filter()
, arrange()
,
select()
, distinct()
, mutate()
,
summarise()
, group_by()
,
recode()
select()
: Selects columns (can be used for renaming,
see rename()
)slice()
: Select subset of rowsfilter()
: Filter a subset of the rowsarrange()
: Reorders the rows of a data framedistinct()
: Returns unique values/rowsmutate()
: Add new columns to existing columns (see also
transmute()
)summarise()
: Collapses a data frame to a single row
(e.g. aggregation)group_by()
: Break data set into groups (of rows), to
apply above functions to each grouprecode()
: Recode variablesdplyr
it is possible to use the
%>%
operator
x %>% f(y)
turns into f(x, y)
:
x
(e.g. a data set) is inserted into the function on the
right of %>%
%>%
: Forces >
to be a function which
does the aboveswiss %>% select(Catholic) %>% summarise(mean(Catholic))
# install.packages("dplyr")
library(dplyr)
# ?swiss # Check out the data set
# fix(swiss)
# Filter
swiss %>% filter(Agriculture >= 60 & Fertility >= 70)
swiss[swiss$Agriculture >= 60 & swiss$Fertility >= 70, ] # Classic approach
# Q: What if I want all observations/rows with Catholic <= 50 ?
# Q: What if I want all observations/rows with Catholic <= 50 OR Catholic >= 80?
# Q: What if I want all observations/rows with Catholic <= 50 AND Catholic >= 80?
# Slice
swiss %>% slice(3:7)
swiss[3:7,] # Classic approach
# Q: What if I want the rows number 11 to 15 AND 18 to 20?
# Normally, names of observations (e.g. countrys) are not saved as row.names
# but simply in a variable
row.names(swiss)
order()
library(dplyr)
swiss %>% arrange(Education, Examination, Agriculture)
# Q: What if I want to sort according to examination first?
swiss %>% arrange(desc(Education), Examination)
desc(swiss$Education)
# Q: What if I want to sort Examination in descending order instead of Education?
# CLASSIC WAY
swiss[order(swiss$Education, swiss$Examination, swiss$Agriculture), ]
swiss[order(desc(swiss$Education)), ] # type ?order
names(dataframe) <- charactervector
rename()
function in different packages
(e.g. package reshape
)dplyr
# install.packages("dplyr")
library(dplyr)
names(swiss)
swiss %>% select(Fertility, Agriculture, Education) # Select columns by name
swiss %>% select(Agriculture:Education) # Variables from:to
swiss %>% select(-(Agriculture:Education)) # Variables without (from:to)
swiss %>% select(Geburtsrate = Fertility) # Select and rename (same as assigning arrow <-!)
swiss %>% rename(Geburtsrate = Fertility) # Alternative because select drops non-mentioned variables!
swiss %>% select(1,2)
# Q: How can I save that in a new data frame?
swiss %>% select(Agriculture:Education)
# Q: What do I have to type to extract the variables Catholic and Infant.Mortality
# from the swiss dataset and save them in a new object?
library(dplyr)
swiss %>%
select(Education) %>%
distinct(Education) # Ignore the rownames here (they are not meaningful)
swiss$Education
swiss %>%
select(Education, Examination) %>%
distinct(Education, Examination)
# Q: How many observations do I get if I extract the distinct values of swiss$Catholic?
# ????
# This is useful when pulling out the names of all present countries in a datafile
# CLASSIC WAY
unique(swiss$Education)
dplyr
contains the functions
mutate(), transform(), transmute()
swiss %>%
mutate(Examination10 = Examination*10,
FertAgri = Fertility - Agriculture)
# Q: What does the above function do?
# Don't forget to assign the result to a new object if you want to save it!
# mutate() allows you to refer to variables you just created, transform() doesn't
swiss %>%
mutate(Examination10 = Examination*10,
NewExi = Examination10 - 10)
swiss %>%
transform(Examination10 = Examination*10,
NewExi = Examination10 - 10)
# Use transmute() if you only want to keep the new variables
swiss %>%
transmute(Examination10 = Examination*10,
NewExi = Examination10 - 10)
# Q: What if I want to get a new data set only with the variable Catholic but divided by 10? What do I have to write?
swiss2 <- cbind(swiss, row.names(swiss))
. Compare
swiss
and swiss2
with View()
and
explain what the code does!swiss
(?swiss
) for all the
exercises below. First, extract columns 2 and 3 and save them in a new
object.swiss
(?swiss
) that have values on the variable
Agriculture
that are smaller or equal than 20 and bigger or
equal than 80 (Agriculture <= 20
or
>= 80
). Save the results in a new data frame, that
comprises the first 3 variables/columns of the old data set.Infant.Mortality
. Which province has the highest
level of Infant.Mortality
?Infant.Mortality.squared
that contains the squared values
of the variable Infant.Mortality
. Then rename the variable
Infant.Mortality.squared
into
Infant.Mortality.sq
.getwd()
library(dplyr)
# 2
swiss3 <- swiss %>%
select(2,3)
# 3
swiss3 <- swiss %>%
select(1,4) %>%
slice(2:6)
# 4
swiss3 <- swiss %>%
select(2,4,6) %>%
slice(c(1,3,6)) # Or slice(1,3,6)
# 5
df_new <- swiss %>%
filter(Agriculture <= 20 | Agriculture >= 80) %>%
select(1,2,3)
# 6
swiss %>%
arrange(Infant.Mortality) # Porrentruy
# 7
swiss %>%
mutate(Infant.Mortality.squared = Infant.Mortality^2) %>%
rename(Infant.Mortality.sq = Infant.Mortality.squared)
mapvalues()
: Recode a categorical vector
(plyr
package)cut()
: Recode a continuous variable into a categorical
one (base R,
no package needed)if_else(condition, value if TRUE, value if FALSE)
:
Recode with ifelse condition (dplyr
package)table(variable1, variablevar2, useNA = "always")
:
Contingency table for the two variablesstr()
and summary()
: Check whether
variables in the data set have expected distributions and beware of
missings!# install.packages("plyr")
swiss2 <- swiss # Make a copy of the data set
names(swiss2) # Display variables
str(swiss2)
summary(swiss2)
# Create a dummy for "Catholic"
# For recoding character variables simply refer to text with ""
# plyr is deliberately NOT attached with library(): loading plyr after dplyr
# masks dplyr verbs such as mutate() and summarise(), which breaks later
# group_by() %>% summarise() code. mapvalues() is called as plyr::mapvalues().
# if_else
swiss2 <- swiss2 %>%
  mutate(catholic.dummy3 = if_else(Catholic > 50, 1, 0))
table(swiss2$catholic.dummy3, swiss2$Catholic,
  useNA = "always"
) # check recoding
# mapvalues(): set the Examination values 3 and 37 to missing
swiss2 <- swiss2 %>%
  mutate(Examination2 = plyr::mapvalues(Examination,
    from = c(3, 37),
    to = c(NA, NA)
  ))
# cut(): intervals are open on the left and closed on the right, i.e.
# low = (-Inf, 12], medium = (12, 22], high = (22, Inf)
swiss2 <- swiss2 %>%
  mutate(Examination3 = cut(Examination2,
    breaks = c(-Inf, 12, 22, Inf),
    labels = c("low", "medium", "high")
  ))
# CLASSIC WAY
# MANUAL CLASSIC WAY
swiss2$catholic.dummy <- NA # generate new variable in dataset
View(swiss2)
swiss2$catholic.dummy[swiss2$Catholic <= 50] <- 0 # replace values conditional on Catholic
swiss2$catholic.dummy[swiss2$Catholic > 50] <- 1 # replace values conditional on Catholic
table(swiss2$catholic.dummy, swiss2$Catholic,
  useNA = "always"
) # check recoding
names(swiss2) # show variable names
# Rename by name instead of by position: column 7 is catholic.dummy3 at this
# point, so names(swiss2)[7] <- "catholic.dummy" would produce two columns
# with the same name.
names(swiss2)[names(swiss2) == "catholic.dummy3"] <- "catholic.dummy.ifelse" # Rename
swiss2
.Infant.Mortality
in your new data
set swiss2
so that values <= 18
are coded
as 0
, 18 < values <= 20
as
1
, 20 < values <= 21
as 2
and 21 < values <= 27
as 3
. Do this
using the cut()
function and name the respective variable
inf.mort.cut
. Check if your coding worked and check the
class of the new variable/object.Infant.Mortality
using
the if_else()
function called
Infant.Mortality.dummy
that is 0 for values below 20 and 1
for values equal or above 20.# 1.
swiss2 <- swiss
# 2.
# cut() is part of base R, so plyr is neither installed nor loaded here
# (attaching plyr after dplyr would also mask dplyr's mutate()/summarise()).
# Intervals are right-closed: (-Inf, 18] -> 0, (18, 20] -> 1, (20, 21] -> 2,
# (21, Inf) -> 3, matching the coding asked for in the exercise.
swiss2 <- swiss2 %>%
  mutate(inf.mort.cut = cut(Infant.Mortality,
    breaks = c(-Inf, 18, 20, 21, Inf),
    labels = 0:3
  ))
table(swiss2$inf.mort.cut, swiss2$Infant.Mortality) # check recoding
class(swiss2$inf.mort.cut) # cut() returns a factor
View(swiss2) # View() replaces the older fix() for inspecting the data
# Alternatively, label the factor variable descriptively with the argument
# labels=c("lowest", "low","high","highest")
# 3.
# Dummy: 1 for values equal to or above 20, 0 for values below 20
swiss2 <- swiss2 %>%
  mutate(Infant.Mortality.dummy = if_else(Infant.Mortality >= 20, 1, 0))
group_by()
: function to group a dataframe (break it
into groups)dplyr
functions recognize when data frame is
groupedgroup_by()
can also be used for aggregating data (e.g.,
mean within groups)# See http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html for this example
library(foreign)
# Read the Stata file; convert.factors = FALSE keeps value-labelled variables
# as they are instead of turning them into factors
essdata <- read.dta("./Material/ESS4e04_de.dta", convert.factors = FALSE)
View(essdata)
nrow(essdata) # Check number of rows
names(essdata)
# edulvl measures education levels
# Convert the data frame into a grouped data frame (grouped by the character
# variable edulvl_str) and save it in an object
data_grouped <- essdata %>%
  group_by(edulvl_str)
data_grouped # we can see the group variable and the dimensions
class(data_grouped) # we can see the new class
# summarise() collapses the grouped data frame to one row per group
data_agg <- data_grouped %>%
  summarise(
    n = n(), # Count rows per group (rows with missings are counted too)
    age.m = mean(age, na.rm = TRUE), # Group mean of age
    hheinkommen.m = mean(hheinkommen, na.rm = TRUE), # Group mean of hheinkommen
    NA.age = sum(is.na(age)), # Count missing on age variable
    N.age = sum(!is.na(age)) # Count non-missings on age variable
  )
View(data_agg)
# Classically we used aggregate()
# Sometimes we want to aggregate the data (e.g. calculate the average) but don't want to collapse the data
# Below we add a variable with group means but don't collapse the dataframe
# Group, calculate means but do not collapse
data_grouped2 <- data_grouped %>%
  mutate(hheinkommen.m = mean(hheinkommen, na.rm = TRUE)) %>%
  arrange(edulvl_str, hheinkommen) %>%
  select(edulvl_str, hheinkommen, hheinkommen.m) %>%
  ungroup() # drop the grouping so later verbs work on the whole data frame
library(foreign)
and
essdata <- read.dta("./Material/ESS4e04_de.dta", convert.factors=F)
.
Adapt your file path!religion_str
contains the religious
affiliation of respondents. Aggregate the data set - using functions
from dplyr
package - so that you obtain averages for
subgroups of religious affiliations for the variables
polinteresse
and trustparties
- as well as a
variable with the number of rows across the groups.library(foreign)
# Read the Stata file without converting value-labelled variables to factors
data <- read.dta("./Material/ESS4e04_de.dta", convert.factors = FALSE)
library(dplyr)
# One row per religious affiliation with group size and group means
data_agg <- data %>%
  group_by(religion_str) %>%
  summarise(
    count = n(), # Number of observations (rows) in the group
    mean.polinteresse = mean(polinteresse, na.rm = TRUE), # Group mean of polinteresse
    mean.trustparties = mean(trustparties, na.rm = TRUE) # Group mean of trustparties
  )
View(data_agg)
# Convert the tibble returned by summarise() into a classic data frame
data_agg <- data.frame(data_agg)
class(data_agg)
merge()
(see Quick
R)dplyr
is much fasterinner_join(x, y, by = NULL, copy = FALSE, ...) # all intersecting observations
left_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x
semi_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x that have a match in y (only columns of x are kept)
anti_join(x, y, by = NULL, copy = FALSE, ...) # all in x that are not in y
x
= first data set, y
= second data
setby = "var"
or by = c("var1", "var2")
?join
: Check out the corresponding helpfile# An example for merging
# Create dataset and convert rownames to variable
# Load dplyr before its verbs (mutate, select, slice, joins) are used
library(dplyr)
nrow(swiss)
# Copy the row names into a regular variable `region`, drop the row names and
# move `region` to the first column.
# remove_rownames() is a tibble function and is not available through dplyr,
# so it is called with an explicit namespace.
swiss2 <- swiss %>%
  mutate(region = rownames(swiss)) %>%
  tibble::remove_rownames() %>%
  select(region, everything())
# Generate 2 data frames that each possess part of the observations and the
# region variable
# Q: What new datasets do I generate below?
swiss.a <- swiss2 %>%
  slice(1:8) %>%
  select(1:3)
View(swiss.a)
swiss.b <- swiss2 %>%
  slice(c(1, 6:7, 12)) %>%
  select(c(1, 4:5))
View(swiss.b)
intersect(swiss.a$region, swiss.b$region) # check intersection, i.e.
# which regions appear in both data sets?
# MERGING OF THE TWO DATA FRAMES
?inner_join
swiss.inner <- inner_join(swiss.a, swiss.b, by = "region")
View(swiss.a)
View(swiss.inner) # data set with observations that intersect across both data sets
swiss.left <- left_join(swiss.a, swiss.b, by = "region")
View(swiss.left) # all observations in x = swiss.a
# Now try semi_join(), anti_join() FOR YOURSELF!
swiss2
and the objects swiss.a
and swiss.b
subsequently.swiss.c
that include the first 10
countries in the swiss data set and their values on the variables
region
, Catholic
and
Infant.Mortality
.swiss.c
with swiss.a
so that
the merged dataset contains all the countries that are contained in at
least one of the two data sets.getwd()
library(dplyr) # load before slice()/select()/full_join() are used
# Put the row names into a first column and call it "region"
swiss2 <- cbind(row.names(swiss), swiss)
names(swiss2)[1] <- "region"
# Generate 2 data sets that both contain the region column
swiss.a <- swiss2 %>%
  slice(5:36) %>% # rows 1-4 are missing
  select(1:3)
View(swiss.a)
swiss.c <- swiss2 %>%
  slice(1:10) %>%
  select("region", "Catholic", "Infant.Mortality")
View(swiss.c)
# Check which regions appear in both data sets that are merged below
intersect(swiss.a$region, swiss.c$region) # check intersection
?full_join
# full_join() keeps every region that occurs in at least one of the data sets
swiss.full <- full_join(swiss.a,
  swiss.c,
  by = "region"
)
View(swiss.full) # data set with all observations from either data set
data %>% group_by(...) %>% nest()
: Nest dataframe
by variable in group_by(...)
unnest(data, cols = "...")
: Unnest variables specified
in cols = "..."
# Usually call it "data" or "df"
# Create a dataframe
data1 <- data.frame(name = c("Peter", "Paul", "Mary"),
income = c(550, 640, 710))
data1
data2 <- data.frame(name= c("Julia", "Hans", "Tina"),
income = c(640, 320, 400))
data2
# Create a nested dataframe
data_nested <- tibble(groups = c("group1", "group2"),
data = list(data1, data2)) # Use list() for nested parts
data_nested
data_nested$data
data_nested$data[[1]]
data_nested$data[[2]]
# Unnest dataframe
data_unnested <- unnest(data_nested, cols = "data")
# Renest dataframe
data_nested <- data_unnested %>%
group_by(groups) %>%
nest()
df <- data.frame(name = c("Peter", "Paul", "Mary", "Julia", "Hans"),
gender = c("male", "female", "male", "female", "male"),
discipline = c("Sociology", "Sociology", "Polscience",
"Psychology", "Polscience"),
income = c(610, 640, 550, 710, 505))
discipline
. What
does the dataframe and the nested ones look in terms of dimensions?gender
. What does
the dataframe and the nested ones look in terms of dimensions?gender
and
discipline
. What does the dataframe and the nested ones
look in terms of dimensions?# 2.
df %>% group_by(discipline) %>% nest()
# 3.
df %>% group_by(gender) %>% nest()
# 4.
df %>% group_by(gender, discipline) %>% nest()
# 5.
df_nested <- df %>% group_by(gender, discipline) %>% nest()
df_nested$data[[1]]
mean(x)
, sd(x)
, var(x)
,
median(x)
, min(x)
, max(x)
,
cov()
, cor()
,
cor(x,y,method="kendall")
, table(x)
,
table(x,y)
, prop.table(table(x))
,
100*prop.table(table(x))
mean(x)
: Meansd(x)
: Standarddeviationvar(x)
: Variancemedian(x)
: Medianmin(x)
: Minimummax(x)
: Maximumcov()
: Covariancecor()
: Correlationcor(x,y,method="kendall")
# Kendall’s tau?cor
: Check helpfiletable(x)
: Onedimensional contingency tabletable(x,y)
: Twodimensional contingency tableprop.table(table(x))
: Onedimensional contingency table
with proportions100*prop.table(table(x))
: Onedimensional contingency
table with proportions in percentlibrary(foreign)
# Inspect the data. View() replaces the older fix(), which opens an editor and
# stores an edited copy of the data in the workspace.
# fix(swiss)
if (interactive()) View(swiss) # the viewer is only available interactively
# Contingency table for objects
# swiss is not attached, so the variable has to be accessed with `$`
table(swiss$Catholic)
table(swiss$Catholic, swiss$Education)
# Table with percentages
100 * prop.table(table(swiss$Catholic))
100 * prop.table(table(swiss$Catholic, swiss$Education))
round(100 * prop.table(table(swiss$Catholic, swiss$Education)), 2) # rounded values
# Mean
mean(swiss$Catholic)
# Median
median(swiss$Catholic) # "middle value"
sort(swiss$Catholic)
# Save several statistics in one vector
x <- swiss$Catholic
c(mean = mean(x), median = median(x), stddev = sd(x), min = min(x), max = max(x))
# or
summary(x)
# Correlations between vectors
cor(swiss$Catholic, swiss$Education, method = "spearman") # rank correlation
cov(swiss$Catholic, swiss$Education) # covariance
Assault
and
UrbanPop
(dataframe USArrests
) in two vectors
x
and y
. Calculate for both vectors
x
and y
the mean, the maximum, the minimum and
the variance and save all the results in two objects
statistics.x
and statistics.y
. Name the
elements of these two objects “Mean”, “Max” etc. Save
statistics.x
and statistics.y
as list elements
in one list. Finally, check wether the two vectors x
and
y
strongly correlate with each other.### 1.
x <- 1:8
x
1 * 2 * 3 * 4 * 5 * 6 * 7 * 8
cumprod(x) # cumulative products; the last element equals the product above
### 2.
?USArrests # Information on the object
x <- USArrests$Assault
y <- USArrests$UrbanPop
# Named vectors with mean, maximum, minimum and variance of each variable
statistics.x <- c(Mean = mean(x), Max = max(x), Min = min(x), Var = var(x))
statistics.x
# Computed from y (not x), otherwise both objects would be identical
statistics.y <- c(Mean = mean(y), Max = max(y), Min = min(y), Var = var(y))
statistics.y
# Store both statistics vectors as elements of one list
liste <- list(statistics.x = statistics.x, statistics.y = statistics.y)
liste
x
y
# Pearson correlation between Assault and UrbanPop
cor(x, y)