Big data and Social Science

14.8 DPLYR: Grammar of data management (Hadley Wickham)

A “new” package dplyr written by Hadley Wickham/Romain Francois replaces many old functions for data management
Functions in dplyr are highly performant (big data!) and consistent
See this page for an excellent overview and the Data Wrangling Cheat Sheet

Functions
- filter(): Select a subset of the rows of a data frame (see also slice())
- arrange(): Reorders the rows of a data frame
- select(): Selects columns (can be used for renaming, see rename())
- distinct(): Returns unique values in a table
- mutate(): Add new columns to existing columns (see also transmute())
- summarise(): Collapses a data frame to a single row (e.g. aggregation)
- group_by(): Break data set into groups (of rows), to apply above functions to each group

14.8.1 filter() & slice()

Selecting rows = selecting observations (e.g. countries, individuals etc.)
Example

# install.packages("dplyr")
library(dplyr)

# ?swiss # Check out the data set
# fix(swiss)

# Filter
filter(swiss, Agriculture >= 60 & Fertility >= 70)

Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality
83.8	70.2	16	7	92.85	23.6
92.4	67.8	14	8	97.16	24.9
87.1	64.5	14	6	98.61	24.5
72.5	71.2	12	1	2.40	21.0
72.0	63.5	6	3	2.56	18.0
75.5	85.9	3	2	99.71	15.1
77.3	89.7	5	2	100.00	18.3
70.5	78.2	12	6	98.96	19.4
79.4	64.9	7	3	98.22	20.2
92.2	84.6	3	3	99.46	16.3
79.3	63.1	13	13	96.83	18.1

swiss[swiss$Agriculture >= 60 & swiss$Fertility >= 70, ] # Classic approach

	Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality
Broye	83.8	70.2	16	7	92.85	23.6
Glane	92.4	67.8	14	8	97.16	24.9
Veveyse	87.1	64.5	14	6	98.61	24.5
Oron	72.5	71.2	12	1	2.40	21.0
Paysd’enhaut	72.0	63.5	6	3	2.56	18.0
Conthey	75.5	85.9	3	2	99.71	15.1
Herens	77.3	89.7	5	2	100.00	18.3
Martigwy	70.5	78.2	12	6	98.96	19.4
Monthey	79.4	64.9	7	3	98.22	20.2
Sierre	92.2	84.6	3	3	99.46	16.3
Sion	79.3	63.1	13	13	96.83	18.1

# Q: What if I want all observations/rows with Catholic <= 50 ?
# Q: What if I want all observations/rows with Catholic <= 50 OR Catholic  Catholic >= 80?
# Q: What if I want all observations/rows with Catholic <= 50 AND Catholic  Catholic >= 80?



# Slice
slice(swiss, 3:7)

Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality
92.5	39.7	5	5	93.40	20.2
85.8	36.5	12	7	33.77	20.3
76.9	43.5	17	15	5.16	20.6
76.1	35.3	9	7	90.57	26.6
83.8	70.2	16	7	92.85	23.6

swiss[3:7,] # Classic approach

	Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality
Franches-Mnt	92.5	39.7	5	5	93.40	20.2
Moutier	85.8	36.5	12	7	33.77	20.3
Neuveville	76.9	43.5	17	15	5.16	20.6
Porrentruy	76.1	35.3	9	7	90.57	26.6
Broye	83.8	70.2	16	7	92.85	23.6

# Q: What if I want the rows number 11 to 15 AND 18 to 20?



# Normally, names of observations (e.g. countrys) are not saved as row.names
# but simply in a variable
row.names(swiss)

##  [1] "Courtelary"   "Delemont"     "Franches-Mnt" "Moutier"     
##  [5] "Neuveville"   "Porrentruy"   "Broye"        "Glane"       
##  [9] "Gruyere"      "Sarine"       "Veveyse"      "Aigle"       
## [13] "Aubonne"      "Avenches"     "Cossonay"     "Echallens"   
## [17] "Grandson"     "Lausanne"     "La Vallee"    "Lavaux"      
## [21] "Morges"       "Moudon"       "Nyone"        "Orbe"        
## [25] "Oron"         "Payerne"      "Paysd'enhaut" "Rolle"       
## [29] "Vevey"        "Yverdon"      "Conthey"      "Entremont"   
## [33] "Herens"       "Martigwy"     "Monthey"      "St Maurice"  
## [37] "Sierre"       "Sion"         "Boudry"       "La Chauxdfnd"
## [41] "Le Locle"     "Neuchatel"    "Val de Ruz"   "ValdeTravers"
## [45] "V. De Geneve" "Rive Droite"  "Rive Gauche"

14.8.2 arrange(): Reorder/sort rows

Classic way is order()

library(dplyr)
arrange(swiss, Education, Examination, Agriculture)
# Q: What if I want to sort according to examination first?

arrange(swiss, desc(Education), Examination)
desc(swiss$Education)
# Q: What if I want to sort Examination in descending order instead of Education?


# CLASSIC WAY
swiss[order(swiss$Education, swiss$Examination, swiss$Agriculture), ]
swiss[order(desc(swiss$Education)), ] # type ?order

14.8.3 select(): Subsetting and renaming

Classic way
- names(dataframe) <- charactervector
- or rename() function in different packages (e.g. package reshape)
- better with dplyr

library(dplyr)
select(swiss, Fertility, Agriculture, Education) # Select columns by name

	Fertility	Agriculture	Education
Courtelary	80.2	17.0	12
Delemont	83.1	45.1	9
Franches-Mnt	92.5	39.7	5
Moutier	85.8	36.5	7
Neuveville	76.9	43.5	15
Porrentruy	76.1	35.3	7
Broye	83.8	70.2	7
Glane	92.4	67.8	8
Gruyere	82.4	53.3	7
Sarine	82.9	45.2	13
Veveyse	87.1	64.5	6
Aigle	64.1	62.0	12
Aubonne	66.9	67.5	7
Avenches	68.9	60.7	12
Cossonay	61.7	69.3	5
Echallens	68.3	72.6	2
Grandson	71.7	34.0	8
Lausanne	55.7	19.4	28
La Vallee	54.3	15.2	20
Lavaux	65.1	73.0	9
Morges	65.5	59.8	10
Moudon	65.0	55.1	3
Nyone	56.6	50.9	12
Orbe	57.4	54.1	6
Oron	72.5	71.2	1
Payerne	74.2	58.1	8
Paysd’enhaut	72.0	63.5	3
Rolle	60.5	60.8	10
Vevey	58.3	26.8	19
Yverdon	65.4	49.5	8
Conthey	75.5	85.9	2
Entremont	69.3	84.9	6
Herens	77.3	89.7	2
Martigwy	70.5	78.2	6
Monthey	79.4	64.9	3
St Maurice	65.0	75.9	9
Sierre	92.2	84.6	3
Sion	79.3	63.1	13
Boudry	70.4	38.4	12
La Chauxdfnd	65.7	7.7	11
Le Locle	72.7	16.7	13
Neuchatel	64.4	17.6	32
Val de Ruz	77.6	37.6	7
ValdeTravers	67.6	18.7	7
V. De Geneve	35.0	1.2	53
Rive Droite	44.7	46.6	29
Rive Gauche	42.8	27.7	29

select(swiss, Agriculture:Education) # Variables from:to

	Agriculture	Examination	Education
Courtelary	17.0	15	12
Delemont	45.1	6	9
Franches-Mnt	39.7	5	5
Moutier	36.5	12	7
Neuveville	43.5	17	15
Porrentruy	35.3	9	7
Broye	70.2	16	7
Glane	67.8	14	8
Gruyere	53.3	12	7
Sarine	45.2	16	13
Veveyse	64.5	14	6
Aigle	62.0	21	12
Aubonne	67.5	14	7
Avenches	60.7	19	12
Cossonay	69.3	22	5
Echallens	72.6	18	2
Grandson	34.0	17	8
Lausanne	19.4	26	28
La Vallee	15.2	31	20
Lavaux	73.0	19	9
Morges	59.8	22	10
Moudon	55.1	14	3
Nyone	50.9	22	12
Orbe	54.1	20	6
Oron	71.2	12	1
Payerne	58.1	14	8
Paysd’enhaut	63.5	6	3
Rolle	60.8	16	10
Vevey	26.8	25	19
Yverdon	49.5	15	8
Conthey	85.9	3	2
Entremont	84.9	7	6
Herens	89.7	5	2
Martigwy	78.2	12	6
Monthey	64.9	7	3
St Maurice	75.9	9	9
Sierre	84.6	3	3
Sion	63.1	13	13
Boudry	38.4	26	12
La Chauxdfnd	7.7	29	11
Le Locle	16.7	22	13
Neuchatel	17.6	35	32
Val de Ruz	37.6	15	7
ValdeTravers	18.7	25	7
V. De Geneve	1.2	37	53
Rive Droite	46.6	16	29
Rive Gauche	27.7	22	29

select(swiss, -(Agriculture:Education)) # Variables without (from:to)

	Fertility	Catholic	Infant.Mortality
Courtelary	80.2	9.96	22.2
Delemont	83.1	84.84	22.2
Franches-Mnt	92.5	93.40	20.2
Moutier	85.8	33.77	20.3
Neuveville	76.9	5.16	20.6
Porrentruy	76.1	90.57	26.6
Broye	83.8	92.85	23.6
Glane	92.4	97.16	24.9
Gruyere	82.4	97.67	21.0
Sarine	82.9	91.38	24.4
Veveyse	87.1	98.61	24.5
Aigle	64.1	8.52	16.5
Aubonne	66.9	2.27	19.1
Avenches	68.9	4.43	22.7
Cossonay	61.7	2.82	18.7
Echallens	68.3	24.20	21.2
Grandson	71.7	3.30	20.0
Lausanne	55.7	12.11	20.2
La Vallee	54.3	2.15	10.8
Lavaux	65.1	2.84	20.0
Morges	65.5	5.23	18.0
Moudon	65.0	4.52	22.4
Nyone	56.6	15.14	16.7
Orbe	57.4	4.20	15.3
Oron	72.5	2.40	21.0
Payerne	74.2	5.23	23.8
Paysd’enhaut	72.0	2.56	18.0
Rolle	60.5	7.72	16.3
Vevey	58.3	18.46	20.9
Yverdon	65.4	6.10	22.5
Conthey	75.5	99.71	15.1
Entremont	69.3	99.68	19.8
Herens	77.3	100.00	18.3
Martigwy	70.5	98.96	19.4
Monthey	79.4	98.22	20.2
St Maurice	65.0	99.06	17.8
Sierre	92.2	99.46	16.3
Sion	79.3	96.83	18.1
Boudry	70.4	5.62	20.3
La Chauxdfnd	65.7	13.79	20.5
Le Locle	72.7	11.22	18.9
Neuchatel	64.4	16.92	23.0
Val de Ruz	77.6	4.97	20.0
ValdeTravers	67.6	8.65	19.5
V. De Geneve	35.0	42.34	18.0
Rive Droite	44.7	50.43	18.2
Rive Gauche	42.8	58.33	19.3

select(swiss, Geburtsrate = Fertility) # Select and rename

	Geburtsrate
Courtelary	80.2
Delemont	83.1
Franches-Mnt	92.5
Moutier	85.8
Neuveville	76.9
Porrentruy	76.1
Broye	83.8
Glane	92.4
Gruyere	82.4
Sarine	82.9
Veveyse	87.1
Aigle	64.1
Aubonne	66.9
Avenches	68.9
Cossonay	61.7
Echallens	68.3
Grandson	71.7
Lausanne	55.7
La Vallee	54.3
Lavaux	65.1
Morges	65.5
Moudon	65.0
Nyone	56.6
Orbe	57.4
Oron	72.5
Payerne	74.2
Paysd’enhaut	72.0
Rolle	60.5
Vevey	58.3
Yverdon	65.4
Conthey	75.5
Entremont	69.3
Herens	77.3
Martigwy	70.5
Monthey	79.4
St Maurice	65.0
Sierre	92.2
Sion	79.3
Boudry	70.4
La Chauxdfnd	65.7
Le Locle	72.7
Neuchatel	64.4
Val de Ruz	77.6
ValdeTravers	67.6
V. De Geneve	35.0
Rive Droite	44.7
Rive Gauche	42.8

dplyr::rename(swiss, Geburtsrate = Fertility) # Alternative because select drops non-mentioned variables!

	Geburtsrate	Agriculture	Examination	Education	Catholic	Infant.Mortality
Courtelary	80.2	17.0	15	12	9.96	22.2
Delemont	83.1	45.1	6	9	84.84	22.2
Franches-Mnt	92.5	39.7	5	5	93.40	20.2
Moutier	85.8	36.5	12	7	33.77	20.3
Neuveville	76.9	43.5	17	15	5.16	20.6
Porrentruy	76.1	35.3	9	7	90.57	26.6
Broye	83.8	70.2	16	7	92.85	23.6
Glane	92.4	67.8	14	8	97.16	24.9
Gruyere	82.4	53.3	12	7	97.67	21.0
Sarine	82.9	45.2	16	13	91.38	24.4
Veveyse	87.1	64.5	14	6	98.61	24.5
Aigle	64.1	62.0	21	12	8.52	16.5
Aubonne	66.9	67.5	14	7	2.27	19.1
Avenches	68.9	60.7	19	12	4.43	22.7
Cossonay	61.7	69.3	22	5	2.82	18.7
Echallens	68.3	72.6	18	2	24.20	21.2
Grandson	71.7	34.0	17	8	3.30	20.0
Lausanne	55.7	19.4	26	28	12.11	20.2
La Vallee	54.3	15.2	31	20	2.15	10.8
Lavaux	65.1	73.0	19	9	2.84	20.0
Morges	65.5	59.8	22	10	5.23	18.0
Moudon	65.0	55.1	14	3	4.52	22.4
Nyone	56.6	50.9	22	12	15.14	16.7
Orbe	57.4	54.1	20	6	4.20	15.3
Oron	72.5	71.2	12	1	2.40	21.0
Payerne	74.2	58.1	14	8	5.23	23.8
Paysd’enhaut	72.0	63.5	6	3	2.56	18.0
Rolle	60.5	60.8	16	10	7.72	16.3
Vevey	58.3	26.8	25	19	18.46	20.9
Yverdon	65.4	49.5	15	8	6.10	22.5
Conthey	75.5	85.9	3	2	99.71	15.1
Entremont	69.3	84.9	7	6	99.68	19.8
Herens	77.3	89.7	5	2	100.00	18.3
Martigwy	70.5	78.2	12	6	98.96	19.4
Monthey	79.4	64.9	7	3	98.22	20.2
St Maurice	65.0	75.9	9	9	99.06	17.8
Sierre	92.2	84.6	3	3	99.46	16.3
Sion	79.3	63.1	13	13	96.83	18.1
Boudry	70.4	38.4	26	12	5.62	20.3
La Chauxdfnd	65.7	7.7	29	11	13.79	20.5
Le Locle	72.7	16.7	22	13	11.22	18.9
Neuchatel	64.4	17.6	35	32	16.92	23.0
Val de Ruz	77.6	37.6	15	7	4.97	20.0
ValdeTravers	67.6	18.7	25	7	8.65	19.5
V. De Geneve	35.0	1.2	37	53	42.34	18.0
Rive Droite	44.7	46.6	16	29	50.43	18.2
Rive Gauche	42.8	27.7	22	29	58.33	19.3

# Q: How can I save that in a new data frame?
select(swiss, Agriculture:Education)

	Agriculture	Examination	Education
Courtelary	17.0	15	12
Delemont	45.1	6	9
Franches-Mnt	39.7	5	5
Moutier	36.5	12	7
Neuveville	43.5	17	15
Porrentruy	35.3	9	7
Broye	70.2	16	7
Glane	67.8	14	8
Gruyere	53.3	12	7
Sarine	45.2	16	13
Veveyse	64.5	14	6
Aigle	62.0	21	12
Aubonne	67.5	14	7
Avenches	60.7	19	12
Cossonay	69.3	22	5
Echallens	72.6	18	2
Grandson	34.0	17	8
Lausanne	19.4	26	28
La Vallee	15.2	31	20
Lavaux	73.0	19	9
Morges	59.8	22	10
Moudon	55.1	14	3
Nyone	50.9	22	12
Orbe	54.1	20	6
Oron	71.2	12	1
Payerne	58.1	14	8
Paysd’enhaut	63.5	6	3
Rolle	60.8	16	10
Vevey	26.8	25	19
Yverdon	49.5	15	8
Conthey	85.9	3	2
Entremont	84.9	7	6
Herens	89.7	5	2
Martigwy	78.2	12	6
Monthey	64.9	7	3
St Maurice	75.9	9	9
Sierre	84.6	3	3
Sion	63.1	13	13
Boudry	38.4	26	12
La Chauxdfnd	7.7	29	11
Le Locle	16.7	22	13
Neuchatel	17.6	35	32
Val de Ruz	37.6	15	7
ValdeTravers	18.7	25	7
V. De Geneve	1.2	37	53
Rive Droite	46.6	16	29
Rive Gauche	27.7	22	29

# Q: What do I have to type to extract the variables Catholic and Infant.Mortality
# from the swiss dataset and save them in a new object?


# CLASSIC WAY
swiss2 <- swiss
names(swiss2)[names(swiss2)=="Agriculture"] <- "test"
# Q: How would you go about to understand this term?

14.8.4 distinct() & unique(): Extract distinct/unique rows/values

library(dplyr)
distinct(select(swiss, Education))

Education
12
9
5
7
15
8
13
6
2
28
20
10
3
1
19
11
32
53
29

swiss$Education

##  [1] 12  9  5  7 15  7  7  8  7 13  6 12  7 12  5  2  8 28 20  9 10  3 12
## [24]  6  1  8  3 10 19  8  2  6  2  6  3  9  3 13 12 11 13 32  7  7 53 29
## [47] 29

distinct(select(swiss, Education, Examination))

Education	Examination
12	15
9	6
5	5
7	12
15	17
7	9
7	16
8	14
13	16
6	14
12	21
7	14
12	19
5	22
2	18
8	17
28	26
20	31
9	19
10	22
3	14
12	22
6	20
1	12
3	6
10	16
19	25
8	15
2	3
6	7
2	5
6	12
3	7
9	9
3	3
13	13
12	26
11	29
13	22
32	35
7	15
7	25
53	37
29	16
29	22

# Q: How many observations do I get if I extract the distinct values of swiss$Catholic?

# ????

# This is useful when pulling out the names of all present countries in a datafile

# CLASSIC WAY
unique(swiss$Education)

##  [1] 12  9  5  7 15  8 13  6  2 28 20 10  3  1 19 11 32 53 29

14.8.5 mutate() & transform ()

Transformations are executed line-by-line
dplyr contains the functions mutate(), transform(), transmute()

library(dplyr)
mutate(swiss, Examination10 = Examination*10, FertAgri = Fertility - Agriculture)

	Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality	Examination10	FertAgri
Courtelary	80.2	17.0	15	12	9.96	22.2	150	63.2
Delemont	83.1	45.1	6	9	84.84	22.2	60	38.0
Franches-Mnt	92.5	39.7	5	5	93.40	20.2	50	52.8
Moutier	85.8	36.5	12	7	33.77	20.3	120	49.3
Neuveville	76.9	43.5	17	15	5.16	20.6	170	33.4
Porrentruy	76.1	35.3	9	7	90.57	26.6	90	40.8
Broye	83.8	70.2	16	7	92.85	23.6	160	13.6
Glane	92.4	67.8	14	8	97.16	24.9	140	24.6
Gruyere	82.4	53.3	12	7	97.67	21.0	120	29.1
Sarine	82.9	45.2	16	13	91.38	24.4	160	37.7
Veveyse	87.1	64.5	14	6	98.61	24.5	140	22.6
Aigle	64.1	62.0	21	12	8.52	16.5	210	2.1
Aubonne	66.9	67.5	14	7	2.27	19.1	140	-0.6
Avenches	68.9	60.7	19	12	4.43	22.7	190	8.2
Cossonay	61.7	69.3	22	5	2.82	18.7	220	-7.6
Echallens	68.3	72.6	18	2	24.20	21.2	180	-4.3
Grandson	71.7	34.0	17	8	3.30	20.0	170	37.7
Lausanne	55.7	19.4	26	28	12.11	20.2	260	36.3
La Vallee	54.3	15.2	31	20	2.15	10.8	310	39.1
Lavaux	65.1	73.0	19	9	2.84	20.0	190	-7.9
Morges	65.5	59.8	22	10	5.23	18.0	220	5.7
Moudon	65.0	55.1	14	3	4.52	22.4	140	9.9
Nyone	56.6	50.9	22	12	15.14	16.7	220	5.7
Orbe	57.4	54.1	20	6	4.20	15.3	200	3.3
Oron	72.5	71.2	12	1	2.40	21.0	120	1.3
Payerne	74.2	58.1	14	8	5.23	23.8	140	16.1
Paysd’enhaut	72.0	63.5	6	3	2.56	18.0	60	8.5
Rolle	60.5	60.8	16	10	7.72	16.3	160	-0.3
Vevey	58.3	26.8	25	19	18.46	20.9	250	31.5
Yverdon	65.4	49.5	15	8	6.10	22.5	150	15.9
Conthey	75.5	85.9	3	2	99.71	15.1	30	-10.4
Entremont	69.3	84.9	7	6	99.68	19.8	70	-15.6
Herens	77.3	89.7	5	2	100.00	18.3	50	-12.4
Martigwy	70.5	78.2	12	6	98.96	19.4	120	-7.7
Monthey	79.4	64.9	7	3	98.22	20.2	70	14.5
St Maurice	65.0	75.9	9	9	99.06	17.8	90	-10.9
Sierre	92.2	84.6	3	3	99.46	16.3	30	7.6
Sion	79.3	63.1	13	13	96.83	18.1	130	16.2
Boudry	70.4	38.4	26	12	5.62	20.3	260	32.0
La Chauxdfnd	65.7	7.7	29	11	13.79	20.5	290	58.0
Le Locle	72.7	16.7	22	13	11.22	18.9	220	56.0
Neuchatel	64.4	17.6	35	32	16.92	23.0	350	46.8
Val de Ruz	77.6	37.6	15	7	4.97	20.0	150	40.0
ValdeTravers	67.6	18.7	25	7	8.65	19.5	250	48.9
V. De Geneve	35.0	1.2	37	53	42.34	18.0	370	33.8
Rive Droite	44.7	46.6	16	29	50.43	18.2	160	-1.9
Rive Gauche	42.8	27.7	22	29	58.33	19.3	220	15.1

# Q: What does the above function do?

# Don't forget to assign the result to a new object if you want to save it!

# mutate() allows you to refer to variable you just created, transform() doesnt
mutate(swiss, Examination10 = Examination*10, NewExi = Examination10 - 10)

	Fertility	Agriculture	Examination	Education	Catholic	Infant.Mortality	Examination10	NewExi
Courtelary	80.2	17.0	15	12	9.96	22.2	150	140
Delemont	83.1	45.1	6	9	84.84	22.2	60	50
Franches-Mnt	92.5	39.7	5	5	93.40	20.2	50	40
Moutier	85.8	36.5	12	7	33.77	20.3	120	110
Neuveville	76.9	43.5	17	15	5.16	20.6	170	160
Porrentruy	76.1	35.3	9	7	90.57	26.6	90	80
Broye	83.8	70.2	16	7	92.85	23.6	160	150
Glane	92.4	67.8	14	8	97.16	24.9	140	130
Gruyere	82.4	53.3	12	7	97.67	21.0	120	110
Sarine	82.9	45.2	16	13	91.38	24.4	160	150
Veveyse	87.1	64.5	14	6	98.61	24.5	140	130
Aigle	64.1	62.0	21	12	8.52	16.5	210	200
Aubonne	66.9	67.5	14	7	2.27	19.1	140	130
Avenches	68.9	60.7	19	12	4.43	22.7	190	180
Cossonay	61.7	69.3	22	5	2.82	18.7	220	210
Echallens	68.3	72.6	18	2	24.20	21.2	180	170
Grandson	71.7	34.0	17	8	3.30	20.0	170	160
Lausanne	55.7	19.4	26	28	12.11	20.2	260	250
La Vallee	54.3	15.2	31	20	2.15	10.8	310	300
Lavaux	65.1	73.0	19	9	2.84	20.0	190	180
Morges	65.5	59.8	22	10	5.23	18.0	220	210
Moudon	65.0	55.1	14	3	4.52	22.4	140	130
Nyone	56.6	50.9	22	12	15.14	16.7	220	210
Orbe	57.4	54.1	20	6	4.20	15.3	200	190
Oron	72.5	71.2	12	1	2.40	21.0	120	110
Payerne	74.2	58.1	14	8	5.23	23.8	140	130
Paysd’enhaut	72.0	63.5	6	3	2.56	18.0	60	50
Rolle	60.5	60.8	16	10	7.72	16.3	160	150
Vevey	58.3	26.8	25	19	18.46	20.9	250	240
Yverdon	65.4	49.5	15	8	6.10	22.5	150	140
Conthey	75.5	85.9	3	2	99.71	15.1	30	20
Entremont	69.3	84.9	7	6	99.68	19.8	70	60
Herens	77.3	89.7	5	2	100.00	18.3	50	40
Martigwy	70.5	78.2	12	6	98.96	19.4	120	110
Monthey	79.4	64.9	7	3	98.22	20.2	70	60
St Maurice	65.0	75.9	9	9	99.06	17.8	90	80
Sierre	92.2	84.6	3	3	99.46	16.3	30	20
Sion	79.3	63.1	13	13	96.83	18.1	130	120
Boudry	70.4	38.4	26	12	5.62	20.3	260	250
La Chauxdfnd	65.7	7.7	29	11	13.79	20.5	290	280
Le Locle	72.7	16.7	22	13	11.22	18.9	220	210
Neuchatel	64.4	17.6	35	32	16.92	23.0	350	340
Val de Ruz	77.6	37.6	15	7	4.97	20.0	150	140
ValdeTravers	67.6	18.7	25	7	8.65	19.5	250	240
V. De Geneve	35.0	1.2	37	53	42.34	18.0	370	360
Rive Droite	44.7	46.6	16	29	50.43	18.2	160	150
Rive Gauche	42.8	27.7	22	29	58.33	19.3	220	210

# transform(swiss, Examination10 = Examination*10, NewExi = Examination10 - 10)

# Use transmute if you only wan't to keep new variables
transmute(swiss, Examination10 = Examination*10, NewExi = Examination10 - 10)

Examination10	NewExi
150	140
60	50
50	40
120	110
170	160
90	80
160	150
140	130
120	110
160	150
140	130
210	200
140	130
190	180
220	210
180	170
170	160
260	250
310	300
190	180
220	210
140	130
220	210
200	190
120	110
140	130
60	50
160	150
250	240
150	140
30	20
70	60
50	40
120	110
70	60
90	80
30	20
130	120
260	250
290	280
220	210
350	340
150	140
250	240
370	360
160	150
220	210

# Q: What if I want to get a new data set only with the variable Catholic but dividid by 10? What do I have to write?

14.8.5.1 Exercise: Filtering, reordering, selecting/renaming, extracting and transforming

Execute the following code: swiss2 <- cbind(swiss, row.names(swiss)). Compare swiss and swiss2 with View() and explain what it does!
Use the dataset swiss (?swiss).
1. Extract columns 2 and 3 and save them in a new object. (Tip: Access through names() or simply the column numbers)
2. Extract the rows 2 to 6 from columns 1 and 4 and save them in a new object.
3. Extract observations 1, 3, and 6 from columns 2, 4, 6 and save them in a new object.
Extract all provinces from the data set swiss (?swiss) that have values on the variable Agriculture that are smaller or equal than 20 and bigger or equal than 80 (Agriculture <= 20 or >= 80). Save the results in a new data frame, that comprises the first 3 variables/columns of the old data set.
Reorder the observations/rows in the data set swiss according to the variable Infant.Mortality. Which province has the highest level of Infant.Mortality?
Add a new variable/column called Infant.Mortality.squared that contains the squared values of the variable Infant.Mortality.

14.8.5.2 Solution: Filtering, reordering, selecting/renaming, extracting and transforming

14.8.6 group_by(): Applying functions across groups

We can apply above functions to subgroups within the dataset # Observations that have certain values on a variable # Individuals with different levels of education # Individuals belonging to countries
dplyr lets you use the group_by() function to describe how to break a dataset down into groups of rows
dplyr functions recognize when data frame is grouped by using group_by()
Can be used for aggregating data

14.8.6.1 Example: Applying dplyr functions across groups (aggregation)

14.8.6.2 Exercise: Applying dplyr functions across groups (aggregation)

Execute the following code: library(foreign) and essdata <- read.dta("./www/ESS4e04_de.dta", convert.factors=F). Adapt your file path!
The variable religion_str contains the religious affiliation of respondents. Aggregate the data set - using functions from dplyr package - so that you obtain averages for subgroups of religious affiliations for the variables polinteresse and trustparties - as well as a variable with the number of observations across the groups.

14.8.6.3 Solution: Applying dplyr functions across groups (aggregation)

14.8.7 Chaining with dplyr

Code (e.g. above) is sometimes difficult to read and the arguments are very long
With dplyr it is possible to use the %>% operator
- x %>% f(y) turns into f(x, y): x (e.g. a data set) is inserted into the function on the right of %>%
- %>%: Forces > to be a function which does the above
The resulting code seems a bit more intuitive

swiss %>% # Stick swiss into the functions below consecutively
select(Fertility, Infant.Mortality, Examination) %>%
filter(Fertility > 82) %>% 
filter(Infant.Mortality < 24)

Fertility	Infant.Mortality	Examination
83.1	22.2	6
92.5	20.2	5
85.8	20.3	12
83.8	23.6	16
82.4	21.0	12
92.2	16.3	3

# DO TRY THAT AT HOME!!!

14.8.8 anti_join(): Merging data frames

Classically merge() (see Quick R)
dplyr is much faster
inner_join(x, y, by = NULL, copy = FALSE, ...) # all intersecting observations
left_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x
semi_join(x, y, by = NULL, copy = FALSE, ...) # all observations from x
anti_join(x, y, by = NULL, copy = FALSE, ...) # all in x that are not in y
- x = first data set, y = second data set
- by = “matchingvariable” oder by = c(var1, var2??)
- ?join: Check out the corresponding helpfile

14.8.8.1 Example: Merging data frames

getwd()

## [1] "C:/GoogleDrive/2-Teaching/2018_Big_Data_and_Social_Science/book-big-data-social-science"

# An example for merging


nrow(swiss)

## [1] 47

swiss2 <- cbind(row.names(swiss), swiss)
names(swiss2)[1] <- "region"
nrow(swiss2)

## [1] 47

# View(swiss2)

# Generate 2 data frames each possess parts of the observations and the regions
# variable
# Q: What new datasets do I generate below?
swiss.a <- swiss2[1:8,1:3]
# View(swiss.a)
swiss.b <- swiss2[c(1, 6:7, 12),c(1,4:5)]
# View(swiss.b)

intersect(swiss.a$region, swiss.b$region) # check intersection, i.e.

## [1] "Courtelary" "Porrentruy" "Broye"

# which regions appear in both data sets?

# MERGING OF THE TWO DATA FRAMES
library(dplyr) # is the package installed?
?inner_join

join	R Documentation

Join two tbls together

Description

These are generic functions that dispatch to individual tbl methods - see the method documentation for details of individual data sources. x and y should usually be from the same data source, but if copy is TRUE, y will automatically be copied to the same source as x.

Usage

inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

left_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

right_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

semi_join(x, y, by = NULL, copy = FALSE, ...)

anti_join(x, y, by = NULL, copy = FALSE, ...)

Arguments

`x, y`	tbls to join
`by`	a character vector of variables to join by. If `NULL`, the default, `*_join()` will do a natural join, using all variables with common names across the two tables. A message lists the variables so that you can check they’re right (to suppress the message, simply explicitly list the variables that you want to join). To join by different variables on x and y use a named vector. For example, `by = c(“a” = “b”)` will match `x.a` to `y.b`.
`copy`	If `x` and `y` are not from the same data source, and `copy` is `TRUE`, then `y` will be copied into the same src as `x`. This allows you to join tables across srcs, but it is a potentially expensive operation so you must opt into it.
`suffix`	If there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.
`…`	other parameters passed onto methods, for instance, `na_matches` to control how `NA` values are matched. See join.tbl_df for more.

Join types

Currently dplyr supports four types of mutating joins and two types of filtering joins.

Mutating joins combine variables from the two data.frames:

inner_join(): return all rows from x where there are matching values in y, and all columns from x and y. If there are multiple matches between x and y, all combination of the matches are returned.
left_join(): return all rows from x, and all columns from x and y. Rows in x with no match in y will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
right_join(): return all rows from y, and all columns from x and y. Rows in y with no match in x will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.
full_join(): return all rows and all columns from both x and y. Where there are not matching values, returns NA for the one missing.

Filtering joins keep cases from the left-hand data.frame:

semi_join()

return all rows from x where there are matching values in y, keeping just columns from x.

A semi join differs from an inner join because an inner join will return one row of x for each matching row of y, where a semi join will never duplicate rows of x.

anti_join()

return all rows from x where there are not matching values in y, keeping just columns from x.

Grouping

Groups are ignored for the purpose of joining, but the result preserves the grouping of x.

Examples

# "Mutating" joins combine variables from the LHS and RHS
band_members %>% inner_join(band_instruments)
band_members %>% left_join(band_instruments)
band_members %>% right_join(band_instruments)
band_members %>% full_join(band_instruments)

# "Filtering" joins keep cases from the LHS
band_members %>% semi_join(band_instruments)
band_members %>% anti_join(band_instruments)

# To suppress the message, supply by
band_members %>% inner_join(band_instruments, by = "name")
# This is good practice in production code

# Use a named `by` if the join variables have different names
band_members %>% full_join(band_instruments2, by = c("name" = "artist"))
# Note that only the key from the LHS is kept

swiss.inner <- inner_join(swiss.a, swiss.b, by ="region") 
# View(swiss.a)
# View(swiss.inner) # data set with observations that intersect across both data sets

swiss.left <- left_join(swiss.a, swiss.b, by ="region") 
# View(swiss.left) # all observations in x = swiss.a

# TRY semi_join(), anti_join() FOR YOURSELF!

14.8.8.2 Exercise: Merging data frames

Use the code in the previous example to create swiss2 and the objects swiss.a and swiss.b subsequently.
Create a third object swiss.c that include the first 10 countries in the swiss data set and their values on the variables region, Catholic and Infant.Mortality.
Merge/join swiss.c with swiss.a so that the merged dataset contains all the countries that are contained in at least one of the two data sets.

14.8.8.3 Solution: Merging data frames

Acharya, Avidit, Matthew Blackwell, and Maya Sen. 2016. “Explaining Causal Findings Without Bias: Detecting and Assessing Direct Effects.” Am. Polit. Sci. Rev. 110 (3). Cambridge University Press: 512–29.

Barberá, P. 2015. “Birds of the Same Feather Tweet Together: Bayesian Ideal Point Estimation Using Twitter Data.” Polit. Anal. cambridge.org.

Bauer, Paul. 2018. “Writing a Reproducible Paper in R Markdown,” May.

Bauer, Paul C. 2015. “Negative Experiences and Trust: A Causal Analysis of the Effects of Victimization on Generalized Trust.” Eur. Sociol. Rev. 31 (4): 397–417.

———. 2016. “The Visual Display of Causal Relationships.”

Chrisman, Nicholas R. 1998. “Rethinking Levels of Measurement for Cartography.” N. Z. Cartogr. Geogr. Inf. Sys. 25 (4). Taylor & Francis: 231–42.

Dressel, Julia, and Hany Farid. 2018. “The Accuracy, Fairness, and Limits of Predicting Recidivism.” Sci Adv 4 (1): eaao5580.

Edelman, Benjamin, Michael Luca, and Dan Svirsky. 2017. “Racial Discrimination in the Sharing Economy: Evidence from a Field Experiment.” Am. Econ. J. Appl. Econ. 9 (2). aeaweb.org: 1–22.

Entwisle, B, and P Elias. 2013. “New Data for Understanding the Human Condition: International Perspectives.” Paris, France: OECD, available at http://www. oecd. org/sti/sci-tech/new-data-for-understanding-the-hu man-condition. pdf[ 1477].

Gerring, John. 2012. “Mere Description.” British Journal of Political Science 4 (4): 721–46.

Gill, Jeff. 1999. “The Insignificance of Null Hypothesis Significance Testing.” Polit. Res. Q. 52 (3). SAGE Publications Inc: 647–74.

Golder, Scott A, and Michael W Macy. 2014. “Digital Footprints: Opportunities and Challenges for Online Social Research.” Annu. Rev. Sociol. 40 (1). Annual Reviews: 129–52.

Grimmer, Justin. 2015. “We Are All Social Scientists Now: How Big Data, Machine Learning, and Causal Inference Work Together.” PS Polit. Sci. Polit. 48 (1). Cambridge University Press: 80–83.

Hern, Alex. 2018. “Fitness Tracking App Strava Gives Away Location of Secret US Army Bases.” The Guardian, January.

Hilbert, Martin, and Priscila López. 2011. “The World’s Technological Capacity to Store, Communicate, and Compute Information.” Science 332 (6025): 60–65.

Holland, Paul W. 1986. “Statistics and Causal Inference.” J. Am. Stat. Assoc. 81 (396). Taylor & Francis: 945–60.

King, Gary. 1995. “Replication, Replication.” PS, Political Science & Politics 28 (3): 444–52.

———. 2010. “A Hard Unsolved Problem? Post-Treatment Bias in Big Social Science Questions.” In Hard Problems in Social Science” Symposium, Harvard University. scholar.harvard.edu.

Laney, Doug. 2001. “3D Data Management: Controlling Data Volume, Velocity and Variety.” META Group Research Note 6 (70): 1.

Lazer, David, Alex Pentland, Lada Adamic, Sinan Aral, Albert-Laszlo Barabasi, Devon Brewer, Nicholas Christakis, et al. 2009. “Social Science. Computational Social Science.” Science 323 (5915): 721–23.

Mayer-Schönberger, Viktor, and Kenneth Cukier. 2012. Big Data: A Revolution That Transforms How We Work, Live, and Think. Boston: Houghton Mifflin Harcourt.

Monroe, Burt L. 2013. “The Five Vs of Big Data Political Science Introduction to the Virtual Issue on Big Data in Political Science Political Analysis.” Polit. Anal. 21 (V5). Cambridge University Press: 1–9.

Munzert, Simon, Christian Rubba, Peter Meißner, and Dominic Nyhuis. 2014. Automated Data Collection with R: A Practical Guide to Web Scraping and Text Mining. John Wiley & Sons.

Open Science Collaboration. 2015. “Estimating the Reproducibility of Psychological Science.” Science 349 (6251). American Association for the Advancement of Science: aac4716.

Pearl, Judea. 2009. “Causal Inference in Statistics: An Overview.” Stat. Surv. 3. The author, under a Creative Commons Attribution License: 96–146.

Richthammer, Christian, Michael Netter, Moritz Riesner, Johannes Sänger, and Günther Pernul. 2014. “Taxonomy of Social Network Data Types.” EURASIP Journal on Information Security 2014 (1). Nature Publishing Group: 11.

Stevens, S S. 1946. “On the Theory of Scales of Measurement.” Science 103 (2684): 677–80.

Stone, Richard. 1993. “The Assumptions on Which Causal Inferences Rest.” J. R. Stat. Soc. Series B Stat. Methodol. 55 (2). [Royal Statistical Society, Wiley]: 455–66.

Wikipedia contributors. 2018. “Data.” https://en.wikipedia.org/w/index.php?title=Data&oldid=869556199.

Zimmer, Michael. 2010. “‘But the Data Is Already Public’: On the Ethics of Research in Facebook.” Ethics Inf. Technol. 12 (4): 313–25.

Education	Examination
12	15
9	6
5	5
7	12
15	17
7	9
7	16
8	14
13	16
6	14
12	21
7	14
12	19
5	22
2	18
8	17
28	26
20	31
9	19
10	22
3	14
12	22
6	20
1	12
3	6
10	16
19	25
8	15
2	3
6	7
2	5
6	12
3	7
9	9
3	3
13	13
12	26
11	29
13	22
32	35
7	15
7	25
53	37
29	16
29	22

Examination10	NewExi
150	140
60	50
50	40
120	110
170	160
90	80
160	150
140	130
120	110
160	150
140	130
210	200
140	130
190	180
220	210
180	170
170	160
260	250
310	300
190	180
220	210
140	130
220	210
200	190
120	110
140	130
60	50
160	150
250	240
150	140
30	20
70	60
50	40
120	110
70	60
90	80
30	20
130	120
260	250
290	280
220	210
350	340
150	140
250	240
370	360
160	150
220	210

Education	Examination
12	15
9	6
5	5
7	12
15	17
7	9
7	16
8	14
13	16
6	14
12	21
7	14
12	19
5	22
2	18
8	17
28	26
20	31
9	19
10	22
3	14
12	22
6	20
1	12
3	6
10	16
19	25
8	15
2	3
6	7
2	5
6	12
3	7
9	9
3	3
13	13
12	26
11	29
13	22
32	35
7	15
7	25
53	37
29	16
29	22

Examination10	NewExi
150	140
60	50
50	40
120	110
170	160
90	80
160	150
140	130
120	110
160	150
140	130
210	200
140	130
190	180
220	210
180	170
170	160
260	250
310	300
190	180
220	210
140	130
220	210
200	190
120	110
140	130
60	50
160	150
250	240
150	140
30	20
70	60
50	40
120	110
70	60
90	80
30	20
130	120
260	250
290	280
220	210
350	340
150	140
250	240
370	360
160	150
220	210

Education	Examination
12	15
9	6
5	5
7	12
15	17
7	9
7	16
8	14
13	16
6	14
12	21
7	14
12	19
5	22
2	18
8	17
28	26
20	31
9	19
10	22
3	14
12	22
6	20
1	12
3	6
10	16
19	25
8	15
2	3
6	7
2	5
6	12
3	7
9	9
3	3
13	13
12	26
11	29
13	22
32	35
7	15
7	25
53	37
29	16
29	22

Examination10	NewExi
150	140
60	50
50	40
120	110
170	160
90	80
160	150
140	130
120	110
160	150
140	130
210	200
140	130
190	180
220	210
180	170
170	160
260	250
310	300
190	180
220	210
140	130
220	210
200	190
120	110
140	130
60	50
160	150
250	240
150	140
30	20
70	60
50	40
120	110
70	60
90	80
30	20
130	120
260	250
290	280
220	210
350	340
150	140
250	240
370	360
160	150
220	210