Sponsors: Elias Dinas and Juho härkönen, SPS Department
filter()
: ?arrange()
: ?select()
: ?distinct()
: ?mutate()
: Add new columns to existing columnssummarise()
: Collapses a data frame to a single row (e.g. aggregation)group_by()
: ?na.omit()
: ?library(tidyverse)
library(stringr)
#0
gdp<- read.csv("./data/gdp/gdp.csv", stringsAsFactors = FALSE)[,-1]
#1
gdp <- gdp %>% gather(2:170, key = country, value = GDP, na.rm= TRUE, convert = TRUE)
#2
gdp$label<- gdp$GDP
gdp$label <- str_replace_all(gdp$label, "[0-9]{1,10}", "") # creates implict missing values!
gdp$label[gdp$label == ""]<- NA
# 3
gdp <-fill(gdp, label) # to move forward values, and fill NA in this way
gdp <- gdp %>%
group_by(country) %>%
slice(-1) # filter by row position OR
gdp <- gdp %>%
group_by(country) %>%
filter(cgdppc != "year") # filter by row position OR
# extras: nice presentation
names(gdp)[1:3] <- c("year", "country", "gdp")
# 4
write.csv(gdp, "gdpnew.csv")
install.packages(c("tidyverse", "foreign", "stringr", "haven", "openxlsx", "readxl", "eeptools", "tm", "quanteda", "pdftools", "rvest", "xml2"))
# Get Rmarkdown installed
install.packages("rmarkdown")
# Generate a new file, check names and format of the file
# White part = the space of text --> Information of the steps of your analysis
# Grey part = R code as a separate paragraph--> analysis and specific comments about a command
#How does it work?
# treat it as a normal script, at the end of your work, you can click the knit button and get the file you want.
# get Rstudio Connect
# 1) go to the eye looking symbol right to the "run" button
# 2)click: "manage account" and add this to the server: https://bookdown.org
# 3) follow the instructions to have access to Rs connect
# 4) log in your account
install.packages(c("tidyverse", "foreign", "stringr", "haven", "openxlsx", "readxl", "eeptools", "tm", "quanteda", "pdftools", "naniar", "rvest", "xml2"))
# packages
library(tidyverse)
library(readxl)
library(openxlsx)
library(haven)
library(stringr)
# import daset
gwf <- read.csv("./data/regime type/autocracydata.csv" , stringsAsFactors = FALSE) # change working directory
View(gwf)
dplyr
are highly performant (big data!) and consistent
filter()
: Select a subset of the rows of a data framearrange()
: Reorders the rows of a data frameselect()
: Selects columnsdistinct()
: Returns unique values in a tablemutate()
: Add new columns to existing columnssummarise()
: Collapses a data frame to a single row (e.g. aggregation)group_by()
: Break data set into groups (of rows), to apply above functions to each group
%>%
this is a pipe operator which means “nested in”, so you can encadenate functions
select()
: select columns while filter()
select rows (e.g. countries, individuals etc.)
# Task: select all the columns, excepting the first one that refers to the number of rows
shortgwf <- select(gwf, -c(X)) # conversely select(gwf, cowcode:gwf_monarch)
shortgwf <- gwf %>% select(-X) %>% filter(year == 1995)
# select(dataset, columns) in columns you can add the name of the vector or its position. Positive values means the column will be considered, while negative means the column would not be considered
# sometimes, writting the column names do not work. To solve the problem write the name of the columns = `name`.
# Task: filter the dataset by those regimes that are monarchies
shortgwf <- filter(shortgwf, gwf_monarch != 1) # != not equal to
# Question: can I combine both functions? Yes, pipe them
shortgwf <- gwf %>% filter(gwf_monarch != 1) %>% select(-X)
# how is the old-school way to do this?
# shortgwf<- gwf[, -1 ]
# shortgwf<- gwf[gwf$gwf_monarch != 1, ]
arrange()
: reorder rows
# task: common practice...datasets in an alphabetical order
colnames(shortgwf)
shortgwf <- gwf %>% arrange(gwf_country)# write the name order you want wihtin arrange() or their position
# what if I want to organize the dataset by the regime type first?
shortgwf <- gwf %>% arrange(desc(gwf_country))
# Q: is equivalent the following function to the sum of previous command?
shortgwf <- gwf %>% arrange(gwf_country, desc(year))
# how is the old-school way to do this?
#shortgwf <- shortgwf[order(shortgwf$gwf_country, desc(shortgwf$gwf_regimetype)), ]
mutate()
: transformation of columns executed line-by-line
mutate_all()
: transformation of all the columns of a dataset. Any idea about its application?
mutate_at()
: transformation of multiple columns selected with a character vector or vars()
mutate_if()
: transformation of multiple columns if condition satisfied
# how can we enumerate rows?
shortgwf <- gwf %>% mutate( rows = row_number()) # kind useful to write the number of rows, which should be updated after deleating a row
# which was the classic approach?
#shortgwf$rows <- 1:nrow(shortgwf)
# you can add them each column
shortgwf <- shortgwf %>% mutate_at(c("rows", "year"), .funs = lag) # problem?
shortgwf <- shortgwf %>% mutate_at(c("rows", "year"), .funs = list(new = lag))
shortgwf <- shortgwf %>% mutate(new = lag(year, new2 = lag(rows)))
# which is the the difference with respect to the previous code?
# let's say that you want to change all the integer vectors to numeric vectors to numeric
shortgwf <- shortgwf %>% mutate_if(is.integer, .funs = as.numeric)
# only those columns with the name rows in it?
shortgwf <- shortgwf %>% mutate_at(c("rows", "rows_new"), .funs = as.integer) # OR
shortgwf <- shortgwf %>% mutate_at(vars(contains('new')), .funs = as.integer)
group_by()
function to describe how to break a dataset down into groups of rows
# total duration of every regime in a given country
shortgwf <- gwf %>% group_by(gwf_casename) %>% mutate( total = n())
# Question: what if we change ```n()``` for nrow()? what can these function be useful for?
shortgwf <- shortgwf %>% group_by(gwf_casename) %>% mutate( total2 = row_number())
# OR shortgwf <- shortgwf %>% group_by(gwf_casename) %>% mutate( total = n(), total2 = row_number())
# also more than one variable can be generated
shortgwf <- shortgwf %>% group_by(gwf_casename, year) %>% mutate( total_months = total*12)
summarise()
function will create summary statistics for a given column in the data frame
summarise_all()
apply the same summary statistic to all the columns
summarise_at()
apply the same summary statistic to more than one column
# create a list of regime-cases
list_gwf <- shortgwf %>% select(gwf_casename, total,total_months) %>% summarise(sum = sum(total))
list_gwf <- select (list_gwf, -year) %>% distinct() # what's going with Argentina 43-46?
# we want to calculate the avarage lenght of every regime-case in years and month
regime_mean <- shortgwf %>% select(gwf_regimetype, total, total_months) %>% group_by(gwf_regimetype) %>% summarise_all(funs(mean = mean, sd = sd))
regime_mean2 <- shortgwf %>% select(gwf_regimetype, total, total_months) %>% group_by(gwf_regimetype) %>% summarise_at(vars(total:total_months), funs(mean = mean, sd = sd))
# Q: compare the outcome ob both functions, which are the advantages/disavantages of using one function or the other?
# old_style <- shortgwf %>% select(gwf_casename, total, total_months) %>% group_by(gwf_casename) %>% summarise(mean1 = mean(total), mean2 = mean(total_months) and the sd...)
summrise()
and group_by
. Here another blog in case you need to remmember the theory againusing the dataset ‘gwf’…
In one line of code, we need a dataset with the columns: gwf_country till gwf_enddate and gwf_regimetype and… remove observations that fall into the regime type ‘oligarchy’. Check you did it right
Create a new vector that contains information about the duration of every authoritarian regime type in a given year (tip: you might need row_number
) in one line of code.
distinct()
is a function that returns unique values. a<- shortgwf %>% group_by(gwf_casename) %>% distinct()
why is this function not working? what shall we do to obtain a list of regime-case with their starting and ending date?
# Exercise 1
exercise1 <- gwf %>% select(gwf_country:gwf_enddate, gwf_regimetype ) %>% filter(gwf_regimetype != "oligarchy")
# Exercise 2
exercise2 <- exercise1 %>% group_by(gwf_casename) %>% mutate(duration = row_number()) # duration
# Exercise 3
exercise3 <- exercise2 %>% select(-duration) %>% distinct()
separate()
: separate one column into severalunite()
: unite in one column multiple columnsspread()
: spread rows into columnsgather()
: gather columns into rows
separate()
unite()
# THE DATASET TO WORK NOW
tidy_data <- exercise3
# create three columns from the column 'gwf_startdate'
tidy_data1 <- tidy_data %>% separate(gwf_startdate , into = c("year", "month", "day"), sep = "-", remove = TRUE, convert = TRUE)
# It works as follow separete(the column to separate, the name of the new columns, the symbol to separate the values)
# additional useful argument -->
#'remove = FALSE' do not delete the original variable
# convert = TRUE to give it a proper data structure
# list <- list %>% separate(gwf_startdate , into = c("year", "month", "day"), sep = "-", remove = FALSE, convert = TRUE )
# also, you do not need 'symbols to separate '
tidy_data1 <- tidy_data1 %>% separate(year , into = c("century", "years"), sep = 2)
# in sep, + values far-left of the strings, - values far-right of the strings
# Oh, god! that was not the right variable. How can I get my column back!!??
tidy_data1 <- tidy_data1 %>% unite(year, century, years)
# how can I remove the '_'? adding "sep = "" "
tidy_data1 <- tidy_data %>% separate(gwf_startdate , into = c("year", "month", "day"), sep = "-", remove = TRUE, convert = TRUE)
tidy_data1 <- tidy_data1 %>% separate(year , into = c("century", "years"), sep = 2)
tidy_data1 <- tidy_data1 %>% unite(year, century, years, sep= "")
# _ value is a default value when uniting between the values from different columns. we can specify different "sep" condition
spread()
gather()
# create a dataset in which one column correspond to years and the value = regime type
spread_data <- tidy_data1 %>% select(gwf_casename, gwf_regimetype, year) %>% spread(key = gwf_regimetype, value = year) # horizontally
# this can be useful to estimate some descriptive statistics for each regime type
spread_data <- spread_data %>% summarise_all(min , na.rm = TRUE ) # no possible two functions :'(
############# spread_data is a very ugly dataset, let's put it in its original form. in gather we trust #############
# We need the set of columns that represent values, not variables. In this example, those are the columns after total.
# We need the key: the name of the variable whose values form the column names. Here it is year.
# We need the value: the name of the variable whose values are spread over the cells. Here it's regime type.
united_data <- spread_data %>% gather(2:10, key = gwf_regimetype, value = year) %>% drop_na() # why so ugly?
spread()
makes long tables shorter and wider
gather()
makes wide tables narrower and longer
as.characther()
as.POSIXct()
as.POSIXlt()
# data for the section
tidy_data <- exercise3
# save time data with POSIXct
value <- as.POSIXct(unlist(tidy_data[2,3]))
print(value)
typeof(value) # double? integer value with double precisioin floating points useful for mathematical purposes
cat(value,"\n") # how it is stored in R internally
# save time data with POSIXlt
list_date <- as.POSIXlt(Sys.time(), format = "%y-%m-%d")
print(list_date)
typeof(list_date)
cat(list_date,"\n")
names(list_date)
list_date[[1]] # seconds
list_date[[2]] # minutes
list_date[[3]] # hours
list_date[[4]] # day of month (1-31)
list_date[[5]] # month of the year (0-11)
as.Date()
:
help(as.Date)
adn observe the commands: what else you can specify with this function?# save time data with as.Date
value2 <- as.Date(unlist(tidy_data[1,3]))
print(value2)
typeof(value2)
cat(value2,"\n")
value2 <- as.Date.character(tidy_data[1,3], format= "%Y/%m/%d")
typeof(value2)
as.Date
format. Tips: you need to write the code in one line and use mutate_at()
Date
is a suitable R’s internal format to run a numer of basic operations
library(eeptools)
# create time differences
tidy_data$duration_month <- age_calc(tidy_data$gwf_startdate, enddate = tidy_data$gwf_enddate, units = "months") # default is in months
# sensible to omited values
tidy_data <- na.omit(tidy_data)
tidy_data$duration_year <- age_calc(tidy_data$gwf_startdate, tidy_data$gwf_enddate, units = "months") #
# old school
# Lenght of by months
elapsed_months <- function(end_date, start_date) {
ed <- as.POSIXlt(end_date)
sd <- as.POSIXlt(start_date)
12 * (ed$year - sd$year) + (ed$mon - sd$mon)
}
#######################################################################################################
######### ########## Expand dataset condioned on time differences ######### #########
# daydly basis.
lst <- Map(`:`, tidy_data$gwf_startdate, tidy_data$gwf_enddate) # Map is a function from purr to extract a lsit values
spread_vertical_data <- cbind(tidy_data[rep(seq_len(nrow(tidy_data)), lengths(lst)), ], date = unlist(lst)) # why negative values?
spread_vertical_data$first <- as.Date(spread_vertical_data$date, origin = "1970-01-01") # date format!
library(lubridate)
# monthly basis
spread_vertical_data$second <- format(as.Date( spread_vertical_data$first), "%Y-%m")
# it shows only month, still the data is stored on dayly basis!
spread_vertical_data <- spread_vertical_data %>% select(-c(first,date)) %>% distinct()
install.packages("naniar")
(today)
complete()
turns implicit missing values into explicit NAna_if()
library(naniar)
# generating a dataset
df <- data.frame(replicate(6, sample(c(1:10, -99), 6, rep= T)))
df <- mutate_all(df, funs(na_if(.,-99)))
# sometimes white spaces need to be converted in NA
df <- data.frame(replicate(6, sample(c(1:10, ""), 6, rep= T)))
mutate_all(df, funs(na_if(., "")))
####### Old school way a function to solve missings #######
fix_missing <- function(x){x[x == -99]<- NA
x}
# change the -99 in case the value is different.
df<-lapply(df, fix_missing) # to applied it for the whole dataset, of course, also you can use the functio to fix one particular column
df$X1 <- fix_missing(df$X1)
miss_var_summary()
: summary for each variablemiss_case_summary()
:summary for each case
# dataset of the section
gwf_2 <- gwf
library(naniar)
report1 <- miss_var_summary(gwf_2, order = TRUE, add_cumsum = TRUE)
report2 <- miss_case_summary(gwf_2, order = TRUE, add_cumsum = TRUE)
# old school, I prefer it
colSums(is.na(gwf_2))
apply(is.na(gwf_2), 2, which) # shows the rows with missing values and columns
# subset
no_missing <- gwf_2 %>% filter(is.na(gwf_regimetype)) # or
no_missing <- gwf_2 %>% complete.obs() # or
na.rm = TRUE
df <- data.frame(replicate(6, sample(c(1:10, -99), 6, rep= T)))
df <- mutate_all(df, funs(na_if(.,-99)))
# way to correct the previous mess
df2 <- df %>% gather(5:6, key = beer, value = cost, na.rm= TRUE) # exclude missing values, UPDATE positions based on the lcoation of the mssing values!
# several ways in the old.fashion... one of them list <- na.omit(list) (https://stackoverflow.com/questions/4862178/remove-rows-with-nas-missing-values-in-data-frame)
df %>% summarise(mean_cost = mean(X4, na.rm= TRUE), sd_cost = sd(X4, na.rm= TRUE))
# also na.rm when running regressions!
fill()
function from tidy package
no_missing <- gwf_2 %>% filter(is.na(gwf_regimetype))
gwf_2 <- gwf_2 %>% group_by(gwf_casename) %>% fill(gwf_regimetype)
# check missings
no_missing2 <- gwf_2 %>% filter(is.na(gwf_regimetype))
gwf_2 <- gwf_2 %>% group_by(gwf_casename) %>% fill(gwf_regimetype, .direction = c("up"))
# add manual entry for Iran and later apply fill with direction = down
distinct()
is a function integrated in tidy package to deal with duplicates
gwf_2 <- gwf
data <- gwf_2 %>% select(cowcode, gwf_casename, gwf_regimetype) # all repeated
data <- gwf_2 %>% select(cowcode, gwf_casename, gwf_regimetype) %>% distinct() # tadaaaa
# If the problems are misspells...
# the process of creating ids
gwf_2$id_duplica <- paste0(gwf_2$cowcode, sep = "." , gwf_2$year)
gwf_2$id_duplica <- tolower(gwf_2$id_duplica) #everything in lower case
gwf_2$id_duplica <- gsub('\\s{1,4}','', gwf_2$id_duplica )# remove empty spaces in the cell
gwf_2 <- arrange(gwf_2, id_duplica) # to put some alphabetic order here...
gwf_2 <- transform(gwf_2, id_duplica = as.numeric(factor(id_duplica)))
sum(table(gwf_2$id_duplica)-1) # 42 observations are duplicates, but where?
gwf_2 <- gwf_2 %>% group_by(id_duplica) %>% mutate(n=n())
a <- gwf_2 %>% filter(n != 2)
library(stringr)
# change working directory
gwf <- read.csv("./data/regime type/autocracydata.csv", stringsAsFactors = FALSE) # change working directory
names(gwf) <- names(gwf) %>% str_replace_all("gwf_", "")
# old school: names(gwf)[1:15] <- c("write here 15 names")
example.string <- "</P> Dip. Rodirgo Gromenawer, Perez Elected: PRIS -Parlamentary group:PIRS 2010</P>"
# A string matching itself
str_extract(example.string, "PRI")
# multiple matches
str_extract_all(example.string, "PRI")
# case sensitivity
str_extract(example.string, "pri")
str_extract_all(example.string, regex("pri", ignore_case =TRUE))
[:digit:]
: Digits: 0 1 2 3 4 5 6 7 8 9[:lower:]
: Lower-case characters: a-z[:upper:]
: Upper-case characters: A-Z[:alpha:]
: Alphabetic characters: a-z and A-Z[:alnum:]
: Digits and alphabetic characters[:punct:]
: Punctuation characters: ‘.’, ‘,’, ‘;’, etc.[:graph:]
: Graphical characters [:alnum:] and [:punct:][:blank:]
: Blank characters: Space and tab[:space:]
: Space characters: Space, tab, newline, and other space characters
\d
: matches digits = [0-9]
\D
: matches non-digits = [^0-9]
\w
: matches any characther = [a-z]
+ [A-Z]
\s
: matches whitespace characther\S
: matches non-whitespace characther
^
: match the beginning of a string$
: match the end of a string +|
: match more than one element +.+
: match everything +[ ]
: match a range of words or numbers included within the bracket example.string <- "</P> Dip. Rodirgo Gromenawer, Perez Elected: PRIS -Parlamentary group:PIRS</P>"
# match the beginning of a string
str_extract_all(example.string, "^</P>")
# match the end of a string
str_extract_all(example.string, "PRI$")
str_extract_all(example.string, "</P>$")
# pipe operator
unlist(str_extract_all(example.string, "Dip|PRI"))
# wildcard
str_extract(example.string, "R.+go")
# character class (range)
str_extract(example.string, "P[RSTUV][IJKLM]S")
str_extract(example.string, "Ro[:alpha:][:alpha:][:alpha:]go")
# quite annoying repeating several times the same right?
?
: The preceding item is optional and will be matched at most once+
: The preceding item will be matched one or more times{n}
: The preceding item is matched exactly n times{n,}
: The preceding item is matched n or more times{n,m}
: The preceding item is matched between n and m# greedy quantification
str_extract(example.string, "R[:alpha:]{1,6}o")
# the preceding item is the wildcard
str_extract(example.string, "Dip.+Elected")
# the preceding item is the wildcard
str_extract_all(example.string, ".+?PRI")
# meta characters
unlist(str_extract_all(example.string, "\\.")) # only want the dot. The dot has a meaning in regular expression that you want to isolate
# assertions
unlist(str_extract_all(example.string, "(?<=</P> ).+")) # positive lookbehind: (?<=...)
unlist(str_extract_all(example.string, ".+(?=</P>)")) # positive lookahead (?=...)
# to create strings, both single and double quotes work
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'
# to include a literal single or double quote in a string you can use \ to escape it:
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"
# to extract info between brackets or quotations: two strategies
str_extract(string2, '(\").*?(\")')
str_extract(string2, "(?<=\")(.*?)(?=\")") #
# Also: negative lookbehind: (?<!...) and negative lookahead (?!...), never used
"[[:digit:]]{4,}"
".*?\\.txt$"
"<(.+?)>.+?</\\1>"
# example str_extract_all("Phone 150$, PC 690$", "[0-9]+\\$") # example to extract every number that precedes $
str_extract()
: Extracts first string that matches pattern Character vectorstr_extract_all()
: Extracts all strings that match pattern List of character vectorsstr_locate()
: Return position first pattern that match Matrix of start/end positionsstr_locate_all()
: Return positions of all pattern matches List of matricesstr_replace()
: Replaces first pattern match Character vectorstr_split()
: Split string at pattern List of character vectorsstr_split_fixed()
: Split string at pattern into fixed number of pieces Matrix of character vectorlibrary(stringr)
# joining
str_c("text", "manipulation", sep = " ")
str_c("text", c("manipulation", "basics"), sep = " ")
# working example
example.string <- "</P> Dip. Rodirgo Gromenawer, Perez Elected: PRI -Parlamentary group:PRI</P>"
# locate
str_locate(example.string, "Rodirgo Gromenawer, Perez")
# substring extraction
str_sub(example.string, start = 11, end = 35)
# replacement
str_replace(example.string, pattern = "</P>", replacement = "")
str_replace_all(example.string, "</P>", "")
# splitting
str_split(example.string, ":") %>% unlist
secret <- "clI.0pow1zLstc0d87wnkig7Ovd000dgdfgdfsdfeeeggvhryn92V55juwczi8hqrfpxs5j5dwpn0Enwo. Cwisdij76j8kpf03AT5dr3coc0bt7yczjataootj55t3j3ne6c4Sfek.W1w1IwwojigT d6vrf5656Hrbz2.2bkTnbhzgv4I9i05zEcropwSgnb.YqoO65fUa1otfb7wm24k6t3s3339zqe5 fy89n6?d5t9kc4fe905gmr"
# import the pdf in R == setting a corpus. we need tm package
library(tm)
library(pdftools)
library(stringr)
# 1. my workign directory: D:\Biblioteca D\courses material\data managment in R\data
cname <- file.path("D:", "Biblioteca D", "courses material", "data managment in R", "data") # I am telling to R where the pdf files are. Cchange \ for ,
# 2. Corpus
Rpdf <- readPDF(control = list(text = "-layout")) # layout control in order to keep the original format as much as possible
# Rpdf <- readPDF(engine = "xpdf", control = list(text = "-layout")) # xpdf engine, similar as the default. but needs to installed: http://www.xpdfreader.com/
docs <- Corpus(DirSource(cname), readerControl=list(reader=Rpdf)) # upload documents
writeLines(as.character(docs[1]))
## problems with encoding? Functions below change the encoding of R
# Spanish\LATIN: Sys.setlocale("LC_CTYPE", "")
# Cyrillic: Sys.setlocale(category = "LC_ALL", locale = "Russian")
# Specify the doc we want
content_doc <- docs[[1]]$content
################################################################################## In case this code does not work for you: use
doc <- pdf_text("./data/Legislatura_36.pdf") # can upload only one doc!
#################################################################################################
# 3. extract the info that we need
# name
mps.regex <- "Dip.+[[:alpha:].,]{1,5}"
name_dip<- unlist(str_extract_all(content_doc, mps.regex)) #172?
# party
party <- unlist(str_extract_all(name_dip, ":(.+)"))
party <- unlist(str_replace_all(party, ":", "")) # 171?
# state
state.regex <- "Es[tad][tad].+[[:alpha:]:.+[0-9]]{1,8}"
state_dip<- unlist(str_extract_all(content_doc, state.regex)) # 171?
# 4 and 5. correct errors: error in the string 8, which one shall we used
#name_dip <- unlist(str_remove(name_dip, "Diputado Favio Altamirano Manlio fallecio."))
#name_dip <- name_dip[-8]
# 6. put together in data.frame when we feel comfortable with the lenght of vectors
data<- data.frame(names = name_dip, party.affiliation = party, state = state_dip, stringsAsFactors = FALSE)
# 5. clean dataset!
data$names <- unlist(str_replace_all(data$names, "Partido:.+" , "")) # remove party in names col
data$names <- unlist(str_replace_all(data$names, "Dip." , "")) # remove dip in names col
data$names_ordered <- str_replace(data$names , "(.+), (.+)", "\\2 \\1") # change first name, second surname. Does it work?
data$names_ordered <- tolower(data$names_ordered) # lower case
## other tasks: create a id per
# packages
library(xml2)
library(rvest)
library(tidyverse)
library(stringr)
####### data collection stage of the research #######
# 1. specify URL
url <- "https://en.wikipedia.org/wiki/Results_of_the_Malaysian_general_election,_2018_by_parliamentary_constituency"
browseURL(url)
# 2. download static HTML behind the URL and parse it into an XML file
web<- read_html(url) %>% html_table( fill = TRUE)
candidates <- rbind(web[[3]], web[[4]], web[[5]], web[[6]], web[[7]], web[[8]],
web[[9]], web[[10]], web[[11]], web[[12]], web[[13]], web[[14]], web[[15]], web[[16]], web[[17]], web[[18]])
####### data managment stage of the research #######
# 3. setting the dataset
# columns to deleate (10:13)
candidates <- candidates[,1:9]
# changign labes to facilitate identification
names(candidates)[1]<- "Constituency.code"
names(candidates)[2]<- "Constituency"
names(candidates)[3]<- "Winner"
names(candidates)[6]<- "Loser"
names(candidates)[4]<- "Votes.total.wn"
names(candidates)[5]<- "Majority"
names(candidates)[7]<- "Votes.total.ls"
names(candidates)[8]<- "Incumbent"
names(candidates)[9]<- "Incumbent.Majority"
# create outcome of election column and candiate.names
# gathering columns
candidates <- candidates %>% gather("Winner", "Loser", key = Outcome, value = Candidate.name, na.rm= TRUE, convert = TRUE) # candidates and outcome
candidates <- candidates %>% gather("Votes.total.wn", "Votes.total.ls", key = b, value = Votes.number, na.rm= TRUE, convert = TRUE) # fix N of votes
# remove columns and empty rows, and repeated observations.
candidates <- candidates %>% select(-b) %>% distinct()
# remove the first obs.
candidates$flag <- candidates$Constituency.code # create a flag
candidates$flag <- str_replace_all(candidates$flag, "P[0-9]{1,3}", "1")
candidates <- candidates %>% filter(flag == 1) %>% select(-flag)
candidates<- arrange(candidates, Constituency.code)
# correct problem: winner has most of the votes, loser has what is left
#convert chrt vectors to numeric
candidates$Votes.number <- str_replace_all(candidates$Votes.number, ",", "")
candidates$Votes.number <- as.numeric(as.character(candidates$Votes.number))
# listing winners, keep first obs. which the one with largest percentage
list.winners <- candidates %>% group_by(Candidate.name) %>% filter(Votes.number == max(Votes.number) & Outcome == "Winner")
# listing losers, deleate the first obs. which the one with largest percentage
list.losers <- candidates %>% group_by(Candidate.name) %>% filter(Votes.number != max(Votes.number) & Outcome == "Loser")
# Logic of the collection: the first row correspond to the winners value, the second row correspond to members' value in the table, the rest correspond the values of other candidates
# acombine both list
candidates2 <- rbind(list.winners, list.losers)
candidates2<- arrange(candidates2, desc(Constituency.code))
# adding column party affiliaition
candidates2$Party.affiliation <- str_extract_all(candidates2$Candidate.name, "\\(.*\\)" )
candidates2$Party.affiliation <- str_replace_all(candidates2$Party.affiliation, "\\(", "")
candidates2$Party.affiliation <- str_replace_all(candidates2$Party.affiliation, "\\)", "")
candidates2$Candidate.name <- str_replace_all(candidates2$Candidate.name, "\\(.*\\)", "" )
# id = name without empty spaces
candidates2$id <- tolower(candidates2$Candidate.name)
candidates2$id <- gsub('\\s{1,4}','', candidates2$id)# remove empty spaces in the cell
# what shall I do with those districts that have changed the district name?
### ### ### ### ### other advanced and interesting features of webscrapping and regular expressions ### ### ### ### ###
# adding a column for wikipedia links
# extract url
a <- read_html(url) %>% html_nodes("a") %>% html_attr("href")
a<- unlist(str_extract_all(a, ".wiki.[[:alpha:]]{1,10}\\_+[[:alpha:]]{1,10}\\_.+")) # inspect and select only those links with candidate information
a<- a[40:304] # simplicity is beauty, remmber
# create full real links
url.candidate2 <- paste0("https://en.wikipedia.org", a)
wiki.links2 <- data.frame(url.candidate = url.candidate2)
wiki.links2$Candidate.name <- str_replace_all(wiki.links2$url.candidate, ".+.wiki.", "")
wiki.links2$Candidate.name <- str_replace_all(wiki.links2$Candidate.name, "_", " ")
wiki.links2$Candidate.name <- str_replace_all(wiki.links2$Candidate.name, "\\(.*\\)", "" )
# id = name without empty spaces
wiki.links2$id <- tolower(wiki.links2$Candidate.name)
wiki.links2$id <- gsub('\\s{1,4}','', wiki.links2$id)# remove empty spaces in the cell
# remove repeated and non informative rows
wiki.links2 <- distinct(wiki.links2)
# 6. merge both info
candidates2018 <- left_join(candidates2,wiki.links2, by = "id")
candidates2018<- candidates2018 %>% select(-14) %>% distinct()
### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###### ### ### ### ###
# WRITE AND SAVE
write.csv(candidates2018, "./candidateMSY2018.csv")
candidates2 <- read.csv("./candidateMSY2018.csv", na.strings="", stringsAsFactors = FALSE) # sexy stuff
# ;)
left_join()
keeps all observations in xright_join()
keeps all observations in yinner_join()
keeps only matched observations in X and Yfull_join()
keeps all observations in x and y even if not matchsemi_join()
keeps all observations in x that have a match in yanti_join()
drops all observations in x that have a match in ylibrary(foreign)
library(tidyverse)
library(readxl)
# data sets
# Authoritarian regimes dataset
gwf <- read.csv("./data/regime type/autocracydata.csv", stringsAsFactors = TRUE)[,-1] # change working directory
# dataset about the failure of authoritarian regimes
failure_regime <- read_excel("data/data/failure_regime.xlsx")
View(failure_regime)
left_data <- left_join(gwf, failure_regime, by= c("cowcode", "year", "gwf_country"))
right_data <- right_join(gwf, failure_regime, by= c("cowcode", "year" , "gwf_country", "gwf_casename")) # why one obs. more than failure?
# what I use: an updated versiobn of a function posted in stack overflow https://stackoverflow.com/questions/28992362/dplyr-join-define-na-values
left_join_NA <- function(x, y, ...) {
left_join(x = x, y = y, by = ...) %>%
mutate_all(funs(replace(., which(is.na(.)), 0))) ## replace the implicit NA into a pre-defined valu, 0 in this case
}
# you can change the 'type of join' with other type of joins and the default value (99 instead of 0)
complete_data <- left_join_NA(gwf, failure_regime, by= c("cowcode", "year", "gwf_country"))
## classical approaches
# complete_data[is.na(complete_data)] <- 0 # to change all NA in 0 in a classic manner, better specify each column
# complete_data$gwf_fail[is.na(complete_data$gwf_fail)] <- 0
# complete_data$gwf_fail_type[is.na(complete_data$gwf_fail_type)] <- 0
full_data<- full_join(gwf, failure_regime, by= c("cowcode", "year", "gwf_country")) # keep all observations.
inner_data <- inner_join(gwf, failure_regime, by= c("cowcode", "year", "gwf_country")) # keep only matched observations
# why inner seems to be better than the rest of datasets?
# why?
anti_data <- anti_join(gwf, failure_regime, by= c("cowcode", "year", "gwf_country", "gwf_casename"))
# Anti-joins are useful for diagnosing join mismatches (e.g. countries have slightly different name!), but also as an opposite function of semi_join
the_data <- anti_join(right_data, anti_data, by= c("cowcode", "year", "gwf_country"))
write.csv(the_data, "the_data.csv")
semi_data <- semi_join(failure_regime, inner_data, by= c("cowcode", "year", "gwf_country")) # Why, one obs. less?
# Semi-joins and anti joins are useful for matching filtered summary tables back to the original rows.
top_autocra <- the_data %>% group_by(gwf_regimetype) %>% filter(gwf_spell == max(gwf_spell)) # what is this?
semi_data <- the_data %>% semi_join(top_autocra)
anti_data <- the_data %>% anti_join(top_autocra) # dataset that does not include the most long lasting authoritarian regimes in the world