8 Supplement
8.1 Requirements
library(devtools) # Tools to Make Developing R Packages Easier
library(pdftools) # Tools to manipulate pdfs
library(taxize) # Taxonomic names management
library(monkeylearn) # Machine learning API
library(fulltext) # Retrieve full text of scientific articles from (some) open sources
library(SnowballC) # An R interface to the C libstemmer library, for word stemming
library(NLP) # Requirement from tm package
library(tm) # Text mining package
library(NCmisc) # To handle large list visualization
library(stringr) # Handle strings
library(reshape2) # Handle matrix datasets
library(e1071) # Misc functions from the Department of Statistics, TU Wien, Austria
library(DT) # Interactive tables
library(corrplot) # Nice correlation plots
library(text2vec) # Framework for natural language processing
library(glmnet) # Generalized linear models
library(ggmap) # Geolocations and connection with Google Maps / Google Earth API
library(leaflet) # Nice maps as html widgets
library(qdap) # Text splitting
library(pander) # markdown tables
Readers who wish to replicate the framework presented in this literature review, whether to test it or to adapt it to their own use, should note that most, if not all, of the computational tools presented herein are written in the R environment. It is recommended to keep R updated to its latest version and to work within an environment such as RStudio.
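As a sketch, most of the packages listed above can be installed from CRAN in a single call (availability may have changed since this was written; packages that have since been archived can instead be installed from GitHub with devtools or from the CRAN archive):
# Install only the required packages that are not already present
# (assumes they are still available on CRAN).
pkgs <- c("devtools", "pdftools", "taxize", "monkeylearn", "fulltext",
          "SnowballC", "NLP", "tm", "NCmisc", "stringr", "reshape2",
          "e1071", "DT", "corrplot", "text2vec", "glmnet", "ggmap",
          "leaflet", "qdap", "pander")
install.packages(setdiff(pkgs, rownames(installed.packages())))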
8.2 Search articles in a programmatic manner
8.2.1 Fulltext package for R
With the function ft_search, the following data are retrieved for each supported source:
$source = Host source
$found = Number of articles found
$data = Articles matching the keywords of the query; the information contained in this slot corresponds to a list with each article's DOI
The maximum for the "limit" argument is set to 1000 articles retrieved in a single call. When the argument is not specified, the function retrieves the first 10 results. When "limit" is set to 0, only the metadata are retrieved. The result is a large list in which each slot corresponds to one of the sources supported by the fulltext package (PLOS, Crossref, Entrez, arXiv, bioRxiv, and Europe PMC); the results of the query are stored in the respective slot. See ?ft_search for more information.
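As a minimal sketch of this structure, the slots of a single source can be inspected without downloading any records by setting limit = 0, as described above (the query string mirrors the one used in the next subsection):
meta.only <- ft_search(query = "interactions + frugivor*",
                       from = "plos", limit = 0)
meta.only$plos$source # Host source
meta.only$plos$found  # Number of articles found
meta.only$plos$data   # Empty here, since limit = 0 retrieves only the metadata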
The ft_get function retrieves the full text of the search results. However, the resulting text is formatted for publication as XML. To make it more machine readable, the fulltext package provides the function chunks, which is designed to facilitate the extraction of different sections of an article. Those sections (e.g. title, authors, abstract) can be selected with the "what" parameter of the function. As an additional feature, the ft_get function can be fed a list of DOIs. Some useful metadata (when available) can also be extracted (see the package documentation).
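As a brief illustration of the "what" argument, the following sketch reuses the query1 object created in the next code block to pull out single sections of the first PLOS article:
one.article <- ft_get(query1$plos$data$id[1], from = "plos")
fulltext::chunks(one.article, what = "title")    # Only the title
fulltext::chunks(one.article, what = "abstract") # Only the abstract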
library(fulltext)
library(europepmc)
Querying and fetching articles from PLOS, arXiv, Crossref and Europe PMC
query1 <- ft_search( query = c("interactions + frugivor*"),
limit = 1000, from = c("plos", "arxiv"))
query1 # plos and arxiv
query2 <- ft_search( query = c("interactions + frugivor*"),
limit = 1000, from = c("crossref"))
query2 # crossref
query3 <- europepmc::epmc_search(query = c("interactions + frugivor*"))
query3 # europepmc
doi <- data.frame("doi"=c(query1$plos$data$id,
query1$arxiv$data$id,
query2$crossref$data$doi,query3$doi),
"source"= c(rep("plos",
length(query1$plos$data$id)),
c(rep("arxiv", length(query1$arxiv$data$id))),
rep("crossref",length(query2$crossref$data$url)),
rep("europepmc", length(query3$doi))))
doi <- unique(doi) # Clean repeated dois
length(doi$doi) # Retrieved dois for the query interactions + frugivor
par(las=1)
plot(doi$source, col = "orange",
ylab = "# of DOI's retrieved",
xlab = "Source",
main = "query = Interaction + frugivor*")
Google Scholar via the Sci-Hub API
Note that the code below is written in Python and requires scihub.py to be loaded.
from scihub import SciHub  # Assumes scihub.py is available on the Python path
sh = SciHub()
# The number of search terms depends on the needs of the user.
names = ["Search term 1", "Search term 2", "Search term 3"]
results = sh.search(names, 300)
results['papers']
# "Get urls"
for paper in results['papers']:
print(paper['url'])
# "Get titles"
for paper in results['papers']:
print(paper['name'])
8.3 Downloading articles
Getting articles from open sources
# from Plos
# Retrieve xmls (100 first articles in the list)
plos <- ft_get(query1$plos$data$id[1:100], from = "plos")
# Fetch only the body of text
plos.text <- fulltext::chunks(plos, what = "body")
# from Crossref
# These URLs can be pasted into a browser for PDF download via Sci-Hub (with captchas)
link <- paste("http://sci-hub.io/", query2$crossref$data$doi, sep = "")
head(link)
tail(link)
Getting articles from Google Scholar
Note: scholar_links.csv was obtained with scihub.py
# Links for URLs retrieved with scihub.py
links <-read.csv("scholar_links.csv",
header =T, stringsAsFactors = FALSE)
links$Links
links$Title
# Pruning bad links
# Erase broken links from "Academia.edu"
academia <- grep("academia", links$Links)
clean.link <- links[-academia,]
clean.link <- clean.link[-grep("scholar.google.com", clean.link$Links),]
# Get those links that point to a PDF (drop citations and other links that do not contain PDFs)
pdf <- grep("pdf", clean.link$Links)
# Download the articles into the working directory folder, with the title of the article as the file name.
for (i in 1:length(pdf)){
print(i)
download.file(clean.link$Links[pdf[i]],
destfile = paste0(clean.link$Title[pdf[i]], ".pdf"),
mode = "wb") # Binary mode so the PDFs are not corrupted
}
8.3.0.1 Transforming PDF to text.
With pdftools package
# For documents retrieved from google scholar or with a custom list of pdfs
library(pdftools)
# File names of folder where pdfs are stored
files <- list.files("Scholar/pdfs/")
# Create directory path for each file
files <- paste("Scholar/pdfs/", files, sep = "")
text_scholar <- c()
for (i in 1:length(files)) {
print (i)
text_scholar$text[[i]] <- pdf_text(files[i]) # Extract the text layer of the PDF (no OCR)
text_scholar$title[[i]] <- files[i]
}
With the fulltext package
path <- list.files( "PDFs/")
path <- paste0("PDFs/", path)
kals <- list()
for (i in 1:length(path)){
kals[[i]] <- tryCatch(ft_extract(path[i]),
error = function(e) NULL) # Skip PDFs that cannot be parsed
}
kals[[2]]$data # PDF is stored as an image, probably a scan
With Google Tesseract
library(tesseract)
pdf <- file.path("PDFs/File2.pdf")
bitmap <- pdf_render_page(pdf, dpi = 300, numeric = TRUE)
# transform pdf to tiff image
tiff::writeTIFF(bitmap, "page.tiff") # write output
out <- ocr("page.tiff") # Perform OCR
cat(out)
8.4 Create a corpus
For this example, only the articles downloaded from PLOS will be used (the same steps apply to the articles retrieved from Google Scholar).
library(tm)
# Create a corpus for mining (as example only with the list of articles from plos)
# Transform the list of documents into a vectorial corpus
doc.vec <- VectorSource(plos.text$plos)
# Transform the list of documents into a corpus
doc.corpus <- Corpus(doc.vec)
doc.corpus # Corpus with articles downloaded from PLOS
# Alternatively, build the corpus from the articles downloaded from Google Scholar
doc.vec <- VectorSource(text_scholar$text)
doc.corpus <- Corpus(doc.vec)
8.4.1 Preprocessing
# Removing punctuation
doc.corpus <- tm_map(doc.corpus, removePunctuation)
# Removing numbers
doc.corpus <- tm_map(doc.corpus, removeNumbers)
# Removing stopwords
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
# Removing white space
doc.corpus <- tm_map(doc.corpus, stripWhitespace)
# Removing common word endings and stemming words to their roots
doc.corpus <- tm_map(doc.corpus, stemDocument)
# Tell R to treat the preprocessed documents as text documents
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
8.4.2 Document term matrix
dtm <- tm::DocumentTermMatrix(doc.corpus)
dtm
8.4.3 Building a Thesaurus
A custom thesaurus can be built by extracting terms from a collection of targeted articles.
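A minimal sketch of how such a candidate list could be produced, assuming the document term matrix dtm created above; each exported term is then tagged manually (e.g. "Y"/"N") according to its relevance to frugivory:
# Candidate terms occurring at least 20 times across the corpus
candidate.terms <- tm::findFreqTerms(dtm, lowfreq = 20)
# write.csv(data.frame(term = candidate.terms), "candidate.terms.csv", row.names = FALSE)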
library(DT)
y.n.terms <- read.csv("freq.2.train.csv", header = T)
DT::datatable(y.n.terms,
caption = "Terms extracted from a corpus of a 100 articles")
Focusing on the terms related to frugivory interactions:
DT::datatable(y.n.terms[y.n.terms$train == "Y",],
caption = "Terms related with frugivory extracted from a corpus of a 100 articles")
frug.n.terms <- as.character(droplevels(y.n.terms$term[y.n.terms$train == "Y"]))
8.5 Automated Content Analysis
With a DOI list we can retrieve the full text of articles that were manually screened for the presence of interactions. The articles were obtained with the ft_get function, come from PLOS, and correspond to articles 101-200 of the results of a search performed on 17-Nov-2016.
res.test <- read.csv("index.csv", header=TRUE,
stringsAsFactors = FALSE)
# List of articles from PLOS which had been manually screened for the presence of interactions
length(res.test$DOI)
length(res.test$DOI[res.test$interaction==1]) # Presence of frugivory interaction
length(res.test$DOI[res.test$interaction==0]) # Absence of frugivory interaction
# Get the full text from the list of DOI's
test.text <- ft_get(res.test$DOI, from = "plos")
# Get the full text from document (into list of lists)
body.t.text <- fulltext::chunks(test.text, what = "body")
body.t.title <- fulltext::chunks(test.text, what = "title")
8.5.1 Customized Summary using regular expressions
The content of the articles can be summarized based on the specific interest of the researcher. For this example we are interested in the pieces of the articles that are related to frugivory. Mentions of frugivory interactions in the text will most likely share a sentence with the terms used to describe frugivory. Hence, we can extract only the pieces of the article in which frugivory-related terms have been used. Using regular expressions, we can define a custom function to retrieve the sentences that match the terms of a given vocabulary (the frugivory vocabulary in this case).
library(qdap)
q <- c()
for (i in 1:length(body.t.text$plos)){
q[[i]] <- sent_detect(body.t.text$plos[[i]]$body) # Break the text into sentences
}
# Retrieve the sentences which contain the custom terms
x <- paste(frug.n.terms, collapse = "|")
o <- c()
for ( f in 1:length(q)){
o[[f]] <- q[[f]][grepl(x, q[[f]])]
}
o[[1]] # Custom summary of article 1 in the list
8.5.2 Automatic article classification
Note: the following corpus is built from the manually screened articles listed in index.csv (loaded above).
# Transform the list of documents into a vectorial corpus
doc.t.vec <- VectorSource(body.t.text$plos)
# Transform the list of documents into a corpus
doc.t.corpus <- Corpus(doc.t.vec)
## Preprocessing
# Removing stopwords
doc.t.corpus <- tm::tm_map(doc.t.corpus, removeWords, stopwords("english"))
# Removing punctuation
doc.t.corpus <- tm::tm_map(doc.t.corpus, removePunctuation)
# Removing numbers
doc.t.corpus <- tm::tm_map(doc.t.corpus, removeNumbers)
# Removing common word endings and stem the words
doc.t.corpus <- tm::tm_map(doc.t.corpus, stemDocument)
# Removing white space
doc.t.corpus <- tm::tm_map(doc.t.corpus, stripWhitespace)
# Tell R to treat the preprocessed documents as text documents
doc.t.corpus <- tm::tm_map(doc.t.corpus, PlainTextDocument)
# Extract the text content from the document corpus
new.data.table <- c()
for (i in 1:length(doc.t.corpus)){
new.data.table[i] <- doc.t.corpus[[i]]$content
}
# Create a new data frame with the text, adding the DOI as identifier and the presence/absence of interactions for training
new.corpus <- data.frame("text" = new.data.table,
"doi" = res.test$DOI,
"inter" = res.test$interaction)
8.5.2.1 With all terms
8.5.2.1.1 Naive Bayes Classification
library(caret)
library(klaR)
library(e1071)
# Define a partition
split = 0.80
trainIndex <- replicate(10,
createDataPartition(new.corpus$inter,
p = split,
list = FALSE)) # 10 replicates for validation
train <- c()
for ( i in 1:10){
train[[i]] <- doc.t.corpus[trainIndex[,,i]]
}
test <- c()
for ( i in 1:10){
test[[i]] <- doc.t.corpus[-trainIndex[,,i]]
}
trainmatrix <- c()
testmatrix <- c()
# Create term document matrix
for( i in 1:10){
trainmatrix[[i]] <- tm::DocumentTermMatrix(train[[i]])
testmatrix[[i]] <- tm::DocumentTermMatrix(test[[i]])
}
# Building the model
model1 <- c()
for (i in 1:10){
model1[[i]] <- naiveBayes( as.matrix ( trainmatrix[[i]] ),
as.factor ( res.test$interaction[trainIndex[,,i]] ) )
}
# predict
results1 <-c()
for ( i in 1:10){
results1[[i]] <- predict(model1[[i]],as.matrix(testmatrix[[i]]))
}
cmatrix <- c()
for ( i in 1:10){
cmatrix[[i]] <- confusionMatrix(results1[[i]],
res.test$interaction [-trainIndex[,,i]])
}
library(ROCR)
nb1 <- list()
for( i in 1:10 ){
nb1[[i]]<-prediction(as.numeric(results1[[i]]),
res.test$interaction[-trainIndex[,,i]])
}
nbauc <- c()
for( i in 1:10 ){
nbauc[i] <- performance(nb1[[i]], "auc")@y.values[[1]] # Calculated AUC
}
mean(unlist(nbauc)) # Mean AUC from naive Bayes
sd(unlist(nbauc))
8.5.2.1.2 Vectorization
library(text2vec)
split = 0.80
trainIndex <- createDataPartition(new.corpus$inter,
p = split, list = FALSE)
train <- new.corpus[trainIndex,]
test <- new.corpus[-trainIndex,]
# Define the preprocessing and tokenization functions
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(as.character(train$text),
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train$doi,
progressbar = FALSE)
# Create vocabulary of terms
vocab <- create_vocabulary(it_train)
# Keep only terms that appear at least 20 times (and at most 1000 times)
vocab <- prune_vocabulary(vocab, term_count_min = 20, term_count_max = 1000)
vectorizer <- vocab_vectorizer(vocab)
# Document term matrix with training articles.
dtm_train <- create_dtm(it_train, vectorizer)
### Building the model
library(glmnet)
NFOLDS = 3 # Number of folds for cross-validation
glmnet_classifier <- cv.glmnet(x = dtm_train, y = train[['inter']],
family = 'binomial',
# L1 penalty
alpha = 1,
# interested in the area under ROC curve
type.measure = "auc",
# 3-fold cross-validation
nfolds = NFOLDS,
# high value is less accurate, but has faster training
thresh = 1e-3,
# again lower number of iterations for faster training
maxit = 1e3)
plot(glmnet_classifier)
mean(glmnet_classifier$cvm) # Mean AUC
max(glmnet_classifier$cvm) # max AUC
sd(glmnet_classifier$cvm)
Once the model has been fitted, its performance can be assessed on the test subset of the corpus.
# Note that most text2vec functions are pipe friendly
it_test <- test$text %>%
prep_fun %>%
tok_fun %>%
itoken(ids = test$doi, progressbar = FALSE)
dtm_test <- create_dtm(it_test, vectorizer)
preds <- predict(glmnet_classifier,
dtm_test, type = 'response')[,1]
auc <- glmnet:::auc(test$inter, preds)
auc
data.frame("pred" = preds, "obs" = test$inter)
8.5.2.2 With thesaurus
8.5.2.2.1 Naive Bayes Classification
# Define a partition
split = 0.80
trainIndex <- replicate(10,
createDataPartition(new.corpus$inter,
p = split, list = FALSE))
# 10 replicates for validation
train <- c()
for ( i in 1:10){
train[[i]] <- new.corpus[trainIndex[,,i],]
}
test <- c()
for ( i in 1:10){
test[[i]] <- new.corpus[-trainIndex[,,i],]
}
# Define the preprocessing and tokenization functions
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- list()
for ( i in 1:10){
it_train[[i]] <- itoken(as.character(train[[i]]$text),
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train[[i]]$doi,
progressbar = FALSE)
}
# Create vocabulary of terms
vocab.pruned <- create_vocabulary(it_train[[1]])
# Keep only the found terms that match terms in the custom dictionary
vocab.pruned$vocab <- vocab.pruned$vocab[(vocab.pruned$vocab$terms %in% frug.n.terms) == TRUE]
vectorizer.prun <- vocab_vectorizer(vocab.pruned)
# Create dtm
dtm_train<- list()
for ( i in 1:10){
dtm_train[[i]] <- create_dtm(it_train[[i]], vectorizer.prun)
}
it_test <- list()
for ( i in 1:10){
it_test[[i]] <- itoken(as.character(test[[i]]$text),
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = test[[i]]$doi,
progressbar = FALSE)
}
dtm_test<- list()
for ( i in 1:10){
dtm_test[[i]] <- create_dtm(it_test[[i]],vectorizer.prun)
}
# Building the model
model2 <- list()
for (i in 1:10){
model2[[i]] <- naiveBayes( as.matrix(dtm_train[[i]]),
as.factor(res.test$interaction [trainIndex[,,i]] ))
}
# predict
results2 <-c()
for ( i in 1:10){
results2[[i]] <- predict(model2[[i]],as.matrix(dtm_test[[i]]))
}
cmatrix2 <- c()
for ( i in 1:10){
cmatrix2[[i]] <- confusionMatrix(results2[[i]],
res.test$interaction [-trainIndex[,,i]])
}
nb2 <- list()
for( i in 1:10 ){
nb2[[i]]<-prediction(as.numeric(results2[[i]]),
res.test$interaction[-trainIndex[,,i]])
}
nbauc2 <- c()
for( i in 1:10 ){
nbauc2[i] <- performance(nb2[[i]], "auc")@y.values[[1]] # Calculated AUC
}
mean(unlist(nbauc2)) # Mean AUC from naive Bayes
sd(unlist(nbauc2))
8.5.2.2.2 Vectorization
# Define a partition
split = 0.80
trainIndex <- createDataPartition(new.corpus$inter,
p = split, list = FALSE)
train <- new.corpus[trainIndex,]
test <- new.corpus[-trainIndex,]
it_train <- itoken(as.character(train$text),
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train$doi,
progressbar = FALSE)
# Create vocabulary of terms
vocab.pruned <- create_vocabulary(it_train)
# Keep only the found terms that match terms in the custom dictionary
vocab.pruned$vocab <- vocab.pruned$vocab[(vocab.pruned$vocab$terms %in% frug.n.terms) == TRUE]
# Keep only terms that appear at least 20 times (and at most 1000 times)
vocab.pruned <- prune_vocabulary(vocab.pruned, term_count_min = 20, term_count_max = 1000)
vectorizer.prun <- vocab_vectorizer(vocab.pruned)
# Create dtm
dtm_train <- create_dtm(it_train, vectorizer.prun)
it_test <- itoken(as.character(test$text),
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = test$doi,
progressbar = FALSE)
dtm_test <- create_dtm(it_test,vectorizer.prun)
NFOLDS = 3 # Number of folds for cross-validation
glmnet_classifier2 <- cv.glmnet(x = dtm_train, y = train[['inter']],
family = 'binomial',
# L1 penalty
alpha = 1,
# interested in the area under ROC curve
type.measure = "auc",
# 3-fold cross-validation
nfolds = NFOLDS,
# high value is less accurate, but has faster training
thresh = 1e-3,
# again lower number of iterations for faster training
maxit = 1e3)
plot(glmnet_classifier2)
mean(glmnet_classifier2$cvm) # mean AUC
max(glmnet_classifier2$cvm) # max AUC
sd(glmnet_classifier2$cvm)
Once the model has been fitted, its performance can be assessed on the test subset of the corpus.
# Note that most text2vec functions are pipe friendly
it_test <- test$text %>%
prep_fun %>%
tok_fun %>%
itoken(ids = test$doi, progressbar = FALSE)
dtm_test <- create_dtm(it_test, vectorizer.prun)
preds2 <- predict(glmnet_classifier2,
dtm_test,
type = 'response')[,1]
auc2 <- glmnet:::auc(test$inter, preds2)
auc2
data.frame("pred" = preds2, "obs" = test$inter)
Comparing model performances
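A minimal sketch of such a comparison, collecting the mean AUC values computed above into a single table:
model.auc <- data.frame(
  "model" = c("Naive Bayes, all terms", "glmnet, all terms",
              "Naive Bayes, thesaurus", "glmnet, thesaurus"),
  "mean.AUC" = c(mean(unlist(nbauc)), mean(glmnet_classifier$cvm),
                 mean(unlist(nbauc2)), mean(glmnet_classifier2$cvm)))
model.auc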
8.6 Specific Entity Extraction
8.6.1 Scientific names
## Function from taxize that links to the Global Names Recognition and Discovery (GNRD) tools
scinames <- c()
for (i in 1:length(body.t.text$plos)){
scinames[[i]] <- taxize::scrapenames(text =paste0(body.t.text$plos[[i]]$body,
collapse = ""), all_data_sources = TRUE)
}
# Extracting only the scientific names from the list
sci.data <- c()
for ( i in 1:length(scinames)){
sci.data$s.n[[i]]<- unique(scinames[[i]]$data$scientificname)
sci.data$doi[[i]]<-res.test$DOI[i]
}
sci.data$s.n[1] # Scientific names from the first article in the list
sci.data$doi[1]
x1<-c()
for ( i in 1:length(q)){
x1[[i]] <- paste(paste(sci.data$s.n[[i]], collapse = "|"),
paste(frug.n.terms, collapse = "|"), collapse = "|")
}
for ( i in 1:length(q)){
x1[[i]] <- gsub(pattern="\\(", replacement="",
x= x1[[i]]) # Erase special character "("
}
o1 <- c()
for ( f in 1:length(q)){
o1$title[[f]] <- res.test$title[f]
o1$doi[[f]] <- res.test$DOI[f]
o1$summary[[f]] <- q[[f]][grepl(x1[[f]], q[[f]])]
}
8.7 Custom summary using the species list and thesaurus as terms.
i = 1:10 # ten first articles
v <- c( "summary"= o1$summary[i])
names(v) <- paste(o1$title[i], "|",
"interaction =",res.test$interaction[i], "|", o1$doi[i])
v[1:10]
x2 <- c()
for ( i in 1:length(q)){
x2[[i]] <- paste(sci.data$s.n[[i]],
collapse = "|")
}
for ( i in 1:length(q)){
x2[[i]] <- gsub(pattern="\\(",
replacement="", x= x2[[i]])
}
o3 <- c()
for ( f in 1:length(q)){
o3$title[[f]] <- res.test$title[f]
o3$doi[[f]] <- res.test$DOI[f]
o3$summary[[f]] <- q[[f]][grepl(x2[[f]], q[[f]])]
}
8.8 Custom Summary using only species names
i = 1:10 # 10 first articles
w <- c( "summary"= o3$summary[i])
names(w) <- paste(o3$title[i], "|",
"interaction =",res.test$interaction[i],
"|", o3$doi[i])
w[1:10]
8.8.1 Geographical locations
library(monkeylearn)
# loop for extracting location info from the corpus of text.
key = "cf0b9da7695ba68256cd61ee7fe04cbf84ae4ede"
extractor_id = "ex_isnnZRbS"
# Mine contents matching the extractor id
out1<-c()
for ( i in 1:length(doc.t.corpus)) {
out1[[i]] <- monkeylearn_extract(doc.t.corpus[[i]]$content,
extractor_id = extractor_id,
key=key)
}
# Extract the locations mentioned in each article, together with their counts
location1<-c()
for (i in 1:length(doc.t.corpus)) {
location1$loc[[i]] <- out1[[i]]$entity[out1[[i]]$tag=="LOCATION"]
location1$count[[i]] <- out1[[i]]$count[out1[[i]]$tag=="LOCATION"]
}
library(reshape2)
# Create a new dataset with locations and incorporate titles + DOI
loc <- melt(location1$loc)
names(loc) <- c("location", "article")
titles <- melt(body.t.title$plos)
names(titles) <- c("title", "erase", "doi")
titles$erase <- NULL
# Add titles and dois to the dataset
loc$title <- titles$title[loc$article]
loc$doi <- titles$doi[loc$article]
# adds scientific names
snames <- melt(sci.data$s.n)
names(snames) <- c("sci.name","article")
# Loop to arrange scientific names with HTML line breaks (for the map popups)
java.names <- c()
for ( i in loc$article){
java.names$names[i] <- paste(snames$sci.name[snames$article == i],
collapse = "<br/>")
java.names$art[i] <- i
}
java.names$doi <- loc$doi[match(java.names$art,loc$article)]
# Adds DOI's (links)
java.names <- data.frame(java.names) # Makes dataframe
loc$snames <- java.names$names[match(loc$doi,java.names$doi)] # Adds sci.names to the dataframe
library(ggmap)
geocodes <- geocode(as.character(loc$location)
,messaging = FALSE)
# Google API quota of 2500 queries per day; the DSK server works more slowly
geocodes$title <- loc$title # Adds titles
geocodes$doi <- loc$doi # Adds doi's
geocodes$location <- loc$location # Adds location info
geocodes$snames <- loc$snames
clean.geocodes <- geocodes[complete.cases(geocodes),] # Remove rows for which coordinates were not available (or the location lookup failed)
Running geocodeQueryCheck() gives the number of queries still available for the geocode function.
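For example, before geocoding a long list of locations:
geocodeQueryCheck() # Reports how many geocoding queries remain for the day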
With the locations, scientific names, and DOIs arranged into a single data frame, we can plot them on an interactive map using the leaflet package. The user can then make a selection of articles based either on specific taxa of interest or on a particular geographic extent. Clicking on each circle pops up the link to the article and the scientific names it contains.
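The popup_style object used in the map calls below is not defined elsewhere in this supplement; a minimal sketch, stored as a column of clean.geocodes so that it survives the spatial subsetting further down, could be:
# HTML popup content: article title, a DOI link, and the scientific names
# recovered from each article (assumes the clean.geocodes data frame above)
clean.geocodes$popup_style <- paste0("<b>", clean.geocodes$title, "</b><br/>",
                                     "<a href='https://doi.org/", clean.geocodes$doi, "'>",
                                     clean.geocodes$doi, "</a><br/>",
                                     clean.geocodes$snames)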
leaflet(clean.geocodes) %>% addTiles() %>% addCircles(popup = ~popup_style, color = "orange")
# Subset by extent coordinates, e.g. India
extent <- c(70, 90, 0, 35) # in the form c(lon_min, lon_max, lat_min, lat_max)
clipped <- clean.geocodes[clean.geocodes$lon >= extent[1] & clean.geocodes$lon <= extent[2],]
clipped <- clipped[clipped$lat >= extent[3] & clipped$lat <= extent[4],]
Map
leaflet(clipped) %>% addTiles() %>% # Add default OpenStreetMap map tiles
addCircles(popup = ~popup_style, color = "orange")
unique(clipped$title) # Selected titles with mentions of locations inside India in the text