8 Supplement

8.1 Requirements

library(devtools) # Tools to Make Developing R Packages Easier
library(pdftools) # Tools to manipulate pdfs
library(taxize) # Taxonomic names management
library(monkeylearn) # Machine learning API
library(fulltext) # Retrieve full text of scientific articles from (some) open sources 
library(SnowballC) # An R interface to the C libstemmer library, word recognition
library(NLP) # Requirement from tm package
library(tm) # Text mining package
library(NCmisc) # To handle large list visualization 
library(stringr) # Handle strings 
library(reshape2) # Handle matrix datasets 
library(e1071) # Misc functions from the Department of Statistics, Probability Theory Group, TU Wien, Austria 
library(DT) # Interactive tables
library(corrplot) # Nice correlation plots
library(text2vec) # Framework for natural language processing 
library(glmnet) # Generalized linear models 
library(ggmap) # Geolocations and connection with Google Maps / Google Earth API
library(leaflet) # Nice maps as html widgets
library(qdap) # Text splitting 
library(pander) # markdown tables
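# Additional packages loaded later in this supplement
library(caret) # Classification and regression training (data partitions, confusion matrices)
library(klaR) # Classification and visualization
library(ROCR) # Visualizing the performance of scoring classifiers
library(tesseract) # Bindings to the Tesseract OCR engine
library(europepmc) # R interface to the Europe PMC web service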

Readers interested in replicating the framework presented in this literature review, whether to test it or to adapt it to their own use, should note that most, if not all, of the computational tools presented herein are written within the R environment. It is recommended to keep R continuously updated to its latest version and to work within a development environment such as RStudio.

8.2 Search articles in a programmatic manner

8.2.1 Fulltext package for R

With the function ft_search, the following data is retrieved for each supported source:

$source = Host source
$found = Number of articles found
$data = Articles matching the keywords of the query. The information contained in this slot corresponds to a list with each article's DOI.

The “limit” argument is capped at 1000 articles retrieved in a single call. When the argument is not specified, the function retrieves the first 10 results. When “limit” is set to 0, only the metadata are retrieved. The result is a large list in which each slot corresponds to one of the sources supported in the fulltext package (PLOS, Crossref, Entrez, arXiv, bioRxiv, and Europe PMC), and the results of the query are stored in the respective slot. See ?ft_search for more information.
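For instance, a metadata-only call could look like the sketch below (the slot names follow the description above; the object name meta is illustrative):

meta <- ft_search(query = c("interactions + frugivor*"), limit = 0, from = "plos")
meta$plos$source # Host source
meta$plos$found  # Number of articles found for the query
meta$plos$data   # Empty here, since limit = 0 retrieves only the metadata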

The ft_get function retrieves the full text for the search results. However, the resulting text is formatted for publication as XML. To make it “more” machine readable, the fulltext package provides the function chunks, which is designed to facilitate the extraction of different sections of an article. Those sections (e.g. title, authors, abstract) can be selected with the “what” parameter of the function. As an additional feature, the ft_get function can be fed with a list of DOI’s. Some useful metadata (when available) can also be extracted (see the package documentation).

library(fulltext)
library(europepmc)

Querying and fetching articles from PLOS, arXiv, Crossref and Europe PMC

query1 <- ft_search( query = c("interactions + frugivor*"), 
                     limit = 1000, from = c("plos", "arxiv"))
query1 # plos and arxiv
query2 <- ft_search( query = c("interactions + frugivor*"), 
                     limit = 1000, from = c("crossref"))
query2 # crossref

query3 <- europepmc::epmc_search(query = c("interactions + frugivor*"))
query3# europepmc
doi <- data.frame("doi"=c(query1$plos$data$id, 
                          query1$arxiv$data$id,
                          query2$crossref$data$doi,query3$doi),
                  "source"= c(rep("plos",
                                  length(query1$plos$data$id)),
                              c(rep("arxiv", length(query1$arxiv$data$id))),
                              rep("crossref",length(query2$crossref$data$url)), 
                              rep("europepmc", length(query3$doi))))

doi <- unique(doi) # Clean repeated dois
length(doi$doi) # Retrieved dois for the query interactions + frugivor 
par(las=1)
plot(as.factor(doi$source), col = "orange",
     ylab = "# of DOI's retrieved",
     xlab = "Source",
     main = "query = Interaction + frugivor*")

Google Scholar via the Sci-Hub API

Note that the code below is written in Python and requires scihub.py to be loaded.

# Assumes scihub.py has been loaded, e.g. with: from scihub import SciHub
sh = SciHub()
# The number of search terms depends on the needs of the user.
names = ["Search term 1", "Search term 2", "Search term 3"]

results = sh.search( names, 300)

results['papers']

# "Get urls" 
for paper in results['papers']:   
      print(paper['url'])

# "Get titles"
for paper in results['papers']:    
      print(paper['name'])  

8.3 Downloading articles

Getting articles from open sources

# From PLOS
# Retrieve XMLs (first 100 articles in the list)
plos <- ft_get(query1$plos$data$id[1:100], from = "plos")
# Fetch only the body of text
plos.text <- fulltext::chunks(plos, what = "body") 
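Other sections can be selected through the “what” argument; a brief sketch (the object name plos.meta is illustrative):

plos.meta <- fulltext::chunks(plos, what = c("title", "abstract")) # e.g. titles and abstracts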


# From Crossref 
# These URLs can be pasted into a browser for PDF download via Sci-Hub (with captchas)
link <- paste("http://sci-hub.io/", query2$crossref$data$doi, sep = "")
head(link) 
tail(link)

Getting articles from Google Scholar

Note: scholar_links.csv was obtained with SciHub.py

# Links for URLs retrieved with scihub.py
links <-read.csv("scholar_links.csv",
                 header =T, stringsAsFactors = FALSE) 
links$Links
links$Title

# Pruning bad links
 # Erase broken links from "Academia.edu"
academia <- grep("academia", links$Links) 

clean.link <- links[-academia,]
clean.link <- clean.link[-grep("scholar.google.com", clean.link$Links), ] 


pdf <- grep("pdf", clean.link$Links) 
# Keep only the links that point to a PDF (drop citations and other links that do not contain PDFs)


# Download the articles into the working directory folder, with the title of the article as the file name. 

for (i in 1:length(pdf)){
  print(i)
  download.file(clean.link$Links[pdf[i]],
                destfile = paste0(clean.link$Title[pdf[i]], ".pdf"),
                mode = "wb") # Binary mode so the downloaded PDFs are not corrupted
}

8.3.0.1 Transforming PDF to text.

With the pdftools package

# For documents retrieved from google scholar or with a custom list of pdfs
library(pdftools)

# File names in the folder where the PDFs are stored
files <- list.files("Scholar/pdfs/") 
# Build the full path to each file
files <- paste("Scholar/pdfs/", files, sep = "")

text_scholar <- list()
for (i in 1:length(files)) {
  print(i)
  text_scholar$text[[i]] <- pdf_text(files[i]) # Extract the embedded text layer (no OCR)
  text_scholar$title[[i]] <- files[i]
}

With the fulltext package

path <- list.files("PDFs/")
path <- paste0("PDFs/", path)

kals <- list()

for (i in 1:length(path)){
  kals[[i]] <- tryCatch(ft_extract(path[i]),
                        error = function(e) NULL) # Skip files that cannot be parsed
}

kals[[2]]$data # PDF is stored as an image, probably a scan. 

With Google Tesseract

library(tesseract)

pdf <- file.path("PDFs/File2.pdf")
# Render the first page of the PDF as a bitmap
bitmap <- pdf_render_page(pdf, dpi = 300, numeric = TRUE)
# Write the bitmap to a tiff image
tiff::writeTIFF(bitmap, "page.tiff")

out <- ocr("page.tiff") # Perform OCR
cat(out)

8.4 Create a corpus

For this example, only the articles from PLOS will be used.

library(tm)
# Create a corpus for mining (as an example, only with the list of articles from PLOS)
# Transform the list of documents into a vectorial corpus
doc.vec <- VectorSource(plos.text$plos)
# Transform the list of documents into a corpus
doc.corpus <- Corpus(doc.vec)
doc.corpus # Corpus with articles downloaded from PLOS
# The same can be done with the articles from Scholar (kept here in a separate corpus)
doc.corpus.scholar <- Corpus(VectorSource(text_scholar$text))

8.4.1 Preprocessing

# Removing punctuation
doc.corpus <- tm_map(doc.corpus, removePunctuation)
# Removing numbers                                                  
doc.corpus <- tm_map(doc.corpus, removeNumbers)                           
# Removing stopwords                                                
doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))       
# Removing white space                                              
doc.corpus <- tm_map(doc.corpus, stripWhitespace) 
# Removing common word endings and stemming words to their roots
doc.corpus <- tm_map(doc.corpus, stemDocument)     
# Tell R to treat the preprocessed documents as text documents
doc.corpus <- tm_map(doc.corpus, PlainTextDocument)

8.4.2 Document term matrix

dtm <- tm::DocumentTermMatrix(doc.corpus)
dtm

8.4.3 Building a Thesaurus

A custom thesaurus can be built by extracting terms from a collection of targeted articles, for instance as sketched below.
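A minimal sketch of how a candidate term list like freq.2.train.csv could be produced, assuming the document term matrix dtm built above (the frequency threshold and column names are illustrative):

# Candidate terms: frequent stemmed terms from the targeted corpus
freq.terms <- tm::findFreqTerms(dtm, lowfreq = 50)
candidates <- data.frame("term" = freq.terms, "train" = NA) # "train" to be labelled by hand ("Y"/"N")
# write.csv(candidates, "freq.2.train.csv", row.names = FALSE)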

library(DT)
y.n.terms <- read.csv("freq.2.train.csv", header = T)

DT::datatable(y.n.terms,
              caption = "Terms extracted from a corpus of a 100 articles")

Focusing on the terms related to frugivory interactions

DT::datatable(y.n.terms[y.n.terms$train == "Y",], 
              caption = "Terms related with frugivory extracted from a corpus of a 100 articles")

frug.n.terms <- as.character(droplevels(y.n.terms$term[y.n.terms$train == "Y"]))

8.5 Automated Content Analysis

With a DOI list we can retrieve the full text of articles that have been manually screened for the presence of interactions. The articles were obtained with the ft_get function, come from PLOS, and correspond to articles 101-200 of the results of a search performed on 17 Nov 2016.

res.test <- read.csv("index.csv", header=TRUE, 
                     stringsAsFactors = FALSE) 
# List of articles from PLOS which had been manually screened for the presence of interactions

length(res.test$DOI)
length(res.test$DOI[res.test$interaction==1]) # Presence of frugivory interaction
length(res.test$DOI[res.test$interaction==0]) # Absence of frugivory interaction
# Get the full text from the list of DOI's 
test.text <- ft_get(res.test$DOI, from = "plos")

# Get the full text from document (into list of lists)
body.t.text <- fulltext::chunks(test.text, what = "body")
body.t.title <- fulltext::chunks(test.text, what = "title")

8.5.1 Customized Summary using regular expressions

The content of an article can be summarized based on the specific interests of the researcher. For this example we are interested in the pieces of the articles that are related to frugivory. Mentions of frugivory interactions in the text will most likely share the same sentence with the terms used to describe frugivory. Hence, we can extract only the pieces of the article in which frugivory-related terms have been used. Using regular expressions we can define a custom function to retrieve the sentences that match the terms of a given vocabulary (the frugivory vocabulary in this case).

library(qdap)


q <- c()

for (i in 1:length(body.t.text$plos)){
  q[[i]] <- sent_detect(body.t.text$plos[[i]]$body) # Break the text into sentences
}

# Retrieve the sentences which contain the custom terms 
x <- paste(frug.n.terms, collapse = "|")
o <- c()
for ( f in 1:length(q)){ 
 o[[f]] <- q[[f]][grepl(x, q[[f]])] 
              }
o[[1]] # Custom summary of article 1 in the list

8.5.2 Automatic article classification

Note: the following corpus will be created with the manually screened articles (freq.2.train.csv)

# Transform the list of documents into a vectorial corpus
doc.t.vec <- VectorSource(body.t.text$plos)
# Transform the list of documents into a corpus
doc.t.corpus <- Corpus(doc.t.vec)
## Preprocessing                                                    
# Removing stopwords                                                
doc.t.corpus <- tm::tm_map(doc.t.corpus, removeWords, stopwords("english"))       
# Removing punctuation
doc.t.corpus <- tm::tm_map(doc.t.corpus, removePunctuation)                       
# Removing numbers                                                  
doc.t.corpus <- tm::tm_map(doc.t.corpus, removeNumbers)                           
# Removing common word endings and stem the words                              
doc.t.corpus <- tm::tm_map(doc.t.corpus, stemDocument)                            
# Removing white space                                              
doc.t.corpus <- tm::tm_map(doc.t.corpus, stripWhitespace)                         
# Tell R to treat the preprocessed documents as text documents
doc.t.corpus <- tm::tm_map(doc.t.corpus, PlainTextDocument)                       
# Extract the text content from the document corpus
new.data.table <- c()

for (i in 1:length(doc.t.corpus)){
  new.data.table[i] <- doc.t.corpus[[i]]$content
  
}

# Create a new data frame with the text, adding the DOI as identifier and the values of presence / absence of interactions for training. 

new.corpus <- data.frame("text" = new.data.table, 
                         "doi" = res.test$DOI, 
                         "inter" = res.test$interaction)

8.5.2.1 With all terms

8.5.2.1.1 Naive Bayes Classification
library(caret)
library(klaR)
library(e1071)
# Define a partition

split = 0.80
trainIndex <- replicate(10,
                        createDataPartition(new.corpus$inter,
                                            p = split, 
                                            list = FALSE)) # 10 replicates for validation

train <- c()
for ( i in 1:10){
train[[i]] <- doc.t.corpus[trainIndex[,,i]]
}

test <- c()
for ( i in 1:10){
test[[i]] <- doc.t.corpus[-trainIndex[,,i]]
}

trainmatrix <- c()
testmatrix <- c()
# Create term document matrix 
for( i in 1:10){
trainmatrix[[i]] <- tm::DocumentTermMatrix(train[[i]])
testmatrix[[i]] <- tm::DocumentTermMatrix(test[[i]])
}
# Building the model
model1 <- c()
for (i in 1:10){
model1[[i]] <- naiveBayes( as.matrix ( trainmatrix[[i]] ),
                           as.factor ( res.test$interaction[trainIndex[,,i]] ) )
}
# predict
results1 <-c()
for ( i in 1:10){
results1[[i]] <- predict(model1[[i]],as.matrix(testmatrix[[i]]))
}

cmatrix <- c()
for ( i in 1:10){
cmatrix[[i]] <- confusionMatrix(results1[[i]],
                                res.test$interaction [-trainIndex[,,i]])
} 
library(ROCR)
nb1 <- list()
for( i in 1:10 ){
nb1[[i]]<-prediction(as.numeric(results1[[i]]), 
                     res.test$interaction[-trainIndex[,,i]])
}

nbauc <- c()
for( i in 1:10 ){
nbauc[i] <- performance(nb1[[i]],"auc")@y.values # Calculated AUC
}

mean(unlist(nbauc)) # Mean AUC from naivebayes
sd(unlist(nbauc))
8.5.2.1.2 Vectorization
library(text2vec)

split = 0.80
trainIndex <- createDataPartition(new.corpus$inter,
                                  p = split, list = FALSE)

train <- new.corpus[trainIndex,]
test <- new.corpus[-trainIndex,]
# Define the preprocessing and tokenization functions
prep_fun <- tolower
tok_fun <- word_tokenizer

it_train <- itoken(as.character(train$text), 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = train$doi, 
             progressbar = FALSE)

# Create vocabulary of terms 
vocab <- create_vocabulary(it_train)

# Keep only terms that appear between 20 and 1000 times
vocab <- prune_vocabulary(vocab, term_count_min = 20, term_count_max = 1000)
vectorizer <- vocab_vectorizer(vocab)
# Document term matrix with training articles. 
dtm_train <- create_dtm(it_train, vectorizer) 
### Building the model
library(glmnet)
NFOLDS = 3 # Number of folds for crossvalidation

glmnet_classifier <- cv.glmnet(x = dtm_train, y = train[['inter']],
                              family = 'binomial',
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 3-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
plot(glmnet_classifier)
mean(glmnet_classifier$cvm) # Mean AUC
max(glmnet_classifier$cvm) # max AUC
sd(glmnet_classifier$cvm) 

Once the model has been fitted, its performance can be assessed on the test subset of the corpus.

# Note that most text2vec functions are pipe friendly
it_test <- test$text %>%
  prep_fun %>%
  tok_fun %>%
  itoken(ids = test$doi, progressbar = FALSE)

dtm_test <- create_dtm(it_test, vectorizer)
preds <- predict(glmnet_classifier, 
                 dtm_test, type = 'response')[,1]
auc <- glmnet:::auc(test$inter, preds)
auc
data.frame("pred" = preds, "obs" = test$inter)

8.5.2.2 With thesaurus

8.5.2.2.1 Naive Bayes Classification
# Define a partition

split = 0.80
trainIndex <- replicate(10,
                        createDataPartition(new.corpus$inter, 
                                            p = split, list = FALSE)) 
# 10 replicates for validation


train <- c()
for ( i in 1:10){
train[[i]] <- new.corpus[trainIndex[,,i],]
}

test <- c()
for ( i in 1:10){
test[[i]] <- new.corpus[-trainIndex[,,i],]
}


# Define the preprocessing and tokenization functions
prep_fun <- tolower
tok_fun <- word_tokenizer

it_train <- list()
for ( i in 1:10){
it_train[[i]] <- itoken(as.character(train[[i]]$text), 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = train[[i]]$doi, 
             progressbar = FALSE)
}

# Create vocabulary of terms 
vocab.pruned <- create_vocabulary(it_train[[1]])

# Keep only the extracted terms that match terms in the custom dictionary 

vocab.pruned$vocab <- vocab.pruned$vocab[(vocab.pruned$vocab$terms %in%  frug.n.terms) == TRUE]

vectorizer.prun <- vocab_vectorizer(vocab.pruned)

# Create dtm
dtm_train<- list()
for ( i in 1:10){
dtm_train[[i]] <- create_dtm(it_train[[i]], vectorizer.prun) 
}

it_test <- list()
for ( i in 1:10){
it_test[[i]] <- itoken(as.character(test[[i]]$text), 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = test[[i]]$doi, 
             progressbar = FALSE)
}

dtm_test<- list()
for ( i in 1:10){
dtm_test[[i]] <- create_dtm(it_test[[i]],vectorizer.prun)
}
# Building the model

model2 <- list()
for (i in 1:10){
model2[[i]] <- naiveBayes( as.matrix(dtm_train[[i]]), 
                           as.factor(res.test$interaction [trainIndex[,,i]] ))
}

# predict
results2 <-c()
for ( i in 1:10){
results2[[i]] <- predict(model2[[i]],as.matrix(dtm_test[[i]]))
}

cmatrix2 <- c()
for ( i in 1:10){
cmatrix2[[i]] <- confusionMatrix(results2[[i]],
                                 res.test$interaction [-trainIndex[,,i]])
} 
nb2 <- list()
for( i in 1:10 ){
nb2[[i]]<-prediction(as.numeric(results2[[i]]),
                     res.test$interaction[-trainIndex[,,i]])
}

nbauc2 <- c()
for( i in 1:10 ){
nbauc2[i] <- performance(nb2[[i]],"auc")@y.values # Calculated AUC
}

mean(unlist(nbauc2)) # Mean AUC from naivebayes
sd(unlist(nbauc2))
8.5.2.2.2 Vectorization
# define a particion

split = 0.80
trainIndex <- createDataPartition(new.corpus$inter, 
                                  p = split, list = FALSE)

train <- new.corpus[trainIndex,]
test <- new.corpus[-trainIndex,]

it_train <- itoken(as.character(train$text), 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = train$doi, 
             progressbar = FALSE)

# Create vocabulary of terms 
vocab.pruned <- create_vocabulary(it_train)

# Keep only the extracted terms that match terms in the custom dictionary 

vocab.pruned$vocab <- vocab.pruned$vocab[(vocab.pruned$vocab$terms %in%  frug.n.terms) == TRUE]

# Keep only terms that appear between 20 and 1000 times
vocab.pruned <- prune_vocabulary(vocab.pruned, term_count_min = 20, term_count_max = 1000)
vectorizer.prun <- vocab_vectorizer(vocab.pruned)
# Create dtm
dtm_train <- create_dtm(it_train, vectorizer.prun) 

it_test <- itoken(as.character(test$text), 
             preprocessor = prep_fun, 
             tokenizer = tok_fun, 
             ids = test$doi, 
             progressbar = FALSE)
dtm_test <- create_dtm(it_test,vectorizer.prun)
NFOLDS = 3 # Number of folds for crossvalidation

glmnet_classifier2 <- cv.glmnet(x = dtm_train, y = train[['inter']],
                              family = 'binomial',
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 3-fold cross-validation (NFOLDS)
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
plot(glmnet_classifier2)
mean(glmnet_classifier2$cvm) # mean AUC
max(glmnet_classifier2$cvm) # max AUC
sd(glmnet_classifier2$cvm)

Once the model has been fitted, its performance can be assessed on the test subset of the corpus.

# Note that most text2vec functions are pipe friendly
it_test <- test$text %>%
  prep_fun %>%
  tok_fun %>%
  itoken(ids = test$doi, progressbar = FALSE)

dtm_test <- create_dtm(it_test, vectorizer.prun)

preds2 <- predict(glmnet_classifier2, 
                  dtm_test, 
                  type = 'response')[,1]
auc2 <- glmnet:::auc(test$inter, preds2)
auc2
data.frame("pred" = preds2, "obs" = test$inter)

Comparing model performances
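A brief sketch collecting the mean AUCs obtained above into a single table (object names as defined in the previous subsections; the table layout itself is illustrative):

model.auc <- data.frame(
  "model" = c("Naive Bayes (all terms)", "glmnet (all terms)",
              "Naive Bayes (thesaurus)", "glmnet (thesaurus)"),
  "mean.AUC" = c(mean(unlist(nbauc)), mean(glmnet_classifier$cvm),
                 mean(unlist(nbauc2)), mean(glmnet_classifier2$cvm)))
pander::pander(model.auc) # Markdown table comparing the classifiers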

8.6 Specific Entity Extraction

8.6.1 Scientific names

## Function from taxize that links to the Global Names Recognition and Discovery (GNRD) tools

scinames <- c()

  for (i in 1:length(body.t.text$plos)){
    scinames[[i]] <- taxize::scrapenames(text =paste0(body.t.text$plos[[i]]$body,
                                                      collapse = ""), all_data_sources = TRUE)
  }


# Extracting only the scientific names from the list

sci.data <- c()
  for ( i in 1:length(scinames)){
  sci.data$s.n[[i]]<-  unique(scinames[[i]]$data$scientificname)
  sci.data$doi[[i]]<-res.test$DOI[i]
  }
sci.data$s.n[1] # Scientific names from the first article in the list
sci.data$doi[1]

x1<-c()

for ( i in 1:length(q)){
  x1[[i]] <- paste(paste(sci.data$s.n[[i]], collapse = "|"), 
                   paste(frug.n.terms, collapse = "|"), collapse = "|")
}

for ( i in 1:length(q)){
x1[[i]] <- gsub(pattern="\\(", replacement="", 
                x= x1[[i]]) # Remove the special character "(" so it is not treated as a regex metacharacter
}


o1 <- c()


for ( f in 1:length(q)){ 
  o1$title[[f]] <- res.test$title[f]
  o1$doi[[f]] <- res.test$DOI[f]
  o1$summary[[f]] <- q[[f]][grepl(x1[[f]], q[[f]])] 
              }

8.7 Custom summary using the species list and thesaurus as terms.

i = 1:10 # ten first articles

v <- c( "summary"= o1$summary[i])

names(v) <- paste(o1$title[i], "|", 
                  "interaction =",res.test$interaction[i], "|", o1$doi[i])
v[1:10]
x2 <- c()

for ( i in 1:length(q)){
  x2[[i]] <- paste(sci.data$s.n[[i]], 
                   collapse = "|")
}

for ( i in 1:length(q)){
x2[[i]] <- gsub(pattern="\\(",
                replacement="", x= x2[[i]])
}


o3 <- c()
for ( f in 1:length(q)){ 
 o3$title[[f]] <- res.test$title[f]
 o3$doi[[f]] <- res.test$DOI[f]
 o3$summary[[f]] <-  q[[f]][grepl(x2[[f]], q[[f]])] 
}

8.8 Custom Summary using only species names

i = 1:10 # 10 first articles

w <- c( "summary"= o3$summary[i])

names(w) <- paste(o3$title[i], "|", 
                  "interaction =",res.test$interaction[i],
                  "|", o3$doi[i])
w[1:10]

8.8.1 Geographical locations

library(monkeylearn)
 # loop for extracting location info from the corpus of text.
key = "cf0b9da7695ba68256cd61ee7fe04cbf84ae4ede"
extractor_id = "ex_isnnZRbS"
# Mine contents matching the extractor id
out1<-c()

  for ( i in 1:length(doc.t.corpus)) {
    out1[[i]] <- monkeylearn_extract(doc.t.corpus[[i]]$content, 
                                     extractor_id = extractor_id,
                                     key=key)
  }
  


  # Extract the location entities and their mention counts for each article
location1<-c()
  for (i in 1:length(doc.t.corpus)) { 
    location1$loc[[i]] <- out1[[i]]$entity[out1[[i]]$tag=="LOCATION"]
     location1$count[[i]] <- out1[[i]]$count[out1[[i]]$tag=="LOCATION"]
  }
library(reshape2)
# Create a new dataset with locations and incorporate titles +  DOI
loc <- melt(location1$loc)

names(loc) <- c("location", "article")


titles <- melt(body.t.title$plos)
names(titles) <- c("title", "erase", "doi")
titles$erase <- NULL

# Add titles and dois to the dataset 
loc$title <- titles$title[loc$article]
loc$doi <- titles$doi[loc$article]  

# adds scientific names
snames <- melt(sci.data$s.n)
names(snames) <- c("sci.name","article")

# Loop to collapse the scientific names with HTML line breaks (for the leaflet popups)

java.names <- c()

for ( i in loc$article){
  java.names$names[i] <- paste(snames$sci.name[snames$article == i],
                               collapse = "<br/>")
  java.names$art[i] <- i 
}

java.names$doi <- loc$doi[match(java.names$art,loc$article)]
# Adds DOI's (links)

java.names <- data.frame(java.names) # Makes dataframe

loc$snames <- java.names$names[match(loc$doi,java.names$doi)] # Adds sci.names to the dataframe 
library(ggmap)
geocodes <- geocode(as.character(loc$location)
                    ,messaging = FALSE) 
# Quota of 2500 queries per day for the Google API; the DSK server works but is slower 
geocodes$title <- loc$title # Adds titles
geocodes$doi <- loc$doi # Adds doi's
geocodes$location <- loc$location # Adds location info
geocodes$snames <- loc$snames

clean.geocodes <- geocodes[complete.cases(geocodes),] # Remove rows for which coordinates were not available (or the location produced an error)

Running geocodeQueryCheck() returns the number of geocoding queries still available to the geocode function.
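For example (assuming the ggmap version used here provides the function):

ggmap::geocodeQueryCheck() # Number of geocoding queries remaining for the day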

With the locations, scientific names and DOI’s arranged into a single data frame, we can plot them on an interactive map using the leaflet package. The user can then select articles based either on a specific taxon of interest or on a particular geographic extent. Clicking on each circle pops up a list with the link to the article and the scientific names included in it.
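The object popup_style used below is not defined in this supplement; a minimal sketch, assuming it is an HTML string built from the title, a DOI link and the species names stored in clean.geocodes:

popup_style <- paste0("<b>", clean.geocodes$title, "</b><br/>",
                      "<a href='https://doi.org/", clean.geocodes$doi, "'>",
                      clean.geocodes$doi, "</a><br/>",
                      clean.geocodes$snames)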

leaflet(clean.geocodes) %>% addTiles() %>% addCircles(popup = popup_style, color = "orange")
# subset by extent coordinates, e.g. INDIA 

extent <- c(70, 90, 0, 35) # In the form: (lon_min, lon_max, lat_min, lat_max) 

clipped <- clean.geocodes[clean.geocodes$lon >= extent[1] & clean.geocodes$lon <= extent[2],]
clipped <- clipped[clipped$lat >= extent[3] & clipped$lat <= extent[4],]

Map

leaflet(clipped)%>% addTiles() %>%  # Add default OpenStreetMap map tiles
  addCircles(popup = popup_style,color = "orange")
unique(clipped$title) # Selected titles with mentions of locations inside INDIA in the text