4.1 Lab: Working with a SQL database
4.1.1 Querying an SQL database
Now that we have our tables in the database, let’s see how we can query them. First we connect using dbConnect
and then we type our query with dbGetQuery
. Generally, we will write the query clauses
in capital letters and the column names in lowercase.
<- dbConnect(RSQLite::SQLite(), "data_tweets.sqlite")
db <- dbGetQuery(db, 'SELECT *
test FROM table_tweets_en
LIMIT 5')
str(test)
This is the list of query clauses that we will use below (you always need SELECT
and FROM
):
SELECT
: determines which columns to include in the query’s result setFROM
: identifies the tables from which to draw data and how tables should be joinedLIMIT
: in combination with SELECT, limits the number of records returnedWHERE
: filter out unwanted dataGROUP BY
: groups rows together by common column valuesORDER BY
: sorts the rows in the final result set by one or more columns
Let’s start with some examples of SELECT
:
# Get column names
dbListFields(db, "table_tweets")
dbListFields(db, "table_tweets_en")
# Check number of rows
dbGetQuery(db, "SELECT COUNT(*)
FROM table_tweets_en")
# querying just one column
dbGetQuery(db, "SELECT text
FROM table_tweets_en
LIMIT 10")
# multiple columns
dbGetQuery(db, "SELECT screen_name, text
FROM table_tweets_en
LIMIT 10")
# adding expressions
dbGetQuery(db, "SELECT text, UPPER(screen_name) as SCREEN_NAME
FROM table_tweets_en
LIMIT 5")
# adding aliases to the new columns
dbGetQuery(db, "SELECT tweet_created_at, text, text AS story
FROM table_tweets_en
LIMIT 5")
# it's often a good idea to write a query in multiple lines so that it's easier to read it:
dbGetQuery(db, "SELECT text,
text AS story
FROM table_tweets_en
LIMIT 10")
# selecting unique values from a column
dbGetQuery(db, "SELECT DISTINCT screen_name
FROM table_tweets_en
LIMIT 100")
Now some examples of how we can use WHERE
to subset the database based on values of specific columns:
# selecting based on values of a column
dbGetQuery(db, "SELECT screen_name, text, tweet_created_at
FROM table_tweets_en
WHERE screen_name = 'SenatorEMurphy'
LIMIT 10")
dbGetQuery(db, "SELECT screen_name, text, tweet_created_at
FROM table_tweets_en
WHERE screen_name = 'SenatorEMurphy'")
# greater than, working with dates
dbGetQuery(db, "SELECT screen_name, text, tweet_created_at
FROM table_tweets_en
WHERE tweet_created_at > '2020-04-27'")
# AND operator
dbGetQuery(db, "SELECT screen_name, text, tweet_created_at, retweet_count
FROM table_tweets_en
WHERE tweet_created_at > '2020-04-20' AND screen_name == 'SenatorEMurphy'
LIMIT 10")
# OR operator
dbGetQuery(db, "SELECT screen_name, text, retweet_count
FROM table_tweets_en
WHERE retweet_count>100 AND (screen_name = 'SenatorEMurphy' OR screen_name = 'Rees4Neath')
LIMIT 100")
# membership, IN
dbGetQuery(db, "SELECT screen_name, tweet_created_at, text
FROM table_tweets_en
WHERE screen_name IN ('SenatorEMurphy', 'Rees4Neath')
LIMIT 10")
# matching conditions with LIKE:
# _ matches exactly one character
# % matches any number of characters
dbGetQuery(db, "SELECT screen_name, tweet_created_at_date, text
FROM table_tweets_en
WHERE tweet_created_at_date LIKE '2013-01-__'
LIMIT 10")
# Exclude retweets (they start with 'RT ')
dbGetQuery(db, "SELECT screen_name, tweet_created_at_date, text
FROM table_tweets_en
WHERE text NOT LIKE 'RT %'
LIMIT 10")
dbGetQuery(db, "SELECT screen_name, tweet_created_at_date, text
FROM table_tweets_en
WHERE tweet_created_at_date LIKE '2013-03%'
LIMIT 10")
# SQLite does not have regular expressions, but we can get creative...
dbGetQuery(db, "SELECT screen_name, tweet_created_at, text, screen_name
FROM table_tweets_en
WHERE text LIKE '%Facebook%'
LIMIT 5")
Very useful for us – GROUP BY
allows us to aggregate results based on values of a specific variables. We will come back to this later, so don’t worry about it for now.
# What does the code below do?
dbGetQuery(db,
"SELECT screen_name, COUNT(screen_name) AS tweet_count, tweet_created_at_date
FROM table_tweets_en
GROUP BY screen_name, tweet_created_at_date
LIMIT 10")
And we can then order the results using ORDER BY
. The default is alphabetical order for the first column, but we can change that:
# sort by tweet count
dbGetQuery(db,
"SELECT screen_name, COUNT(screen_name) AS tweet_count, tweet_created_at_date
FROM table_tweets_en
GROUP BY screen_name, tweet_created_at_date
ORDER BY tweet_count DESC
LIMIT 10")
# That was a retweet!
# what was the screen_name with the most posts in our data?
dbGetQuery(db,
"SELECT screen_name, COUNT(screen_name) AS tweet_count
FROM table_tweets_en
GROUP BY screen_name
ORDER BY tweet_count DESC
LIMIT 10")
# You can also specify the column number instead of the name
dbGetQuery(db,
"SELECT screen_name, COUNT(screen_name) AS tweet_count
FROM table_tweets_en
GROUP BY screen_name
ORDER BY 2 DESC
LIMIT 10")
# and we close the connection for now
dbDisconnect(db)
4.1.2 Querying multiple SQL tables
Now’s where we get to see the magic of SQL. It makes it really easy to combine multiple SQL tables using JOIN
. We just select columns from different tables and then specify what key should be used to match them using ON
.
The example belows shows an inner join
, but there are other types (left vs right outer joins, cross joins…) that we will not cover in this workshop.
library(DBI)
<- dbConnect(RSQLite::SQLite(), "data_tweets.sqlite")
db
# Check which tables are in there
dbListTables(db)
# We create two new tables in our database that
# id_str: Is the identifier of the single tweets! (Unique!)
# we'll join later on
<- dbGetQuery(db,
table1 "SELECT id_str, screen_name, retweet_count
FROM table_tweets_en
Limit 100000")
dbWriteTable(db, "table1", table1, overwrite = TRUE)
<- dbGetQuery(db,
table2 "SELECT id_str, screen_name, favorite_count
FROM table_tweets_en
Limit 100000")
dbWriteTable(db, "table2", table2, overwrite = TRUE)
# Check which tables are in there
dbListTables(db)
# Get column names
dbListFields(db, "table_tweets_en")
dbListFields(db, "table1")
dbListFields(db, "table2")
# Let's take the table1 and merge it with data from the table2
# The identifer variables ideally have different names but we can also specify them as below
dbGetQuery(db, "SELECT id_str, retweet_count, screen_name
FROM table1
JOIN table2
ON table1.id_str = table2.id_str
LIMIT 10")
dbGetQuery(db, "SELECT table1.id_str,
table1.retweet_count,
table1.screen_name,
table2.favorite_count
FROM table1
JOIN table2
ON table1.id_str = table2.id_str
LIMIT 10")
Once we know how to merge tables, we can use columns from both tables. For example, let’s say we want to find the tweets that got the highest retweet_count (beware we have retweets included here). Here’s how we would do it:
dbGetQuery(db, "SELECT table1.id_str,
table1.retweet_count,
table1.screen_name,
table2.favorite_count
FROM table1
JOIN table2
ON table1.id_str = table2.id_str
WHERE retweet_count > 100
ORDER BY retweet_count DESC
LIMIT 10")
4.1.3 Grouping and aggregating
SQL makes it really easy to compute aggregate statistics of one column based on values of a different column, with a series of built-in functions (COUNT, AVG, MAX, MIN…). Let’s now see a few examples of this type of queries:
# use COUNT with * to return the total number of rows
dbGetQuery(db, 'SELECT COUNT(*)
FROM table_tweets_en')
Combining everything we’ve learned, we can for example count the total number of posts by screen_name, or by screen_name and retweet_count.
dbGetQuery(db,
"SELECT retweet_count,
table1.screen_name,
COUNT(*) AS total_tweets_by_screen_name
FROM table1 JOIN table2
ON table1.id_str = table2.id_str
GROUP BY table1.screen_name
LIMIT 20")
Here’s another example. Let’s say we want to find out which user (screen_name) wrote the most tweets (and add the sum of favorites given by other users). How would we do this?
dbGetQuery(db,
"SELECT table1.screen_name,
COUNT(*) AS total_tweets,
SUM(favorite_count) AS favorite_count_total
FROM table1 JOIN table2
ON table1.id_str = table2.id_str
GROUP BY table1.screen_name
ORDER BY total_tweets DESC
LIMIT 20")
There are other functions besides just COUNT
. We can also compute averages, the maximum and minimum of a column… We can also use functions like SUBSTR
to extract parts of a character string.
# average retweet_count (sorted) and number of tweets
dbGetQuery(db,
"SELECT screen_name, SUBSTR(screen_name, 1 , 5), AVG(retweet_count)
FROM table_tweets_en
WHERE text NOT LIKE 'RT %'
GROUP BY screen_name
ORDER BY AVG(retweet_count) DESC
LIMIT 20")
# Most retweeted original tweets
dbGetQuery(db,
"SELECT screen_name, SUBSTR(text, 1 , 40), retweet_count
FROM table_tweets_en
WHERE text NOT LIKE 'RT %'
ORDER BY retweet_count DESC
LIMIT 20")