Open and explore each file with the previous questions in mind.
Database: Collection of tables connected via some value.
Table; record (= observation = row); field (= column); key; unique identifier; entry (value).
Explore all tabs. In Database Structure:
Is data redundant?
Menu:
File > New Database.
Modify Table
library(here) library(tidyverse)
Easiest is to work directly with the .csv files.
plots <- read_csv(here("plots.csv"))
species <- read_csv(here("species.csv"))
surveys <- read_csv(here("surveys.csv"))
surveys ## # A tibble: 35,549 x 9 ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <int> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with 35,539 more rows, and 1 more variable: weight <int>
If you already have a database, you can also use it.
path <- here("data-raw/portal/portal_mammals.sqlite")
db <- DBI::dbConnect(RSQLite::SQLite(), path)
Copy each table from the database.
plots_db <- tbl(db, "plots") species_db <- tbl(db, "species") surveys_db <- tbl(db, "surveys")
surveys_db ## # Source: table<surveys> [?? x 9] ## # Database: sqlite 3.22.0 ## # [C:\Users\LeporeM\Documents\Dropbox\git\carpentries\data-raw\portal\portal_mammals.sqlite] ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <dbl> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with more rows, and 1 more variable: weight <dbl>
SELECT <columns> FROM <table> SELECT * FROM surveys;
SELECT year, month, day FROM surveys;
SELECT surveys.year, surveys.month, surveys.day FROM surveys;
Good
SELECT year FROM surveys;
Bad
select year from surveys;
LIMIT and DISTINCTSELECT * FROM surveys LIMIT 10;
SELECT DISTINCT species_id FROM surveys;
+, -, *, /, and ROUND(<what?>, <digits>)SELECT year, month, day, weight / 1000 FROM surveys;
SELECT plot_id, species_id, sex, weight, ROUND(weight / 1000, 2) FROM surveys;
<INTEGER> / 1000Change weight to integer:
SELECT weight / 1000 FROM surveys
SELECT weight / 1000.00 FROM surveys
Change back
SELECT day, month, year, species_id, weight * 1000 FROM surveys;
WHERESELECT * FROM surveys WHERE species_id = 'DM';
SELECT * FROM surveys WHERE year >= 2000;
AND, OR, and INSELECT * FROM surveys WHERE (year >= 2000) AND (species_id = 'DM');
SELECT *
FROM surveys
WHERE (species_id = 'DM') OR (species_id = 'DO') OR (species_id = 'DS');
-- Same as above. BTW, I'm a comment :)
SELECT *
FROM surveys
WHERE species_id IN ('DM', 'DO', 'DS');
SELECT day, month, year, species_id, weight / 1000 FROM surveys WHERE (plot_id = 1) AND (weight > 75);
ORDER BY <column> ASC/DESCSELECT * FROM species ORDER BY taxa ASC;
SELECT * FROM species ORDER BY taxa DESC;
SELECT * FROM species -- ASC is implicit (default). ORDER BY genus, species;
SELECT year, species_id, weight / 1000 FROM surveys ORDER BY weight DESC;
SELECT year, month, day, species_id, ROUND(weight / 1000, 2) FROM surveys WHERE year = 1999 ORDER BY species_id;
“Good” style and comments make your code easier to understand.
SQL helps you select columns, filter and order rows
Pick specific rows using conditions with AND, OR and IN.
Compute on column values with arithmetic operators.
Setup
# Use packages
library(here)
library(tidyverse)
# Connect to the database
path <- here("data-raw/portal/portal_mammals.sqlite")
db <- DBI::dbConnect(RSQLite::SQLite(), path)
# Use a specific table from the database
surveys_db <- tbl(db, "surveys")
class(surveys_db)
## [1] "tbl_dbi" "tbl_sql" "tbl_lazy" "tbl"
surveys_db ## # Source: table<surveys> [?? x 9] ## # Database: sqlite 3.22.0 ## # [C:\Users\LeporeM\Documents\Dropbox\git\carpentries\data-raw\portal\portal_mammals.sqlite] ## record_id month day year plot_id species_id sex hindfoot_length ## <int> <int> <int> <int> <int> <chr> <chr> <dbl> ## 1 1 7 16 1977 2 NL M 32 ## 2 2 7 16 1977 3 NL M 33 ## 3 3 7 16 1977 2 DM F 37 ## 4 4 7 16 1977 7 DM M 36 ## 5 5 7 16 1977 3 DM M 35 ## 6 6 7 16 1977 1 PF M 14 ## 7 7 7 16 1977 2 PE F NA ## 8 8 7 16 1977 1 DM M 37 ## 9 9 7 16 1977 1 DM F 34 ## 10 10 7 16 1977 6 PF F 20 ## # ... with more rows, and 1 more variable: weight <dbl>
show_query(surveys_db) ## <SQL> ## SELECT * ## FROM `surveys`
{sql, engine = "sql", connection = "db"}
SELECT record_id, plot_id, species_id, sex FROM surveys
| record_id | plot_id | species_id | sex |
|---|---|---|---|
| 1 | 2 | NL | M |
| 2 | 3 | NL | M |
| 3 | 2 | DM | F |
| 4 | 7 | DM | M |
| 5 | 3 | DM | M |
| 6 | 1 | PF | M |
| 7 | 2 | PE | F |
| 8 | 1 | DM | M |
| 9 | 1 | DM | F |
| 10 | 6 | PF | F |
How can I summarize my data by aggregating, filtering, or ordering query results?
How can I make sure column names from my queries make sense and aren’t too long?
COUNT, MAX, MIN, and AVG SELECT COUNT(*) FROM surveys;
SELECT COUNT(*), SUM(weight) FROM surveys;
-- All animals SELECT SUM(weight), AVG(weight), MIN(weight), MAX(weight) FROM surveys; -- Only weights between 5 and 10 SELECT SUM(weight), AVG(weight), MIN(weight), MAX(weight) FROM surveys WHERE (weight > 5) AND (weight < 10);
GROUP BYSELECT species_id, COUNT(*) FROM surveys GROUP BY species_id;
Solution of 1
SELECT year, COUNT(*) FROM surveys GROUP BY year;
Solution of 2 and 3
SELECT year, species_id, COUNT(*), AVG(weight) FROM surveys GROUP BY year, species_id;
ORDER BY a summary columnSELECT species_id, COUNT(*) FROM surveys GROUP BY species_id ORDER BY COUNT(species_id);
AS SELECT MAX(year) AS last_surveyed_year FROM surveys; SELECT MAX(year) last_surveyed_year FROM surveys;
AS is optional. This also works (but isn’t “good” style)
SELECT MAX(year) last_surveyed_year FROM surveys;
GROUP BY <cols> HAVING <cond> HAVING ~ WHERE:
<columns> WHERE <condition><results> HAVING <condition>GROUP BY <cols> HAVING <cond> SELECT species_id, COUNT(species_id) FROM surveys GROUP BY species_id -- "Smells" a bit HAVING COUNT(species_id) > 10; -- Same but nicer SELECT species_id, COUNT(species_id) AS n FROM surveys GROUP BY species_id HAVING n > 10;
http://rstd.io/code-smells (Jenny Bryan)
SELECT taxa, COUNT(*) AS n FROM species GROUP BY taxa HAVING n > 10;
CREATE VIEW viewname AS SELECT * FROM surveys WHERE year = 2000 AND (month > 4 AND month < 10);
Store
CREATE VIEW summer_2000 AS SELECT * FROM surveys WHERE year = 2000 AND (month > 4 AND month < 10);
Reuse
SELECT * FROM summer_2000 WHERE species_id == 'PE';
NULL values-- 45 individuals which sex is of NULL sex SELECT sex, COUNT(*) FROM summer_2000 GROUP BY sex -- As you expect, excludes NULL SELECT COUNT(*) FROM summer_2000 -- Excludes NULL WHERE sex == 'F' -- But tally includes NULL: 366 + 382 + 45 = 793 SELECT COUNT(*) FROM summer_2000
Use MIN, MAX, AVG, SUM, COUNT, etc. to operate on aggregated data.
Use AS to create aliases.
Use HAVING to filter on aggregate properties (similar to WHERE).
Use CREATE VIEW to store a query.
surveys %>% group_by(species_id) %>% summarize(n = n()) %>% filter(n > 10) ## # A tibble: 30 x 2 ## species_id n ## <chr> <int> ## 1 AB 303 ## 2 AH 437 ## 3 BA 46 ## 4 CB 50 ## 5 CM 13 ## 6 CQ 16 ## 7 DM 10596 ## 8 DO 3027 ## 9 DS 2504 ## 10 DX 40 ## # ... with 20 more rows
count() is so common that has a shortcut
## # A tibble: 30 x 2 ## species_id n ## <chr> <int> ## 1 AB 303 ## 2 AH 437 ## 3 BA 46 ## 4 CB 50 ## 5 CM 13 ## 6 CQ 16 ## 7 DM 10596 ## 8 DO 3027 ## 9 DS 2504 ## 10 DX 40 ## # ... with 20 more rows
JOIN <table> USING <(columns)>SELECT * FROM surveys JOIN species ON surveys.species_id = species.species_id;
Same, when column names match.
SELECT * FROM surveys JOIN species -- Same column name in both tables USING (species_id);
SELECT species.genus, species.species, surveys.weight FROM surveys JOIN species ON surveys.species_id = species.species_id;
Same
SELECT genus, species, weight FROM surveys JOIN species USING (species_id)
<kind of> JOININNER JOIN = JOIN: Keeps all rows that match in both tables
SELECT * FROM surveys INNER JOIN species USING (species_id);
LEFT JOIN: Keeps all rows that match in left table
(“left” is closest to FROM)
SELECT * FROM surveys LEFT JOIN species USING (species_id);
JOINed results Left table implicit: Drives attention to surveys.weight.
SELECT plot_type, AVG(surveys.weight) FROM plots JOIN surveys USING (plot_id) GROUP BY plot_type;
Same but buries intention.
SELECT plots.plot_type, AVG(surveys.weight) FROM surveys JOIN plots ON surveys.plot_id = plots.plot_id GROUP BY plots.plot_type;
SELECT plot_id, genus, COUNT(*) AS n FROM surveys JOIN species USING (species_id) GROUP BY genus, plot_id ORDER BY plot_id ASC, n DESC;
IFNULL(<column>, <value>) NULL valuesSELECT species_id, sex, IFNULL(sex, 'U') FROM surveys;
SELECT hindfoot_length, IFNULL(hindfoot_length, 30) FROM surveys;
SELECT species_id, AVG(IFNULL(hindfoot_length,30)) FROM surveys GROUP BY species_id;
NULLIF(<column>, <value>) NULLIF is the inverse of IFNULL.
Returns NULL
If the first argument is equal to the second argument it returns NULL.
Else, it returns the original value.
SELECT species_id, plot_id, NULLIF(plot_id, 7) FROM surveys;
SELECT genus, LENGTH(genus) FROM species ORDER BY LENGTH(genus) DESC
Use JOIN <table> USING <(common column)> to combine data from two tables.
Use (INNER) JOIN to keep rows that match on both tables; or LEFT JOIN to keep all rows of the left table and the matching rows of the right table.
Use IFNULL(<column>, <value>) to “fill” NULL values; and NULLIF(<column>, <value>) to “null out” specific values.
Use other functions e.g. LENGTH(<column>) to operate on individual values.
plots %>% left_join(surveys) %>% group_by(plot_type) %>% mutate(mean_weight = mean(weight, na.rm = TRUE)) %>% select(plot_type, mean_weight) %>% unique() ## Joining, by = "plot_id" ## # A tibble: 5 x 2 ## # Groups: plot_type [5] ## plot_type mean_weight ## <chr> <dbl> ## 1 Spectab exclosure 51.6 ## 2 Control 48.6 ## 3 Long-term Krat Exclosure 27.2 ## 4 Rodent Exclosure 32.5 ## 5 Short-term Krat Exclosure 41.2