# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.0 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.3.4 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# To save time and space, limit the number of lines printed
lines <- 10
options(
  dplyr.print_min = lines, dplyr.print_max = lines, tidyverse.quiet = TRUE
)
# install.packages("readxl")
path <- "../data-raw/H1_H2_datasheet_Mauro.xlsx"
serc <- readxl::read_xlsx(path)
Any style is OK; for consistency with the rest of my work, I use this style: http://style.tidyverse.org/.
# Changing names to lowercase to reduce the chances of confusion due to case
serc <- set_names(serc, tolower)
# Changing periods to underscores
serc <- set_names(serc, str_replace(names(serc), "\\.", "_"))
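An alternative sketch, assuming the janitor package is installed (it is not used elsewhere in this document): janitor::clean_names() does both steps, lowercasing names and replacing separators with underscores, in one call.
# install.packages("janitor")
# serc <- janitor::clean_names(serc)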
Glimpse the variable names and types.
glimpse(serc)
#> Observations: 2,996
#> Variables: 20
#> $ HA <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ XP <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ YP <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ Tag <dbl> 10108, 10111, 10398, 11133, 10102, 10399, 10691, 1...
#> $ MainstemTag <dbl> 10108, 10111, 10398, 11133, 10102, 10399, 10691, 1...
#> $ Mnemonic <chr> "ULAM", "FRPE", "SANIC4", "ULRU", "ACRU", "SANIC4"...
#> $ CanopyCode <dbl> 88, 31, 84, 89, 2, 84, 84, 84, 84, 82, 84, 31, 2, ...
#> $ X <dbl> 7.0, 1.4, 9.6, 5.2, 0.4, 9.7, 8.6, 8.1, 7.8, 5.7, ...
#> $ Y <dbl> 2.7, 8.5, 1.5, 4.0, 1.7, 1.0, 0.2, 0.1, 0.5, 4.0, ...
#> $ DBH <chr> "3.4", "11.5", "1.3", "8.4", "12.3", "2", "1.8", "...
#> $ Con <chr> "LI", "LI", "LI", "LI", "LI", "LI", "LI", "LI", "L...
#> $ Codes <chr> "L", "L", "L", "L", "M", "M", "M", "M", "M", "M", ...
#> $ HOM <chr> "1.3\r", "1.3\r", "1.3\r", "1.3\r", "1.3\r", "1.3\...
#> $ Notes <chr> "SE", "West", "DC tag collected 25-Mar-2015", "NE"...
#> $ DBH.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ STATUS.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ CODES.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ NOTES.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ EXACT.DATE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ TEAM <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
Skim a summary of the variables by type.
# install.packages("skimr")
skimr::skim(serc)
#> Numeric Variables
#> # A tibble: 8 x 13
#> var type missing complete n mean sd min
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 CanopyCode numeric 0 2996 2996 33.73 21.416 2
#> 2 HA numeric 0 2996 2996 1.46 0.498 1
#> 3 MainstemTag numeric 0 2996 2996 15313.02 4990.763 10001
#> 4 Tag numeric 0 2996 2996 15355.07 4970.081 10001
#> 5 X numeric 0 2996 2996 5.02 2.878 0
#> 6 XP numeric 0 2996 2996 5.49 2.948 1
#> 7 Y numeric 0 2996 2996 5.11 2.887 0
#> 8 YP numeric 0 2996 2996 5.72 2.894 1
#> # ... with 5 more variables: `25% quantile` <dbl>, median <dbl>, `75%
#> # quantile` <dbl>, max <dbl>, hist <chr>
#>
#> Character Variables
#> # A tibble: 6 x 9
#> var type complete missing empty n min max n_unique
#> * <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Codes character 1343 1653 0 2996 1 14 127
#> 2 Con character 2996 0 0 2996 2 2 6
#> 3 DBH character 2996 0 0 2996 1 18 463
#> 4 HOM character 2996 0 0 2996 1 5 26
#> 5 Mnemonic character 2996 0 0 2996 4 6 49
#> 6 Notes character 589 2407 0 2996 2 52 213
summary(serc)
#> HA XP YP Tag
#> Min. :1.00 Min. : 1.00 Min. : 1.00 Min. :10001
#> 1st Qu.:1.00 1st Qu.: 3.00 1st Qu.: 3.00 1st Qu.:10752
#> Median :1.00 Median : 5.00 Median : 6.00 Median :11502
#> Mean :1.46 Mean : 5.49 Mean : 5.72 Mean :15355
#> 3rd Qu.:2.00 3rd Qu.: 8.00 3rd Qu.: 8.00 3rd Qu.:20669
#> Max. :2.00 Max. :10.00 Max. :10.00 Max. :21633
#> MainstemTag Mnemonic CanopyCode X
#> Min. :10001 Length:2996 Min. : 2.0 Min. : 0.00
#> 1st Qu.:10681 Class :character 1st Qu.: 22.0 1st Qu.: 2.50
#> Median :11428 Mode :character Median : 28.0 Median : 5.10
#> Mean :15313 Mean : 33.7 Mean : 5.02
#> 3rd Qu.:20654 3rd Qu.: 39.0 3rd Qu.: 7.50
#> Max. :21633 Max. :132.0 Max. :10.30
#> Y DBH Con Codes
#> Min. : 0.00 Length:2996 Length:2996 Length:2996
#> 1st Qu.: 2.60 Class :character Class :character Class :character
#> Median : 5.20 Mode :character Mode :character Mode :character
#> Mean : 5.11
#> 3rd Qu.: 7.60
#> Max. :10.00
#> HOM Notes DBH.2018 STATUS.2018
#> Length:2996 Length:2996 Mode:logical Mode:logical
#> Class :character Class :character NA's:2996 NA's:2996
#> Mode :character Mode :character
#>
#>
#>
#> CODES.2018 NOTES.2018 EXACT.DATE TEAM
#> Mode:logical Mode:logical Mode:logical Mode:logical
#> NA's:2996 NA's:2996 NA's:2996 NA's:2996
#>
#>
#>
#>
The exploratory data analysis that follows addresses the following email:
On Tue, Oct 10, 2017 at 1:27 PM, Shue, Jessica ShueJ@si.edu wrote:
(RE: What summaries of your fieldwork data you would like to have?)
In this data set, the variable that seems to uniquely identify each stem is tag. Therefore, the number of unique stems is n.
n <- length(unique(serc$tag))
n
#> [1] 2996
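A small equivalent sketch with dplyr (already attached via the tidyverse) uses n_distinct():
# Same count of unique stems, written with dplyr
n_distinct(serc$tag)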
Unfortunately, this data set seems to have no data on previous censuses, but we can still summarize and explore the distribution of DBH values to check for outliers.
# DBH should be numeric
str(serc$dbh)
#> chr [1:2996] "3.4" "11.5" "1.3" "8.4" "12.3" "2" "1.8" ...
serc <- mutate(serc, dbh = as.numeric(dbh))
#> Warning in evalq(as.numeric(dbh), <environment>): NAs introduced by
#> coercion
str(serc$dbh)
#> num [1:2996] 3.4 11.5 1.3 8.4 12.3 2 1.8 2.3 3 1.9 ...
summary(serc$dbh)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.9 1.9 4.3 10.3 10.3 113.1 315
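The coercion warning above means that some raw DBH strings are not plain numbers. A sketch to inspect them, re-reading the raw sheet because serc$dbh has already been overwritten:
# List the raw DBH strings that do not parse as numbers
raw_dbh <- readxl::read_xlsx(path)$DBH
unique(raw_dbh[is.na(suppressWarnings(as.numeric(raw_dbh)))])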
# Create a reference data set
mean_plus_two_sd <- mean(serc$dbh, na.rm = TRUE) +
  sd(serc$dbh, na.rm = TRUE) * 2
refs <- tribble(
  ~reference, ~value,
  "common", mean(serc$dbh, na.rm = TRUE),
  "extreme", mean_plus_two_sd
)
ggplot(serc, aes(x = dbh)) +
  geom_histogram(size = 3) +
  geom_vline(data = refs, aes(xintercept = value, colour = reference)) +
  geom_text(
    data = refs,
    aes(x = value, y = 0, label = round(value), colour = reference),
    hjust = 1
  ) +
  scale_colour_manual(values = c("blue", "red"))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 315 rows containing non-finite values (stat_bin).
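Because dbh is strongly right-skewed, a log scale can make the distribution easier to inspect. A sketch of the same histogram on a log10 axis:
ggplot(serc, aes(x = dbh)) +
  geom_histogram(bins = 30) +
  scale_x_log10()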
These data have dbh values greater than the mean plus two standard deviations.
extreme_data <- filter(serc, dbh > mean_plus_two_sd)
extreme_data
#> # A tibble: 164 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 5 10135 10135 UNKN 99 10.0 6.0
#> 2 1 1 5 10127 10127 QUMI 67 4.0 5.5
#> 3 1 1 7 10172 10172 QUMI 67 1.9 4.8
#> 4 1 1 7 10180 10180 LIST2 40 4.2 9.1
#> 5 1 1 8 10193 10193 FAGR 28 10.0 0.5
#> 6 1 1 9 10200 10200 FAGR 28 0.7 9.4
#> 7 1 2 7 10069 10069 UNKN 99 4.9 6.3
#> 8 1 2 8 10086 10086 LIST2 40 10.0 5.4
#> 9 1 3 2 10210 10210 LIST2 40 7.3 8.0
#> 10 1 3 3 10212 10212 ACRU 2 7.9 3.9
#> # ... with 154 more rows, and 11 more variables: dbh <dbl>, con <chr>,
#> # codes <chr>, hom <chr>, notes <chr>, dbh_2018 <lgl>,
#> # status_2018 <lgl>, codes_2018 <lgl>, notes_2018 <lgl>,
#> # exact_date <lgl>, team <lgl>
This saves the extreme values for re-checking.
write_csv(extreme_data, "extreme_data.csv")
Living status after being dead in the previous census; good to make a note of this prior error to correct in the database.
In this data set, the variable giving status has all missing values.
unique(
  select(serc, matches("status"))
)
#> # A tibble: 1 x 1
#> status_2018
#> <lgl>
#> 1 NA
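More generally, a quick sketch that counts the missing values in every column confirms which variables are empty:
# Number of NAs per column
summarise_all(serc, function(x) sum(is.na(x)))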
Let’s demonstrate this issue on a toy data set.
toy <- tibble(
  dbh = sample(1:80, 100, replace = TRUE),
  tag = rep(1:50, 2),
  census = c(rep(1, 50), rep(2, 50)),
  status = sample(c("alive", "dead"), 100, replace = TRUE)
)
toy
#> # A tibble: 100 x 4
#> dbh tag census status
#> <int> <int> <dbl> <chr>
#> 1 7 1 1 dead
#> 2 67 2 1 dead
#> 3 49 3 1 alive
#> 4 13 4 1 alive
#> 5 1 5 1 alive
#> 6 38 6 1 dead
#> 7 40 7 1 dead
#> 8 24 8 1 dead
#> 9 59 9 1 dead
#> 10 62 10 1 dead
#> # ... with 90 more rows
Let’s start by sorting (arranging) the data first by tag, then census, then status.
toy <- arrange(toy, tag, census, status)
Now we are ready to flag zombie stems.
grouped <- group_by(toy, tag)
zombie_sequence <- "dead->alive"
flagged <- mutate(
  grouped,
  status_sequence = paste0(status, collapse = "->"),
  # grepl() already returns TRUE/FALSE, so no ifelse() is needed
  is_zombie = grepl(zombie_sequence, status_sequence)
)
flagged
#> # A tibble: 100 x 6
#> # Groups: tag [50]
#> dbh tag census status status_sequence is_zombie
#> <int> <int> <dbl> <chr> <chr> <lgl>
#> 1 7 1 1 dead dead->dead FALSE
#> 2 37 1 2 dead dead->dead FALSE
#> 3 67 2 1 dead dead->alive TRUE
#> 4 26 2 2 alive dead->alive TRUE
#> 5 49 3 1 alive alive->dead FALSE
#> 6 14 3 2 dead alive->dead FALSE
#> 7 13 4 1 alive alive->alive FALSE
#> 8 43 4 2 alive alive->alive FALSE
#> 9 1 5 1 alive alive->alive FALSE
#> 10 40 5 2 alive alive->alive FALSE
#> # ... with 90 more rows
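An alternative sketch detects the dead-to-alive transition directly with dplyr::lag(), comparing each census with the previous one within a tag (sorted and flagged2 are just illustrative names):
sorted <- arrange(toy, tag, census)
flagged2 <- mutate(
  group_by(sorted, tag),
  # TRUE if the stem is alive in a census immediately after being dead
  is_zombie = any(status == "alive" & dplyr::lag(status) == "dead", na.rm = TRUE)
)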
We can extract the tags of the zombie stems.
zombie_data <- filter(flagged, is_zombie)
zombie_tags <- unique(
  pull(zombie_data, tag)
)
zombie_tags
#> [1] 2 6 8 9 18 19 22 25 28 32 34 36 37 40 42 43 49
To confirm, we can filter the flagged data with zombie_tags.
filter(flagged, tag %in% zombie_tags)
#> # A tibble: 34 x 6
#> # Groups: tag [17]
#> dbh tag census status status_sequence is_zombie
#> <int> <int> <dbl> <chr> <chr> <lgl>
#> 1 67 2 1 dead dead->alive TRUE
#> 2 26 2 2 alive dead->alive TRUE
#> 3 38 6 1 dead dead->alive TRUE
#> 4 63 6 2 alive dead->alive TRUE
#> 5 24 8 1 dead dead->alive TRUE
#> 6 58 8 2 alive dead->alive TRUE
#> 7 59 9 1 dead dead->alive TRUE
#> 8 6 9 2 alive dead->alive TRUE
#> 9 6 18 1 dead dead->alive TRUE
#> 10 44 18 2 alive dead->alive TRUE
#> # ... with 24 more rows
In this data set, the time-aware variable has only missing values.
unique(
  select(serc, matches("date"))
)
#> # A tibble: 1 x 1
#> exact_date
#> <lgl>
#> 1 NA
Let’s demonstrate this analysis with a toy data set.
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
some_recent_dates <- sample(seq(today() - 60, today(), 1), 100, replace = TRUE)
toy <- mutate(toy, exact_date = some_recent_dates)
toy
#> # A tibble: 100 x 5
#> dbh tag census status exact_date
#> <int> <int> <dbl> <chr> <date>
#> 1 7 1 1 dead 2017-10-07
#> 2 37 1 2 dead 2017-11-12
#> 3 67 2 1 dead 2017-09-29
#> 4 26 2 2 alive 2017-09-17
#> 5 49 3 1 alive 2017-09-24
#> 6 14 3 2 dead 2017-11-07
#> 7 13 4 1 alive 2017-09-29
#> 8 43 4 2 alive 2017-09-28
#> 9 1 5 1 alive 2017-09-18
#> 10 40 5 2 alive 2017-11-12
#> # ... with 90 more rows
data_from_this_week <- filter(toy, exact_date > today() - 7)
decreasing_dbh <- arrange(data_from_this_week, desc(dbh))
# Top 3 largest
head(decreasing_dbh, 3)
#> # A tibble: 3 x 5
#> dbh tag census status exact_date
#> <int> <int> <dbl> <chr> <date>
#> 1 74 36 2 alive 2017-11-12
#> 2 68 44 2 dead 2017-11-09
#> 3 62 46 1 alive 2017-11-10
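A related fieldwork summary (a sketch on the same toy data): the number of stems measured on each day of the last week.
count(data_from_this_week, exact_date, sort = TRUE)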
This data set seems not to have trees with multiple stems.
matching_tag <- select(serc, matches("tag"))
lapply(matching_tag, function(x) length(unique(x)))
#> $tag
#> [1] 2996
#>
#> $mainstemtag
#> [1] 2996
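The same check written with dplyr (a sketch): the number of distinct values in each tag-like column, which equals the number of rows only if every value is unique.
summarise_all(matching_tag, n_distinct)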
Let’s demonstrate this issue with an example data set from bci. We start by preparing the data set.
# Lowercasing names to avoid confusion due to case
bci_stems <- set_names(qcr::bci12s7mini, tolower)
# Rearrange and sort useful variables
bci_stems <- arrange(
  select(bci_stems, tag, stemid, matches("id"), everything()),
  tag, stemid
)
bci_stems
#> # A tibble: 2,165 x 20
#> tag stemid treeid measureid censusid stemtag sp quadrat gx
#> <chr> <int> <int> <int> <int> <chr> <chr> <chr> <dbl>
#> 1 000851 1 858 766 171 apeime 4402 899
#> 2 001080 1 1083 NA NA <NA> alchco 4314 873
#> 3 001080 2 1083 NA NA <NA> alchco 4314 873
#> 4 001080 3 1083 NA NA <NA> alchco 4314 873
#> 5 001080 4 1083 NA NA <NA> alchco 4314 873
#> 6 001080 5 1083 NA NA <NA> alchco 4314 873
#> 7 001080 6 1083 NA NA <NA> alchco 4314 873
#> 8 001080 7 1083 NA NA <NA> alchco 4314 873
#> 9 001080 8 1083 NA NA <NA> alchco 4314 873
#> 10 001080 9 1083 NA NA <NA> alchco 4314 873
#> # ... with 2,155 more rows, and 11 more variables: gy <dbl>, dbh <dbl>,
#> # pom <chr>, hom <dbl>, exactdate <chr>, dfstatus <chr>, codes <chr>,
#> # countpom <dbl>, date <dbl>, status <chr>, agb <dbl>
We can see that, within a tree, the same tag is repeated for each stem (each stem has its own stemid). So, to know the number of stems per tree, we can count the number of rows per tag.
count(bci_stems, tag, sort = TRUE)
#> # A tibble: 1,000 x 2
#> tag n
#> <chr> <int>
#> 1 123563 27
#> 2 111351 21
#> 3 230602 21
#> 4 253807 16
#> 5 162652 15
#> 6 249474 15
#> 7 041253 14
#> 8 127430 14
#> 9 138737 14
#> 10 203957 14
#> # ... with 990 more rows
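To summarize how multi-stemmed these trees are, a sketch of the distribution of stem counts per tag:
stems_per_tree <- count(bci_stems, tag)
summary(stems_per_tree$n)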
It seems that the species names are coded in the variable mnemonic. We can compare the codes in the data set with a table of codes that we know is correct. Suppose we have a table of valid codes:
valid_codes <- unique(select(serc, mnemonic))
valid_codes
#> # A tibble: 49 x 1
#> mnemonic
#> <chr>
#> 1 ULAM
#> 2 FRPE
#> 3 SANIC4
#> 4 ULRU
#> 5 ACRU
#> 6 ROMU
#> 7 LIST2
#> 8 PAQU2
#> 9 LOJA
#> 10 CEOC2
#> # ... with 39 more rows
For demonstration, let’s insert wrong codes in a few places.
# Inserting 5 wrong codes
wrong_serc <- serc
insert_wrong <- sample(seq_along(wrong_serc$mnemonic), 5)
wrong_serc$mnemonic[insert_wrong] <- "WRONG"
This keeps only the valid codes.
filter(wrong_serc, mnemonic %in% valid_codes$mnemonic)
#> # A tibble: 2,991 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 1 10108 10108 ULAM 88 7.0 2.7
#> 2 1 1 1 10111 10111 FRPE 31 1.4 8.5
#> 3 1 1 1 10398 10398 SANIC4 84 9.6 1.5
#> 4 1 1 1 11133 11133 ULRU 89 5.2 4.0
#> 5 1 1 1 10102 10102 ACRU 2 0.4 1.7
#> 6 1 1 1 10399 10399 SANIC4 84 9.7 1.0
#> 7 1 1 1 10691 10691 SANIC4 84 8.6 0.2
#> 8 1 1 1 10696 10696 SANIC4 84 8.1 0.1
#> 9 1 1 1 10699 10699 SANIC4 84 7.8 0.5
#> 10 1 1 1 10797 10797 ROMU 82 5.7 4.0
#> # ... with 2,981 more rows, and 11 more variables: dbh <dbl>, con <chr>,
#> # codes <chr>, hom <chr>, notes <chr>, dbh_2018 <lgl>,
#> # status_2018 <lgl>, codes_2018 <lgl>, notes_2018 <lgl>,
#> # exact_date <lgl>, team <lgl>
This finds the wrong codes.
# Notice the negation: !
filter(wrong_serc, !mnemonic %in% valid_codes$mnemonic)
#> # A tibble: 5 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 8 10194 10195 WRONG 28 9.4 7.8
#> 2 1 2 2 10292 10292 WRONG 31 9.2 1.1
#> 3 1 4 7 10357 10357 WRONG 94 0.0 7.7
#> 4 1 8 2 10747 10747 WRONG 37 0.4 1.9
#> 5 2 2 1 20106 20106 WRONG 28 7.8 7.6
#> # ... with 11 more variables: dbh <dbl>, con <chr>, codes <chr>,
#> # hom <chr>, notes <chr>, dbh_2018 <lgl>, status_2018 <lgl>,
#> # codes_2018 <lgl>, notes_2018 <lgl>, exact_date <lgl>, team <lgl>
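An equivalent sketch uses a filtering join: anti_join() keeps the rows of wrong_serc whose mnemonic has no match in valid_codes.
anti_join(wrong_serc, valid_codes, by = "mnemonic")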
Let’s calculate useful summaries for each species; in particular, let’s calculate what counts as an extreme value for each species, here defined as the mean plus two standard deviations.
# Cleaning the data by removing dbh values equal to NA
valid_dbh <- filter(serc, !is.na(dbh))
grouped <- group_by(valid_dbh, mnemonic)
summarized <- summarise(
  grouped,
  max = max(dbh, na.rm = TRUE),
  min = min(dbh, na.rm = TRUE),
  mean = mean(dbh, na.rm = TRUE),
  sd = sd(dbh, na.rm = TRUE),
  extreme = mean + 2 * sd
)
summarized
#> # A tibble: 45 x 6
#> mnemonic max min mean sd extreme
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ACRU 55.3 0.9 16.36 12.56 41.49
#> 2 AMAR3 4.9 1.5 3.20 2.40 8.01
#> 3 ARSP2 2.8 2.8 2.80 NaN NaN
#> 4 CAAL27 82.4 4.8 29.86 15.15 60.16
#> 5 CACA18 21.9 0.9 4.67 3.31 11.28
#> 6 CAGL8 98.0 2.5 33.05 27.28 87.60
#> 7 CARA2 2.2 2.2 2.20 NaN NaN
#> 8 CARYA 33.4 33.4 33.40 NaN NaN
#> 9 CEOC2 18.3 1.9 5.79 5.39 16.57
#> 10 COFL2 16.6 2.4 6.75 3.09 12.93
#> # ... with 35 more rows
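These per-species thresholds can be joined back onto the data to flag stems whose dbh exceeds the extreme value for their species (a sketch; thresholds and flagged_extremes are just illustrative names):
thresholds <- select(summarized, mnemonic, extreme)
flagged_extremes <- filter(
  left_join(valid_dbh, thresholds, by = "mnemonic"),
  dbh > extreme
)
flagged_extremes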
And now let’s plot the dbh values of the (cleaned) data and then overlay the extreme value of each species.
extreme <- rename(summarized, dbh = extreme)
ggplot(valid_dbh, aes(x = dbh, y = fct_reorder(mnemonic, dbh))) +
  geom_point(size = 1) +
  geom_point(
    data = extreme,
    aes(x = dbh),
    size = 1, colour = "red", shape = 19
  ) +
  labs(y = NULL)
#> Warning: Removed 7 rows containing missing values (geom_point).
Is this different from the section “Check for code errors of species names”?