# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.0 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.3.4 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# To save time and space, limit the number of lines printed
lines <- 10
options(
dplyr.print_min = lines, dplyr.print_max = lines, tidyverse.quiet = TRUE
)
# install.packages("readxl")
path <- "../data-raw/H1_H2_datasheet_Mauro.xlsx"
serc <- readxl::read_xlsx(path)
Any style is OK; For consistence with all of my work, I use this style: http://style.tidyverse.org/.
# Changing names to lowercase to reduce the chances of confusion due to case
serc <- set_names(serc, tolower)
# Changing periods to underscore
serc <- set_names(serc, str_replace(names(serc), "\\.", "_"))
Glimpse variables names and type.
glimpse(serc)
#> Observations: 2,996
#> Variables: 20
#> $ HA <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ XP <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ YP <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
#> $ Tag <dbl> 10108, 10111, 10398, 11133, 10102, 10399, 10691, 1...
#> $ MainstemTag <dbl> 10108, 10111, 10398, 11133, 10102, 10399, 10691, 1...
#> $ Mnemonic <chr> "ULAM", "FRPE", "SANIC4", "ULRU", "ACRU", "SANIC4"...
#> $ CanopyCode <dbl> 88, 31, 84, 89, 2, 84, 84, 84, 84, 82, 84, 31, 2, ...
#> $ X <dbl> 7.0, 1.4, 9.6, 5.2, 0.4, 9.7, 8.6, 8.1, 7.8, 5.7, ...
#> $ Y <dbl> 2.7, 8.5, 1.5, 4.0, 1.7, 1.0, 0.2, 0.1, 0.5, 4.0, ...
#> $ DBH <chr> "3.4", "11.5", "1.3", "8.4", "12.3", "2", "1.8", "...
#> $ Con <chr> "LI", "LI", "LI", "LI", "LI", "LI", "LI", "LI", "L...
#> $ Codes <chr> "L", "L", "L", "L", "M", "M", "M", "M", "M", "M", ...
#> $ HOM <chr> "1.3\r", "1.3\r", "1.3\r", "1.3\r", "1.3\r", "1.3\...
#> $ Notes <chr> "SE", "West", "DC tag collected 25-Mar-2015", "NE"...
#> $ DBH.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ STATUS.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ CODES.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ NOTES.2018 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ EXACT.DATE <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
#> $ TEAM <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
Skim variables’ summary by type.
# install.packages("skimr")
skimr::skim(serc)
#> Numeric Variables
#> # A tibble: 8 x 13
#> var type missing complete n mean sd min
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 CanopyCode numeric 0 2996 2996 33.73 21.416 2
#> 2 HA numeric 0 2996 2996 1.46 0.498 1
#> 3 MainstemTag numeric 0 2996 2996 15313.02 4990.763 10001
#> 4 Tag numeric 0 2996 2996 15355.07 4970.081 10001
#> 5 X numeric 0 2996 2996 5.02 2.878 0
#> 6 XP numeric 0 2996 2996 5.49 2.948 1
#> 7 Y numeric 0 2996 2996 5.11 2.887 0
#> 8 YP numeric 0 2996 2996 5.72 2.894 1
#> # ... with 5 more variables: `25% quantile` <dbl>, median <dbl>, `75%
#> # quantile` <dbl>, max <dbl>, hist <chr>
#>
#> Character Variables
#> # A tibble: 6 x 9
#> var type complete missing empty n min max n_unique
#> * <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Codes character 1343 1653 0 2996 1 14 127
#> 2 Con character 2996 0 0 2996 2 2 6
#> 3 DBH character 2996 0 0 2996 1 18 463
#> 4 HOM character 2996 0 0 2996 1 5 26
#> 5 Mnemonic character 2996 0 0 2996 4 6 49
#> 6 Notes character 589 2407 0 2996 2 52 213
summary(serc)
#> HA XP YP Tag
#> Min. :1.00 Min. : 1.00 Min. : 1.00 Min. :10001
#> 1st Qu.:1.00 1st Qu.: 3.00 1st Qu.: 3.00 1st Qu.:10752
#> Median :1.00 Median : 5.00 Median : 6.00 Median :11502
#> Mean :1.46 Mean : 5.49 Mean : 5.72 Mean :15355
#> 3rd Qu.:2.00 3rd Qu.: 8.00 3rd Qu.: 8.00 3rd Qu.:20669
#> Max. :2.00 Max. :10.00 Max. :10.00 Max. :21633
#> MainstemTag Mnemonic CanopyCode X
#> Min. :10001 Length:2996 Min. : 2.0 Min. : 0.00
#> 1st Qu.:10681 Class :character 1st Qu.: 22.0 1st Qu.: 2.50
#> Median :11428 Mode :character Median : 28.0 Median : 5.10
#> Mean :15313 Mean : 33.7 Mean : 5.02
#> 3rd Qu.:20654 3rd Qu.: 39.0 3rd Qu.: 7.50
#> Max. :21633 Max. :132.0 Max. :10.30
#> Y DBH Con Codes
#> Min. : 0.00 Length:2996 Length:2996 Length:2996
#> 1st Qu.: 2.60 Class :character Class :character Class :character
#> Median : 5.20 Mode :character Mode :character Mode :character
#> Mean : 5.11
#> 3rd Qu.: 7.60
#> Max. :10.00
#> HOM Notes DBH.2018 STATUS.2018
#> Length:2996 Length:2996 Mode:logical Mode:logical
#> Class :character Class :character NA's:2996 NA's:2996
#> Mode :character Mode :character
#>
#>
#>
#> CODES.2018 NOTES.2018 EXACT.DATE TEAM
#> Mode:logical Mode:logical Mode:logical Mode:logical
#> NA's:2996 NA's:2996 NA's:2996 NA's:2996
#>
#>
#>
#>
The exploratory data analysis that follows addresses the following email:
On Tue, Oct 10, 2017 at 1:27 PM, Shue, Jessica ShueJ@si.edu wrote:
(RE: What summaries of your fieldwork data you would like to have?)
In this data set the variable that seems to uniquely identify each stem is Tag
. Therefore, the number of unique stems is n
.
n <- length(unique(serc$tag))
n
#> [1] 2996
Unfortunately this data set seems to have no data on previous censuses. But we can still summarize and explore the distribution of DBH
values, to check for outliers.
# DBH should be numeric
str(serc$dbh)
#> chr [1:2996] "3.4" "11.5" "1.3" "8.4" "12.3" "2" "1.8" ...
serc <- mutate(serc, dbh = as.numeric(dbh))
#> Warning in evalq(as.numeric(dbh), <environment>): NAs introduced by
#> coercion
str(serc$dbh)
#> num [1:2996] 3.4 11.5 1.3 8.4 12.3 2 1.8 2.3 3 1.9 ...
summary(serc$dbh)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.9 1.9 4.3 10.3 10.3 113.1 315
# Create a reference data set
mean_plus_two_sd <- mean(serc$dbh, na.rm = TRUE) +
sd(serc$dbh, na.rm = TRUE) * 2
refs <- tribble(
~reference, ~value,
"common", mean(serc$dbh, na.rm = TRUE),
"extreeme", mean_plus_two_sd
)
ggplot(serc, aes(x = dbh)) +
geom_histogram(size = 3) +
geom_vline(data = refs, aes(xintercept = value, colour = reference)) +
geom_text(
data = refs,
aes(x = value, y = 0, label = round(value), colour = reference),
hjust = 1
) +
scale_colour_manual(values = c("blue", "red"))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> Warning: Removed 315 rows containing non-finite values (stat_bin).
These data has dbh values greater than the mean plus two standard deviations.
extreeme_data <- filter(serc, dbh > mean_plus_two_sd)
extreeme_data
#> # A tibble: 164 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 5 10135 10135 UNKN 99 10.0 6.0
#> 2 1 1 5 10127 10127 QUMI 67 4.0 5.5
#> 3 1 1 7 10172 10172 QUMI 67 1.9 4.8
#> 4 1 1 7 10180 10180 LIST2 40 4.2 9.1
#> 5 1 1 8 10193 10193 FAGR 28 10.0 0.5
#> 6 1 1 9 10200 10200 FAGR 28 0.7 9.4
#> 7 1 2 7 10069 10069 UNKN 99 4.9 6.3
#> 8 1 2 8 10086 10086 LIST2 40 10.0 5.4
#> 9 1 3 2 10210 10210 LIST2 40 7.3 8.0
#> 10 1 3 3 10212 10212 ACRU 2 7.9 3.9
#> # ... with 154 more rows, and 11 more variables: dbh <dbl>, con <chr>,
#> # codes <chr>, hom <chr>, notes <chr>, dbh_2018 <lgl>,
#> # status_2018 <lgl>, codes_2018 <lgl>, notes_2018 <lgl>,
#> # exact_date <lgl>, team <lgl>
This saves the extreme for re-check.
write_csv(extreeme_data, "extreeme_data.csv")
Living status after being Dead previous census; good to make a note of prior error to correct in database
In this data set, the variable giving status has all missing values.
unique(
select(serc, matches("status"))
)
#> # A tibble: 1 x 1
#> status_2018
#> <lgl>
#> 1 NA
Let’s demonstrate this issue on a toy data set.
toy <- tibble(
dbh = sample(1:80, 100, replace = TRUE),
tag = rep(1:50, 2),
census = c(rep(1, 50), rep(2, 50)),
status = sample(c("alive", "dead"), 100, replace = TRUE)
)
toy
#> # A tibble: 100 x 4
#> dbh tag census status
#> <int> <int> <dbl> <chr>
#> 1 7 1 1 dead
#> 2 67 2 1 dead
#> 3 49 3 1 alive
#> 4 13 4 1 alive
#> 5 1 5 1 alive
#> 6 38 6 1 dead
#> 7 40 7 1 dead
#> 8 24 8 1 dead
#> 9 59 9 1 dead
#> 10 62 10 1 dead
#> # ... with 90 more rows
Let’s start by sorting (arranging) the data first by tag, then census, then status
toy <- arrange(toy, tag, census, status)
Now we are ready to flag zombie stems.
grouped <- group_by(toy, tag)
zombie_sequence <- "dead->alive"
flagged <- mutate(
grouped,
status_sequence = paste0(status, collapse = "->"),
is_zombie = ifelse(grepl(zombie_sequence, status_sequence), TRUE, FALSE)
)
flagged
#> # A tibble: 100 x 6
#> # Groups: tag [50]
#> dbh tag census status status_sequence is_zombie
#> <int> <int> <dbl> <chr> <chr> <lgl>
#> 1 7 1 1 dead dead->dead FALSE
#> 2 37 1 2 dead dead->dead FALSE
#> 3 67 2 1 dead dead->alive TRUE
#> 4 26 2 2 alive dead->alive TRUE
#> 5 49 3 1 alive alive->dead FALSE
#> 6 14 3 2 dead alive->dead FALSE
#> 7 13 4 1 alive alive->alive FALSE
#> 8 43 4 2 alive alive->alive FALSE
#> 9 1 5 1 alive alive->alive FALSE
#> 10 40 5 2 alive alive->alive FALSE
#> # ... with 90 more rows
We can search the tag of zombie stems.
zombi_data <- filter(flagged, is_zombie)
zombi_tags <- unique(
pull(zombi_data, tag)
)
zombi_tags
#> [1] 2 6 8 9 18 19 22 25 28 32 34 36 37 40 42 43 49
To confirm, we can filer the flagged data with zombi_tags
.
filter(flagged, tag %in% zombi_tags)
#> # A tibble: 34 x 6
#> # Groups: tag [17]
#> dbh tag census status status_sequence is_zomby
#> <int> <int> <dbl> <chr> <chr> <lgl>
#> 1 67 2 1 dead dead->alive TRUE
#> 2 26 2 2 alive dead->alive TRUE
#> 3 38 6 1 dead dead->alive TRUE
#> 4 63 6 2 alive dead->alive TRUE
#> 5 24 8 1 dead dead->alive TRUE
#> 6 58 8 2 alive dead->alive TRUE
#> 7 59 9 1 dead dead->alive TRUE
#> 8 6 9 2 alive dead->alive TRUE
#> 9 6 18 1 dead dead->alive TRUE
#> 10 44 18 2 alive dead->alive TRUE
#> # ... with 24 more rows
In this data set, the time-aware variable has only missing values.
unique(
select(serc, matches("date"))
)
#> # A tibble: 1 x 1
#> exact_date
#> <lgl>
#> 1 NA
Let’s demonstrate this analysis with a toy data set.
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
some_recent_dates <- sample(seq(today() - 60, today(), 1), 100, replace = TRUE)
toy <- mutate(toy, exact_date = some_recent_dates)
toy
#> # A tibble: 100 x 5
#> dbh tag census status exact_date
#> <int> <int> <dbl> <chr> <date>
#> 1 7 1 1 dead 2017-10-07
#> 2 37 1 2 dead 2017-11-12
#> 3 67 2 1 dead 2017-09-29
#> 4 26 2 2 alive 2017-09-17
#> 5 49 3 1 alive 2017-09-24
#> 6 14 3 2 dead 2017-11-07
#> 7 13 4 1 alive 2017-09-29
#> 8 43 4 2 alive 2017-09-28
#> 9 1 5 1 alive 2017-09-18
#> 10 40 5 2 alive 2017-11-12
#> # ... with 90 more rows
data_from_this_week <- filter(toy, exact_date > today() - 7)
decreasing_dbh <- arrange(data_from_this_week, desc(dbh))
# Top largest
head(decreasing_dbh, 3)
#> # A tibble: 3 x 5
#> dbh tag census status exact_date
#> <int> <int> <dbl> <chr> <date>
#> 1 74 36 2 alive 2017-11-12
#> 2 68 44 2 dead 2017-11-09
#> 3 62 46 1 alive 2017-11-10
These data set seems not to have trees with multiple stems.
matching_tag <- select(serc, matches("tag"))
lapply(matching_tag, function(x) unique(length(x)))
#> $tag
#> [1] 2996
#>
#> $mainstemtag
#> [1] 2996
Let’s demonstrate this issue with an example data set from bci. We start by preparing the data set.
# Lowering names to avoid confusions due to case
bci_stems <- set_names(qcr::bci12s7mini, tolower)
# Rearrange and sort useful variables
bci_tems <- arrange(
select(bci_stems, tag, stemid, matches("id"), everything()),
tag, stemid
)
bci_tems
#> # A tibble: 2,165 x 20
#> tag stemid treeid measureid censusid stemtag sp quadrat gx
#> <chr> <int> <int> <int> <int> <chr> <chr> <chr> <dbl>
#> 1 000851 1 858 766 171 apeime 4402 899
#> 2 001080 1 1083 NA NA <NA> alchco 4314 873
#> 3 001080 2 1083 NA NA <NA> alchco 4314 873
#> 4 001080 3 1083 NA NA <NA> alchco 4314 873
#> 5 001080 4 1083 NA NA <NA> alchco 4314 873
#> 6 001080 5 1083 NA NA <NA> alchco 4314 873
#> 7 001080 6 1083 NA NA <NA> alchco 4314 873
#> 8 001080 7 1083 NA NA <NA> alchco 4314 873
#> 9 001080 8 1083 NA NA <NA> alchco 4314 873
#> 10 001080 9 1083 NA NA <NA> alchco 4314 873
#> # ... with 2,155 more rows, and 11 more variables: gy <dbl>, dbh <dbl>,
#> # pom <chr>, hom <dbl>, exactdate <chr>, dfstatus <chr>, codes <chr>,
#> # countpom <dbl>, date <dbl>, status <chr>, agb <dbl>
We can see that tag
is repeated for each unique stemid
. So, to know that number of stems per tree, we can count the number of unique tag
s.
count(bci_stems, tag, sort = TRUE)
#> # A tibble: 1,000 x 2
#> tag n
#> <chr> <int>
#> 1 123563 27
#> 2 111351 21
#> 3 230602 21
#> 4 253807 16
#> 5 162652 15
#> 6 249474 15
#> 7 041253 14
#> 8 127430 14
#> 9 138737 14
#> 10 203957 14
#> # ... with 990 more rows
It seems that the species names are coded in the variable mnemonic
. We can compare the codes in the data set with a table of codes that we know is correct. Suppose we have a table of valid codes:
valid_codes <- unique(select(serc, mnemonic))
valid_codes
#> # A tibble: 49 x 1
#> mnemonic
#> <chr>
#> 1 ULAM
#> 2 FRPE
#> 3 SANIC4
#> 4 ULRU
#> 5 ACRU
#> 6 ROMU
#> 7 LIST2
#> 8 PAQU2
#> 9 LOJA
#> 10 CEOC2
#> # ... with 39 more rows
For demonstration, lets insert wrong codes in a few places.
is_wrong <- sample(c(TRUE, FALSE), nrow(serc), replace = TRUE)
# Inserting 5 wrong codes
wrong_serc <- serc
insert_wrong <- sample(seq_along(wrong_serc$mnemonic), 5)
wrong_serc$mnemonic[insert_wrong] <- "WRONG"
This keeps only the valid codes.
filter(wrong_serc, mnemonic %in% valid_codes$mnemonic)
#> # A tibble: 2,991 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 1 10108 10108 ULAM 88 7.0 2.7
#> 2 1 1 1 10111 10111 FRPE 31 1.4 8.5
#> 3 1 1 1 10398 10398 SANIC4 84 9.6 1.5
#> 4 1 1 1 11133 11133 ULRU 89 5.2 4.0
#> 5 1 1 1 10102 10102 ACRU 2 0.4 1.7
#> 6 1 1 1 10399 10399 SANIC4 84 9.7 1.0
#> 7 1 1 1 10691 10691 SANIC4 84 8.6 0.2
#> 8 1 1 1 10696 10696 SANIC4 84 8.1 0.1
#> 9 1 1 1 10699 10699 SANIC4 84 7.8 0.5
#> 10 1 1 1 10797 10797 ROMU 82 5.7 4.0
#> # ... with 2,981 more rows, and 11 more variables: dbh <dbl>, con <chr>,
#> # codes <chr>, hom <chr>, notes <chr>, dbh_2018 <lgl>,
#> # status_2018 <lgl>, codes_2018 <lgl>, notes_2018 <lgl>,
#> # exact_date <lgl>, team <lgl>
This finds the wrong codes.
# Notice the negation: !
filter(wrong_serc, !mnemonic %in% valid_codes$mnemonic)
#> # A tibble: 5 x 20
#> ha xp yp tag mainstemtag mnemonic canopycode x y
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1 8 10194 10195 WRONG 28 9.4 7.8
#> 2 1 2 2 10292 10292 WRONG 31 9.2 1.1
#> 3 1 4 7 10357 10357 WRONG 94 0.0 7.7
#> 4 1 8 2 10747 10747 WRONG 37 0.4 1.9
#> 5 2 2 1 20106 20106 WRONG 28 7.8 7.6
#> # ... with 11 more variables: dbh <dbl>, con <chr>, codes <chr>,
#> # hom <chr>, notes <chr>, dbh_2018 <lgl>, status_2018 <lgl>,
#> # codes_2018 <lgl>, notes_2018 <lgl>, exact_date <lgl>, team <lgl>
Lets calculate useful summaries for each species; particularly, let’s calculate for each species what an extreme value is – here defined as the mean value plus 2 times the standard deviation.
# Cleaning the data by removing dbh values equal to NA
valid_dbh <- filter(serc, !is.na(dbh))
grouped <- group_by(valid_dbh, mnemonic)
summarized <- summarise(
grouped,
max = max(dbh, na.rm = TRUE),
min = min(dbh, na.rm = TRUE),
mean = mean(dbh, na.rm = TRUE),
sd = sd(dbh, na.rm = TRUE),
extreeme = mean + 2 * sd
)
summarized
#> # A tibble: 45 x 6
#> mnemonic max min mean sd extreeme
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ACRU 55.3 0.9 16.36 12.56 41.49
#> 2 AMAR3 4.9 1.5 3.20 2.40 8.01
#> 3 ARSP2 2.8 2.8 2.80 NaN NaN
#> 4 CAAL27 82.4 4.8 29.86 15.15 60.16
#> 5 CACA18 21.9 0.9 4.67 3.31 11.28
#> 6 CAGL8 98.0 2.5 33.05 27.28 87.60
#> 7 CARA2 2.2 2.2 2.20 NaN NaN
#> 8 CARYA 33.4 33.4 33.40 NaN NaN
#> 9 CEOC2 18.3 1.9 5.79 5.39 16.57
#> 10 COFL2 16.6 2.4 6.75 3.09 12.93
#> # ... with 35 more rows
And now let’s plot the dbh values of the (cleaned) data and then overlay the extreme value of each species.
extreme <- rename(summarized, dbh = extreeme)
ggplot(valid_dbh, aes(x = dbh, y = fct_reorder(mnemonic, dbh))) +
geom_point(size = 1) +
geom_point(
data = extreme,
aes(x = dbh),
size = 1, colour = "red", shape = 19
) +
labs(y = NULL)
#> Warning: Removed 7 rows containing missing values (geom_point).
Is this different to the section “Check for code errors of species names”?