This document follows a discussion with Jess Sue (https://github.com/forestgeo/qcr/issues/2). To address these issues we will use some functions that temporarily live in the qcr package (I may eventually move elsewhere).
# devtools::install_github("forestgeo/qcr")
library(qcr)
We will also need some other packages. We’ll load all packages here. To be explicit, however, I’ll use the syntax package::function()
, except when a function comes from the base package.
library(readxl)
library(here)
#> here() starts at C:/Users/dora/Dropbox/git_repos/qcr
library(fs)
# tidyverse includes a number of packages
library(tidyverse)
#> -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1 v purrr 0.2.4
#> v tibble 1.4.2 v dplyr 0.7.4
#> v tidyr 0.7.2 v stringr 1.2.0
#> v readr 1.1.1 v forcats 0.2.0
#> -- Conflicts ------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
Create a list of dataframes – each dataframe maps to a spreadsheets in an excel workbook.
# Path to excel workbook
workbook_path <- here::here("inst/issue_2/101_01-05-2018_JS_raw_file.xlsx")
workbook_path
#> [1] "C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/101_01-05-2018_JS_raw_file.xlsx"
df_list <- qcr::map_read_excel(workbook_path)
# Showing only part of the data to save space
str(df_list, list.len = 3)
#> List of 8
#> $ Root :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 6 variables:
#> ..$ FormId : chr "185528"
#> ..$ Form Name : chr "SCBI Data Entry 2018"
#> ..$ Submitted On : chr "01-05-2018 03:45 PM"
#> .. [list output truncated]
#> $ Existing_stems :Classes 'tbl_df', 'tbl' and 'data.frame': 0 obs. of 0 variables
#> $ Field Meta Data :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 4 variables:
#> ..$ Submission Id : chr "73e4154d-ca4b-495d-936b-98d95fe916a0"
#> ..$ Quadrat_1 Latitude : chr "38.888244"
#> ..$ Quadrat_1 Longitude: chr "-76.554154"
#> .. [list output truncated]
#> [list output truncated]
Naming spreadsheets consistently.
# * Inconsistent upper and lower case.
# * Inconsistent separation between words with " " or "_".
names(df_list)
#> [1] "Root" "Existing_stems" "Field Meta Data"
#> [4] "Form Meta Data" "new_secondary_stems" "orignal_stems"
#> [7] "Recruits" "Start_page"
# Consistent
df_list <- qcr::nms_link_tolower(df_list)
names(df_list)
#> [1] "root" "existing_stems" "field_meta_data"
#> [4] "form_meta_data" "new_secondary_stems" "orignal_stems"
#> [7] "recruits" "start_page"
The same issue applies for the names inside each element of df_list
. If you keep the inconsistent names, selecting variables by name becomes a nightmare. Let’s fix that.
df_list <- lapply(df_list, nms_link_tolower)
# Fixed
lapply(df_list, names)
#> $root
#> [1] "formid" "form_name" "submitted_on" "form_version"
#> [5] "submitted_by" "submission_id"
#>
#> $existing_stems
#> character(0)
#>
#> $field_meta_data
#> [1] "submission_id" "quadrat_1_latitude" "quadrat_1_longitude"
#> [4] "quadrat_1_timestamp"
#>
#> $form_meta_data
#> [1] "submission_id" "start_form_time_stamp" "end_form_time_stamp"
#>
#> $new_secondary_stems
#> [1] "submission_id" "section_id" "quadrat" "tag"
#> [5] "species" "px" "py" "stem_tag"
#> [9] "dbh" "status" "codes" "pom"
#> [13] "notes" "new_stem?"
#>
#> $orignal_stems
#> [1] "submission_id" "section_id" "quadrat" "tag"
#> [5] "stem_tag" "species" "px" "py"
#> [9] "dbh" "status" "codes" "pom"
#> [13] "dbh_2018" "status_2018" "codes_2018" "notes"
#> [17] "data_check" "dbh_check"
#>
#> $recruits
#> [1] "submission_id" "quadrat" "tag" "stem_tag"
#> [5] "species" "px" "py" "dbh"
#> [9] "status" "codes" "pom" "notes"
#>
#> $start_page
#> [1] "submission_id" "section_id" "quadrat" "date"
#> [5] "team" "stem_count" "view_map" "map"
# Suggestion: The symbol `?` is difficult to handle. Avoid it if you can. One
# common way to replace a `?` is whith the prefix `is_`, so `new_stem?` becomes
# `is_new_stem` and the values should be interpreted as logical: i.e. TRUE or
# FALSE (or 1 or 0).
# Before
dplyr::select(df_list$new_secondary_stems, matches("new_stem?"), dplyr::everything())
#> # A tibble: 3 x 14
#> `new_stem?` submission_id section_id quadrat tag species px py
#> <chr> <chr> <lgl> <chr> <chr> <chr> <chr> <chr>
#> 1 1 73e4154d-ca4b-~ NA 101 10004 libe 9.80~ 5.80~
#> 2 0 73e4154d-ca4b-~ NA 101 10004 libe 9.80~ 5.80~
#> 3 0 73e4154d-ca4b-~ NA 101 10004 libe 9.80~ 5.80~
#> # ... with 6 more variables: stem_tag <chr>, dbh <chr>, status <chr>,
#> # codes <chr>, pom <chr>, notes <lgl>
# Clean (if you eventually rename to `is_new_var`, then use `var = is_new_var`)
df_list <- qcr::drop_fake_stems(df_list, var = "new_stem?")
# After
dplyr::select(df_list$new_secondary_stems, `new_stem?`, dplyr::everything())
#> # A tibble: 1 x 14
#> `new_stem?` submission_id section_id quadrat tag species px py
#> <chr> <chr> <lgl> <chr> <chr> <chr> <chr> <chr>
#> 1 1 73e4154d-ca4b-~ NA 101 10004 libe 9.80~ 5.80~
#> # ... with 6 more variables: stem_tag <chr>, dbh <chr>, status <chr>,
#> # codes <chr>, pom <chr>, notes <lgl>
Write a csv file for each dataframe in the list (i.e. for each spreadsheet in the excel workbook).
output_dir <- here::here("inst/issue_2/output")
output_dir
#> [1] "C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output"
qcr::walk_write_csv(df_list, output_dir, prefix = "mybook-")
# Check if it worked
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-existing_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-field_meta_data.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-form_meta_data.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-new_secondary_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-orignal_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-recruits.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-root.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-start_page.csv
# Cleanup my directory
unlink(new_files)
Combining as many spreadsheets as possible. (Empty spreadsheets will be discarded because they have no column-names that can be used for combining all spreadsheets into one.)
length(df_list)
#> [1] 8
df_list <- qcr::keep_non_empty(df_list)
#> Warning: Discarding empty tables:
#> existing_stems
length(df_list)
#> [1] 7
all_possible <- qcr::reduce_full_join(df_list)
#> Joining, by = "submission_id"
#> Joining, by = "submission_id"
#> Joining, by = "submission_id"
#> Joining, by = c("submission_id", "section_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")
#> Joining, by = c("submission_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")
#> Joining, by = c("submission_id", "section_id", "quadrat")
# This is it
tibble::as.tibble(all_possible)
#> # A tibble: 6 x 34
#> formid form_name submitted_on form_version submitted_by submission_id
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 185528 SCBI Data~ 01-05-2018 0~ 35 shuej@si.edu 73e4154d-ca4b~
#> 2 <NA> <NA> <NA> <NA> <NA> 73e4154d-ca4b~
#> 3 <NA> <NA> <NA> <NA> <NA> 73e4154d-ca4b~
#> 4 <NA> <NA> <NA> <NA> <NA> 73e4154d-ca4b~
#> 5 <NA> <NA> <NA> <NA> <NA> 73e4154d-ca4b~
#> 6 <NA> <NA> <NA> <NA> <NA> 73e4154d-ca4b~
#> # ... with 28 more variables: quadrat_1_latitude <chr>,
#> # quadrat_1_longitude <chr>, quadrat_1_timestamp <chr>,
#> # start_form_time_stamp <chr>, end_form_time_stamp <chr>,
#> # section_id <lgl>, quadrat <chr>, tag <chr>, species <chr>, px <chr>,
#> # py <chr>, stem_tag <chr>, dbh <chr>, status <chr>, codes <chr>,
#> # pom <chr>, notes <lgl>, `new_stem?` <chr>, dbh_2018 <chr>,
#> # status_2018 <chr>, codes_2018 <chr>, data_check <chr>,
#> # dbh_check <chr>, date <chr>, team <chr>, stem_count <chr>,
#> # view_map <chr>, map <chr>
Combining specific spreadsheets.
these_sheets <- c("recruits", "new_secondary_stems", "orignal_stems")
cherries <- reduce_full_join(df_list, these_sheets)
#> Warning in all(purrr::map(.df_nm, ~rlang::has_name(.df, .))): coercing
#> argument of type 'list' to logical
#> Joining, by = c("submission_id", "section_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")
#> Joining, by = c("submission_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")
# This is it
tibble::as.tibble(cherries)
#> # A tibble: 6 x 19
#> submission_id section_id quadrat tag species px py stem_tag
#> <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 73e4154d-ca4b-49~ NA 101 10004 libe 9.80~ 5.80~ 4
#> 2 73e4154d-ca4b-49~ NA 101 10004 libe 9.80~ 5.80~ 1
#> 3 73e4154d-ca4b-49~ NA 101 10004 libe 9.80~ 5.80~ 2
#> 4 73e4154d-ca4b-49~ NA 101 10004 libe 9.80~ 5.80~ 3
#> 5 73e4154d-ca4b-49~ NA 101 123456 acne 2 2 1
#> 6 73e4154d-ca4b-49~ NA 101 123456 acne 2 2 2
#> # ... with 11 more variables: dbh <chr>, status <chr>, codes <chr>,
#> # pom <chr>, notes <lgl>, `new_stem?` <chr>, dbh_2018 <chr>,
#> # status_2018 <chr>, codes_2018 <chr>, data_check <chr>, dbh_check <chr>
Saving the output as .csv.
all_possible_path <- file.path(paste0(output_dir, "/all_possible.csv"))
readr::write_csv(all_possible, all_possible_path)
cherries_path <- file.path(paste0(output_dir, "/cherries.csv"))
readr::write_csv(cherries, cherries_path)
# Check then clean up
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/all_possible.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/cherries.csv
unlink(new_files)
If the list that you pass to reduce_full_join()
includes the spreadsheet start_page
, its columns will be added.
# Reordering columns to show relevant columns first
tibble::as.tibble(
dplyr::select(all_possible, date, team, dplyr::everything())
)
#> # A tibble: 6 x 34
#> date team formid form_name submitted_on form_version submitted_by
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 01-05-~ JS 185528 SCBI Data ~ 01-05-2018 0~ 35 shuej@si.edu
#> 2 01-05-~ JS <NA> <NA> <NA> <NA> <NA>
#> 3 01-05-~ JS <NA> <NA> <NA> <NA> <NA>
#> 4 01-05-~ JS <NA> <NA> <NA> <NA> <NA>
#> 5 01-05-~ JS <NA> <NA> <NA> <NA> <NA>
#> 6 01-05-~ JS <NA> <NA> <NA> <NA> <NA>
#> # ... with 27 more variables: submission_id <chr>,
#> # quadrat_1_latitude <chr>, quadrat_1_longitude <chr>,
#> # quadrat_1_timestamp <chr>, start_form_time_stamp <chr>,
#> # end_form_time_stamp <chr>, section_id <lgl>, quadrat <chr>, tag <chr>,
#> # species <chr>, px <chr>, py <chr>, stem_tag <chr>, dbh <chr>,
#> # status <chr>, codes <chr>, pom <chr>, notes <lgl>, `new_stem?` <chr>,
#> # dbh_2018 <chr>, status_2018 <chr>, codes_2018 <chr>, data_check <chr>,
#> # dbh_check <chr>, stem_count <chr>, view_map <chr>, map <chr>
pattern <- "submitted|quadrat_1|start|end"
summary <- dplyr::select(all_possible, matches(pattern))
summary_path <- paste0(output_dir, "/summary.csv")
readr::write_csv(summary, summary_path)
# Check if it worked
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/summary.csv
# Clean up
unlink(new_files)