Manipulating multiple spreadsheets in an excel workbook

Pull each spreadsheets out of an excel workbook and into a dataframe
We would want to remove any stems in the new_secondary_stems sheet with a ‘0’ in the ‘New Stem?’ column as they are automatically generated
At the very least we would likely want to keep them separate but export [the spreadsheets] as individual .csv files
Depending on the site they may wish to combine the separate sheets in the excel file […] into one .csv file
Date (YYYY-MM-DD format) and Team from the Start_Page added as columns to the combined file, or to each of the separate files
- Possibly a summary table of the following for the data manager: Root ‘Submitted On’, Field Meta Data: ‘Quadrat_1 Latitude’, ‘Quadrat_1 Longitude’, and ‘Quadrat_1 Timestamp’, and Form Meta Data: ‘Start Form Time Stamp’ and ‘End Form Time Stamp’

This document follows a discussion with Jess Sue (https://github.com/forestgeo/qcr/issues/2). To address these issues we will use some functions that temporarily live in the qcr package (I may eventually move elsewhere).

# devtools::install_github("forestgeo/qcr")
library(qcr)

We will also need some other packages. We’ll load all packages here. To be explicit, however, I’ll use the syntax package::function(), except when a function comes from the base package.

library(readxl)
library(here)

#> here() starts at C:/Users/dora/Dropbox/git_repos/qcr

library(fs)
# tidyverse includes a number of packages
library(tidyverse)

#> -- Attaching packages ---------------------------------------- tidyverse 1.2.1 --

#> v ggplot2 2.2.1     v purrr   0.2.4
#> v tibble  1.4.2     v dplyr   0.7.4
#> v tidyr   0.7.2     v stringr 1.2.0
#> v readr   1.1.1     v forcats 0.2.0

#> -- Conflicts ------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()

Pull each spreadsheets out of an excel workbook and into a dataframe

Create a list of dataframes – each dataframe maps to a spreadsheets in an excel workbook.

# Path to excel workbook
workbook_path <- here::here("inst/issue_2/101_01-05-2018_JS_raw_file.xlsx")
workbook_path

#> [1] "C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/101_01-05-2018_JS_raw_file.xlsx"

df_list <- qcr::map_read_excel(workbook_path)
# Showing only part of the data to save space
str(df_list, list.len = 3)

#> List of 8
#>  $ Root               :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of  6 variables:
#>   ..$ FormId       : chr "185528"
#>   ..$ Form Name    : chr "SCBI Data Entry 2018"
#>   ..$ Submitted On : chr "01-05-2018 03:45 PM"
#>   .. [list output truncated]
#>  $ Existing_stems     :Classes 'tbl_df', 'tbl' and 'data.frame': 0 obs. of  0 variables
#>  $ Field Meta Data    :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of  4 variables:
#>   ..$ Submission Id      : chr "73e4154d-ca4b-495d-936b-98d95fe916a0"
#>   ..$ Quadrat_1 Latitude : chr "38.888244"
#>   ..$ Quadrat_1 Longitude: chr "-76.554154"
#>   .. [list output truncated]
#>   [list output truncated]

Naming spreadsheets consistently.

# * Inconsistent upper and lower case.
# * Inconsistent separation between words with " " or "_".
names(df_list)

#> [1] "Root"                "Existing_stems"      "Field Meta Data"    
#> [4] "Form Meta Data"      "new_secondary_stems" "orignal_stems"      
#> [7] "Recruits"            "Start_page"

# Consistent
df_list <- qcr::nms_link_tolower(df_list)
names(df_list)

#> [1] "root"                "existing_stems"      "field_meta_data"    
#> [4] "form_meta_data"      "new_secondary_stems" "orignal_stems"      
#> [7] "recruits"            "start_page"

The same issue applies for the names inside each element of df_list. If you keep the inconsistent names, selecting variables by name becomes a nightmare. Let’s fix that.

df_list <- lapply(df_list, nms_link_tolower)
# Fixed
lapply(df_list, names)

#> $root
#> [1] "formid"        "form_name"     "submitted_on"  "form_version" 
#> [5] "submitted_by"  "submission_id"
#> 
#> $existing_stems
#> character(0)
#> 
#> $field_meta_data
#> [1] "submission_id"       "quadrat_1_latitude"  "quadrat_1_longitude"
#> [4] "quadrat_1_timestamp"
#> 
#> $form_meta_data
#> [1] "submission_id"         "start_form_time_stamp" "end_form_time_stamp"  
#> 
#> $new_secondary_stems
#>  [1] "submission_id" "section_id"    "quadrat"       "tag"          
#>  [5] "species"       "px"            "py"            "stem_tag"     
#>  [9] "dbh"           "status"        "codes"         "pom"          
#> [13] "notes"         "new_stem?"    
#> 
#> $orignal_stems
#>  [1] "submission_id" "section_id"    "quadrat"       "tag"          
#>  [5] "stem_tag"      "species"       "px"            "py"           
#>  [9] "dbh"           "status"        "codes"         "pom"          
#> [13] "dbh_2018"      "status_2018"   "codes_2018"    "notes"        
#> [17] "data_check"    "dbh_check"    
#> 
#> $recruits
#>  [1] "submission_id" "quadrat"       "tag"           "stem_tag"     
#>  [5] "species"       "px"            "py"            "dbh"          
#>  [9] "status"        "codes"         "pom"           "notes"        
#> 
#> $start_page
#> [1] "submission_id" "section_id"    "quadrat"       "date"         
#> [5] "team"          "stem_count"    "view_map"      "map"

We would want to remove any stems in the new_secondary_stems sheet with a ‘0’ in the ‘New Stem?’ column as they are automatically generated

# Suggestion: The symbol `?` is difficult to handle. Avoid it if you can. One
# common way to replace a `?` is whith the prefix `is_`, so `new_stem?` becomes
# `is_new_stem` and the values should be interpreted as logical: i.e. TRUE or
# FALSE (or 1 or 0).

# Before
dplyr::select(df_list$new_secondary_stems, matches("new_stem?"), dplyr::everything())

#> # A tibble: 3 x 14
#>   `new_stem?` submission_id   section_id quadrat tag   species px    py   
#>   <chr>       <chr>           <lgl>      <chr>   <chr> <chr>   <chr> <chr>
#> 1 1           73e4154d-ca4b-~ NA         101     10004 libe    9.80~ 5.80~
#> 2 0           73e4154d-ca4b-~ NA         101     10004 libe    9.80~ 5.80~
#> 3 0           73e4154d-ca4b-~ NA         101     10004 libe    9.80~ 5.80~
#> # ... with 6 more variables: stem_tag <chr>, dbh <chr>, status <chr>,
#> #   codes <chr>, pom <chr>, notes <lgl>

# Clean (if you eventually rename to `is_new_var`, then use `var = is_new_var`)
df_list <- qcr::drop_fake_stems(df_list, var = "new_stem?")

# After
dplyr::select(df_list$new_secondary_stems, `new_stem?`, dplyr::everything())

#> # A tibble: 1 x 14
#>   `new_stem?` submission_id   section_id quadrat tag   species px    py   
#>   <chr>       <chr>           <lgl>      <chr>   <chr> <chr>   <chr> <chr>
#> 1 1           73e4154d-ca4b-~ NA         101     10004 libe    9.80~ 5.80~
#> # ... with 6 more variables: stem_tag <chr>, dbh <chr>, status <chr>,
#> #   codes <chr>, pom <chr>, notes <lgl>

At the very least we would likely want to keep them separate but export [the spreadsheets] as individual .csv files

Write a csv file for each dataframe in the list (i.e. for each spreadsheet in the excel workbook).

output_dir <- here::here("inst/issue_2/output")
output_dir

#> [1] "C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output"

qcr::walk_write_csv(df_list, output_dir, prefix = "mybook-")

# Check if it worked
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files

#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-existing_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-field_meta_data.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-form_meta_data.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-new_secondary_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-orignal_stems.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-recruits.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-root.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/mybook-start_page.csv

# Cleanup my directory
unlink(new_files)

Depending on the site they may wish to combine the separate sheets in the excel file […] into one .csv file

Combining as many spreadsheets as possible. (Empty spreadsheets will be discarded because they have no column-names that can be used for combining all spreadsheets into one.)

length(df_list)

#> [1] 8

df_list <- qcr::keep_non_empty(df_list)

#> Warning: Discarding empty tables: 
#> existing_stems

length(df_list)

#> [1] 7

all_possible <- qcr::reduce_full_join(df_list)

#> Joining, by = "submission_id"
#> Joining, by = "submission_id"
#> Joining, by = "submission_id"

#> Joining, by = c("submission_id", "section_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")

#> Joining, by = c("submission_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")

#> Joining, by = c("submission_id", "section_id", "quadrat")

# This is it
tibble::as.tibble(all_possible)

#> # A tibble: 6 x 34
#>   formid form_name  submitted_on  form_version submitted_by submission_id 
#>   <chr>  <chr>      <chr>         <chr>        <chr>        <chr>         
#> 1 185528 SCBI Data~ 01-05-2018 0~ 35           shuej@si.edu 73e4154d-ca4b~
#> 2 <NA>   <NA>       <NA>          <NA>         <NA>         73e4154d-ca4b~
#> 3 <NA>   <NA>       <NA>          <NA>         <NA>         73e4154d-ca4b~
#> 4 <NA>   <NA>       <NA>          <NA>         <NA>         73e4154d-ca4b~
#> 5 <NA>   <NA>       <NA>          <NA>         <NA>         73e4154d-ca4b~
#> 6 <NA>   <NA>       <NA>          <NA>         <NA>         73e4154d-ca4b~
#> # ... with 28 more variables: quadrat_1_latitude <chr>,
#> #   quadrat_1_longitude <chr>, quadrat_1_timestamp <chr>,
#> #   start_form_time_stamp <chr>, end_form_time_stamp <chr>,
#> #   section_id <lgl>, quadrat <chr>, tag <chr>, species <chr>, px <chr>,
#> #   py <chr>, stem_tag <chr>, dbh <chr>, status <chr>, codes <chr>,
#> #   pom <chr>, notes <lgl>, `new_stem?` <chr>, dbh_2018 <chr>,
#> #   status_2018 <chr>, codes_2018 <chr>, data_check <chr>,
#> #   dbh_check <chr>, date <chr>, team <chr>, stem_count <chr>,
#> #   view_map <chr>, map <chr>

Combining specific spreadsheets.

these_sheets <- c("recruits", "new_secondary_stems", "orignal_stems")
cherries <- reduce_full_join(df_list, these_sheets)

#> Warning in all(purrr::map(.df_nm, ~rlang::has_name(.df, .))): coercing
#> argument of type 'list' to logical

#> Joining, by = c("submission_id", "section_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")

#> Joining, by = c("submission_id", "quadrat", "tag", "species", "px", "py", "stem_tag", "dbh", "status", "codes", "pom", "notes")

# This is it
tibble::as.tibble(cherries)

#> # A tibble: 6 x 19
#>   submission_id     section_id quadrat tag    species px    py    stem_tag
#>   <chr>             <lgl>      <chr>   <chr>  <chr>   <chr> <chr> <chr>   
#> 1 73e4154d-ca4b-49~ NA         101     10004  libe    9.80~ 5.80~ 4       
#> 2 73e4154d-ca4b-49~ NA         101     10004  libe    9.80~ 5.80~ 1       
#> 3 73e4154d-ca4b-49~ NA         101     10004  libe    9.80~ 5.80~ 2       
#> 4 73e4154d-ca4b-49~ NA         101     10004  libe    9.80~ 5.80~ 3       
#> 5 73e4154d-ca4b-49~ NA         101     123456 acne    2     2     1       
#> 6 73e4154d-ca4b-49~ NA         101     123456 acne    2     2     2       
#> # ... with 11 more variables: dbh <chr>, status <chr>, codes <chr>,
#> #   pom <chr>, notes <lgl>, `new_stem?` <chr>, dbh_2018 <chr>,
#> #   status_2018 <chr>, codes_2018 <chr>, data_check <chr>, dbh_check <chr>

Saving the output as .csv.

all_possible_path <- file.path(paste0(output_dir, "/all_possible.csv"))
readr::write_csv(all_possible, all_possible_path)

cherries_path <- file.path(paste0(output_dir, "/cherries.csv"))
readr::write_csv(cherries, cherries_path)

# Check then clean up
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files

#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/all_possible.csv
#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/cherries.csv

unlink(new_files)

Date (YYYY-MM-DD format) and Team from the Start_Page added as columns to the combined file, or to each of the separate files

If the list that you pass to reduce_full_join() includes the spreadsheet start_page, its columns will be added.

# Reordering columns to show relevant columns first
tibble::as.tibble(
  dplyr::select(all_possible, date, team, dplyr::everything())
)

#> # A tibble: 6 x 34
#>   date    team  formid form_name   submitted_on  form_version submitted_by
#>   <chr>   <chr> <chr>  <chr>       <chr>         <chr>        <chr>       
#> 1 01-05-~ JS    185528 SCBI Data ~ 01-05-2018 0~ 35           shuej@si.edu
#> 2 01-05-~ JS    <NA>   <NA>        <NA>          <NA>         <NA>        
#> 3 01-05-~ JS    <NA>   <NA>        <NA>          <NA>         <NA>        
#> 4 01-05-~ JS    <NA>   <NA>        <NA>          <NA>         <NA>        
#> 5 01-05-~ JS    <NA>   <NA>        <NA>          <NA>         <NA>        
#> 6 01-05-~ JS    <NA>   <NA>        <NA>          <NA>         <NA>        
#> # ... with 27 more variables: submission_id <chr>,
#> #   quadrat_1_latitude <chr>, quadrat_1_longitude <chr>,
#> #   quadrat_1_timestamp <chr>, start_form_time_stamp <chr>,
#> #   end_form_time_stamp <chr>, section_id <lgl>, quadrat <chr>, tag <chr>,
#> #   species <chr>, px <chr>, py <chr>, stem_tag <chr>, dbh <chr>,
#> #   status <chr>, codes <chr>, pom <chr>, notes <lgl>, `new_stem?` <chr>,
#> #   dbh_2018 <chr>, status_2018 <chr>, codes_2018 <chr>, data_check <chr>,
#> #   dbh_check <chr>, stem_count <chr>, view_map <chr>, map <chr>

Possibly a summary table of the following for the data manager: Root ‘Submitted On’, Field Meta Data: ‘Quadrat_1 Latitude’, ‘Quadrat_1 Longitude’, and ‘Quadrat_1 Timestamp’, and Form Meta Data: ‘Start Form Time Stamp’ and ‘End Form Time Stamp’

pattern <- "submitted|quadrat_1|start|end"
summary <- dplyr::select(all_possible, matches(pattern))

summary_path <- paste0(output_dir, "/summary.csv")
readr::write_csv(summary, summary_path)

# Check if it worked
new_files <- fs::dir_ls(output_dir, regexp = "*.csv")
new_files

#> C:/Users/dora/Dropbox/git_repos/qcr/inst/issue_2/output/summary.csv

# Clean up
unlink(new_files)