My goal is to integrate allodb – at least a simple subset of it – with bmss. This document has two sections:
In the first section I explore the equations dataset; I realize that some parameters are missing. This helps me to ask Erika the right question: Where can I find the missing parameters? The answer is, “In the allodb_master dataset.
In the second section I try to integrate the parameters into the equation column.
In this section my goal is to explain what columns are missing from the current equations dataset. I’ll show two examples, starting with a simplistic one.
library(tidyverse)
#> -- Attaching packages --------------------- tidyverse 1.2.1 --
#> v ggplot2 2.2.1     v purrr   0.2.4
#> v tibble  1.4.2     v dplyr   0.7.4
#> v tidyr   0.8.0     v stringr 1.3.0
#> v readr   1.1.1     v forcats 0.3.0
#> -- Conflicts ------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()
library(bmss)
library(allodb)
library(here)
#> here() starts at C:/Users/LeporeM/Dropbox/git_repos/allodb
biomass_per_indiv()Let’s start with a little reminder of what the current code of bmss does.
This is a simplistic model of what the equations dataset might look like.
eqn <- bmss::toy_default_eqn
eqn
#> # A tibble: 6 x 5
#>   site  sp     eqn      eqn_source eqn_type
#>   <chr> <chr>  <chr>    <chr>      <chr>   
#> 1 bci   swars1 30 * dbh default    species 
#> 2 bci   hybapr 40 * dbh default    species 
#> 3 bci   aegipa 8 * dbh  default    site    
#> 4 bci   beilpe 8 * dbh  default    site    
#> 5 xxx   aaaaaa 9 * dbh  default    site    
#> 6 xxx   bbbbbb 9 * dbh  default    site
Now say that we have a census dataset like this one:
cns <- bmss::user_data
cns
#> # A tibble: 4 x 3
#>   site  sp       dbh
#>   <chr> <chr>  <int>
#> 1 bci   swars1    33
#> 2 bci   hybapr    24
#> 3 bci   aegipa    12
#> 4 bci   beilpe     9
We can calculate biomass with our currently naive biomass_per_ind() function.
biomass_per_ind(cns, site = "site", sp = "sp", dbh = "dbh")
#> You gave no custom equations.
#>   * Using default equations.
#> # A tibble: 4 x 7
#>   site  sp       dbh eqn      eqn_source eqn_type  bmss
#>   <chr> <chr>  <int> <chr>    <chr>      <chr>    <dbl>
#> 1 bci   swars1    33 30 * dbh default    species   264.
#> 2 bci   hybapr    24 40 * dbh default    species   192.
#> 3 bci   aegipa    12 8 * dbh  default    site       96.
#> 4 bci   beilpe     9 8 * dbh  default    site       72.
biomass_per_indiv()Let’s brake down what biomass_per_ind() does to better understand what columns we need from the equations dataset.
joint <- left_join(cns, eqn)
#> Joining, by = c("site", "sp")
joint
#> # A tibble: 4 x 6
#>   site  sp       dbh eqn      eqn_source eqn_type
#>   <chr> <chr>  <int> <chr>    <chr>      <chr>   
#> 1 bci   swars1    33 30 * dbh default    species 
#> 2 bci   hybapr    24 40 * dbh default    species 
#> 3 bci   aegipa    12 8 * dbh  default    site    
#> 4 bci   beilpe     9 8 * dbh  default    site
To make things simpler, let’s keep only the columns that matter.
joint <- select(joint, dbh, eqn)
joint
#> # A tibble: 4 x 2
#>     dbh eqn     
#>   <int> <chr>   
#> 1    33 30 * dbh
#> 2    24 40 * dbh
#> 3    12 8 * dbh 
#> 4     9 8 * dbh
eqn with the corresponding value from the column dbh.joint <- mutate(
  joint, 
  eqn_replaced = str_replace(eqn, "dbh", as.character(dbh))
)
joint
#> # A tibble: 4 x 3
#>     dbh eqn      eqn_replaced
#>   <int> <chr>    <chr>       
#> 1    33 30 * dbh 30 * 33     
#> 2    24 40 * dbh 40 * 24     
#> 3    12 8 * dbh  8 * 12      
#> 4     9 8 * dbh  8 * 9
eqn_replaced to calculate biomass (bmss).dplyr::mutate(
  joint, 
  bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint))
)
#> # A tibble: 4 x 4
#>     dbh eqn      eqn_replaced  bmss
#>   <int> <chr>    <chr>        <dbl>
#> 1    33 30 * dbh 30 * 33       990.
#> 2    24 40 * dbh 40 * 24       960.
#> 3    12 8 * dbh  8 * 12         96.
#> 4     9 8 * dbh  8 * 9          72.
The equations in the equations dataset have many parameters – not only dbh. And we need one column for each of those parameters.
Let’s glimpse the dataset, and focus on the equation column.
glimpse(allodb::equations)
#> Observations: 421
#> Variables: 23
#> $ equation_id                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ biomass_component            <chr> "Stem and branches (live)", "Stem...
#> $ equation                     <chr> "a*(DBH^2)^b", "a*(DBH^2)^b", "a+...
#> $ allometry_specificity        <chr> "Species", "Species", "Species", ...
#> $ development_species          <chr> NA, NA, NA, "Ulmus americana", NA...
#> $ geographic_area              <chr> "North Carolina, Georgia", "North...
#> $ dbh_min_cm                   <chr> "14.22", "29.46", "2.5", "2.5", "...
#> $ dbh_max_cm                   <chr> "25.91", "41.66", "40", "40", "55...
#> $ n_trees                      <int> 9, 9, NA, NA, NA, NA, NA, NA, NA,...
#> $ dbh_units_original           <chr> "in", "in", "cm", "cm", "mm", "mm...
#> $ biomass_units_original       <chr> "lb", "lb", "kg", "kg", "kg", "kg...
#> $ allometry_development_method <chr> "harvest", "harvest", "harvest", ...
#> $ model_parameters             <chr> "DBH", "DBH", "DBH", "DBH", "DBH"...
#> $ regression_model             <chr> "linear_multiple", "linear_multip...
#> $ other_equations_tested       <chr> "yes", "yes", NA, NA, NA, NA, NA,...
#> $ log_biomass                  <chr> "10", "10", NA, NA, NA, NA, NA, N...
#> $ bias_corrected               <chr> "yes", "yes", "no", "no", "no", "...
#> $ bias_correction_factor       <chr> "included in model", "included in...
#> $ notes_fitting_model          <chr> "Regression equations were develo...
#> $ original_data_availability   <chr> "1", "1", NA, NA, NA, NA, NA, NA,...
#> $ notes_to_consider            <chr> NA, NA, NA, NA, NA, NA, "DBA = ba...
#> $ warning                      <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ ref_id                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, N...
The column equation has many parameters, such as a, b and c. To show how the code may deal with these parameters, I’ll create a small dataset with only the parameter a.
joint2 <- tibble::tribble(
            ~eqn, ~a, ~dbh,
  "33 * dbh * a",  5,   10,
  "24 * dbh * a",  1,   15,
  "12 * dbh * a",  2,   23
)
joint2
#> # A tibble: 3 x 3
#>   eqn              a   dbh
#>   <chr>        <dbl> <dbl>
#> 1 33 * dbh * a    5.   10.
#> 2 24 * dbh * a    1.   15.
#> 3 12 * dbh * a    2.   23.
And I will assume that this dataset is already merged with the user data, so it already has a dbh column – that is, here I start at the step 2 above.
eqn with the corresponding value from the column dbh.a.joint2 <- mutate(
  joint2, 
  eqn_replaced = str_replace(eqn, "dbh", as.character(dbh)),
  eqn_replaced = str_replace(eqn_replaced, "a", as.character(a))
)
joint2
#> # A tibble: 3 x 4
#>   eqn              a   dbh eqn_replaced
#>   <chr>        <dbl> <dbl> <chr>       
#> 1 33 * dbh * a    5.   10. 33 * 10 * 5 
#> 2 24 * dbh * a    1.   15. 24 * 15 * 1 
#> 3 12 * dbh * a    2.   23. 12 * 23 * 2
eqn_replaced to calculate biomass (bmss).dplyr::mutate(
  joint2, 
  bmss = map_dbl(.data$eqn_replaced, ~eval(parse(text = .x), envir = joint2))
)
#> # A tibble: 3 x 5
#>   eqn              a   dbh eqn_replaced  bmss
#>   <chr>        <dbl> <dbl> <chr>        <dbl>
#> 1 33 * dbh * a    5.   10. 33 * 10 * 5  1650.
#> 2 24 * dbh * a    1.   15. 24 * 15 * 1   360.
#> 3 12 * dbh * a    2.   23. 12 * 23 * 2   552.
In this section I show that we can programatically replace the parameters a-d from equations$equation. Also I identify a few parameters – other than a-d and DBH – that are missing and we’ll need to somehow feed into the code for it to to its job.
Get data.
master <- readr::read_csv(
  here::here("data-raw/allodb_master.csv"),
  col_types = allodb::type_allodb_master()
)
# Tidy
master <- master %>% 
  # Add spacing for readability (http://style.tidyverse.org/syntax.html#spacing)
  mutate(
    equation = formatR::tidy_source(text = equation)[[1]]
  ) %>% 
  # Show most key column first
  select(equation, everything())
master
#> # A tibble: 421 x 40
#>    equation    site   family  species  species_code life_form wsg   wsg_id
#>    <chr>       <chr>  <chr>   <chr>    <chr>        <chr>     <chr> <chr> 
#>  1 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901          Tree      0.6   <NA>  
#>  2 a * (DBH^2~ Lilly~ Fabace~ Robinia~ 901          Tree      0.66  <NA>  
#>  3 a * (DBH^b) Lilly~ Ulmace~ Ulmus a~ 972          Tree      0.46  <NA>  
#>  4 a * (DBH^b) Lilly~ Ulmace~ Ulmus r~ 975          Tree      0.48  <NA>  
#>  5 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 541          Tree      0.51  <NA>  
#>  6 a + b * DB~ Lilly~ Oleace~ Fraxinu~ 544          Tree      0.53  <NA>  
#>  7 a * (DBA^b) Lilly~ Hamame~ Hamamel~ 498          Shrub     0.71  <NA>  
#>  8 a * (DBH^b) Lilly~ Cupres~ Juniper~ 68           Tree      0.44  <NA>  
#>  9 a * (DBH^b) Lilly~ Fagace~ Fagus g~ 531          Tree      0.56  <NA>  
#> 10 a * (DBH^b) Lilly~ Fagace~ Quercus~ 802          Tree      0.6   <NA>  
#> # ... with 411 more rows, and 32 more variables: wsg_specificity <chr>,
#> #   a <dbl>, b <dbl>, c <dbl>, d <dbl>, dbh_min_cm <chr>,
#> #   dbh_max_cm <chr>, n_trees <int>, dbh_units_original <chr>,
#> #   biomass_units_original <chr>, allometry_development_method <chr>,
#> #   site_dbh_unit <chr>, equation_id <chr>, equation_grouping <chr>,
#> #   independent_variable <chr>, regression_model <chr>,
#> #   other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> #   bias_correction_factor <chr>, notes_fitting_model <chr>,
#> #   dependent_variable_biomass_component <chr>,
#> #   allometry_specificity <chr>, development_species <chr>,
#> #   geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> #   wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> #   notes_to_consider <chr>, warning <chr>
Explore distinct equations
master %>%
  select(equation) %>% 
  distinct() %>% 
  print(n = nrow(.))
#> # A tibble: 17 x 1
#>    equation                                 
#>    <chr>                                    
#>  1 a * (DBH^2)^b                            
#>  2 a * (DBH^b)                              
#>  3 a + b * DBH + c * (DBH^d)                
#>  4 a * (DBA^b)                              
#>  5 exp(a + b * ln(DBH))                     
#>  6 exp(a + b * DBH + c * (ln(DBH^d)))       
#>  7 10^a + b * (log10(DBH^c))                
#>  8 a + b * DBH                              
#>  9 a + b * BA                               
#> 10 exp(a + b * ln(DBA))                     
#> 11 exp(a + (b * ln(DBH))) * 419.814 * 1.22  
#> 12 exp(a + (b * ln(DBH))) * 645.704 * 1.05  
#> 13 10^a * DBH^b                             
#> 14 a + (b * DBH) + c * (DBH^2) + d * (DBH^3)
#> 15 NA                                       
#> 16 exp(a + (b * (ln(pi * DBH))))            
#> 17 exp(a + b * (DBH/DBH + c))
Notice NAs. What should we do with this?
is_missing <- is.na(master$equation) | grepl("NA", master$equation)
master %>% 
  # Find both: missing value `NA` and the literal string "NA"
  filter(is_missing)
#> # A tibble: 2 x 40
#>   equation site     family species     species_code life_form wsg   wsg_id
#>   <chr>    <chr>    <chr>  <chr>       <chr>        <chr>     <chr> <chr> 
#> 1 NA       Wind Ri~ Unkown Unknown Un~ UNKN         Tree      0.43  <NA>  
#> 2 NA       Yosemite Unkown Unknown Un~ UNKN         <NA>      <NA>  <NA>  
#> # ... with 32 more variables: wsg_specificity <chr>, a <dbl>, b <dbl>,
#> #   c <dbl>, d <dbl>, dbh_min_cm <chr>, dbh_max_cm <chr>, n_trees <int>,
#> #   dbh_units_original <chr>, biomass_units_original <chr>,
#> #   allometry_development_method <chr>, site_dbh_unit <chr>,
#> #   equation_id <chr>, equation_grouping <chr>,
#> #   independent_variable <chr>, regression_model <chr>,
#> #   other_equations_tested <chr>, log_biomass <chr>, bias_corrected <chr>,
#> #   bias_correction_factor <chr>, notes_fitting_model <chr>,
#> #   dependent_variable_biomass_component <chr>,
#> #   allometry_specificity <chr>, development_species <chr>,
#> #   geographic_area <chr>, biomass_equation_source <chr>, ref_id <chr>,
#> #   wsg_source <chr>, ref_wsg_id <chr>, original_data_availability <chr>,
#> #   notes_to_consider <chr>, warning <chr>
In the meantime, I keep only non missing equations.
non_missing <- master %>% filter(!is_missing)
Replacing a-d.
replaced <- non_missing %>%
  mutate(
    eqn_replaced = str_replace_all(equation, "a", as.character(a)),
  ) %>% 
  mutate(
    eqn_replaced = str_replace_all(eqn_replaced, "b", as.character(b)),
    eqn_replaced = str_replace_all(eqn_replaced, "c", as.character(c)),
    eqn_replaced = str_replace_all(eqn_replaced, "d", as.character(d)),
  ) %>% 
  select(eqn_replaced)
replaced
#> # A tibble: 419 x 1
#>    eqn_replaced                          
#>    <chr>                                 
#>  1 4.13741 * (DBH^2)^1.08876             
#>  2 1.04649 * (DBH^2)^1.37539             
#>  3 0.08248 * (DBH^2.468)                 
#>  4 0.08248 * (DBH^2.468)                 
#>  5 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#>  6 3.203 + -0.234 * DBH + 0.006 * (DBH^2)
#>  7 38.111 * (DBA^2.9)                    
#>  8 0.1632 * (DBH^2.2454)                 
#>  9 2.0394 * (DBH^2.5715)                 
#> 10 1.5647 * (DBH^2.6887)                 
#> # ... with 409 more rows