13 How to make a simple data dictionary

Inspired by Dania M. Rodriguez (https://cran.r-project.org/web/packages/dataMeta/vignettes/dataMeta_Vignette.html), I wrote an R function data_dic_builder() for making simple data dictionaries.

Explanations:

  • Output
    • the output is a dataframe having two columns
    • column 1 has the names of data variables
    • column 2 has, for each variable, either the number of its unique values (if the variable is not of interest) or the unique values themselves, one per row (if it is of interest)
  • Arguments
    • ‘df’ is a dataframe which contains the data
    • ‘variable_type’ is a vector of 0s and 1s with one entry per column of ‘df’, in column order. 0 means we are not interested in that variable; 1 means we are interested in it

Example:

# a function for making simple data dictionaries --------------------------
#' Build a simple data dictionary for a data frame
#'
#' Produces one row for every variable that is not of interest (showing how
#' many unique values it has) and one row per unique value for every variable
#' of interest (the variable name appears on the first of those rows only).
#'
#' @param df A data frame, or anything `as.data.frame()` can convert.
#' @param variable_type A vector of 0s and 1s with one entry per column of
#'   `df`, in column order. 0 = only count the unique values; 1 = list them.
#' @return A data frame with two character columns, `Var_name` and
#'   `Unique_n_or_unique_values`.
data_dic_builder <- function(df, variable_type) {
  df <- as.data.frame(df) # make sure df is a dataframe
  n_vars <- ncol(df)

  # Fail early with a clear message instead of producing misaligned rows.
  if (length(variable_type) != n_vars) {
    stop("'variable_type' must have one entry per column of 'df'",
         call. = FALSE)
  }
  if (anyNA(variable_type) || !all(variable_type %in% c(0, 1))) {
    stop("'variable_type' must contain only 0 or 1", call. = FALSE)
  }
  list_values <- variable_type == 1

  # Second-column entries, one character vector per variable, so the number
  # of rows a variable occupies is simply the length of its vector.
  entries <- lapply(seq_len(n_vars), function(i) {
    uniq_values <- unique(df[[i]])
    if (!list_values[i]) {
      # length() is an integer, so large counts print as "100000" rather
      # than in scientific notation.
      as.character(length(uniq_values))
    } else if (length(uniq_values) == 0) {
      # df has no rows: still emit one (empty) row so the variable is listed.
      ""
    } else {
      # Decided by variable_type, not by the number of unique values, so a
      # variable of interest with a single distinct value shows that value.
      as.character(uniq_values)
    }
  })

  # The variable name goes on the first row of its block; the rest stay "".
  block_sizes <- lengths(entries)
  var_names <- rep("", sum(block_sizes))
  var_names[cumsum(block_sizes) - block_sizes + 1] <- names(df)

  data.frame(
    Var_name = var_names,
    Unique_n_or_unique_values = as.character(unlist(entries,
                                                    use.names = FALSE)),
    stringsAsFactors = FALSE
  )
}


# a test example ----------------------------------------------------------
## create a fake data set
## (no seed is set, so the order of the listed values varies between runs)
color_set <- c("red", "green", "blue")
fk_data <- data.frame(x = rnorm(100),
                      type = sample(LETTERS[1:5], 100, replace = TRUE),
                      color = sample(color_set, 100, replace = TRUE),
                      y = runif(100))

## x and y are only counted (0); type and color have their values listed (1)
my_dic <- data_dic_builder(df = fk_data, variable_type = c(0, 1, 1, 0))

print(my_dic)
##    Var_name Unique_n_or_unique_values
## 1         x                       100
## 2      type                         C
## 3                                   B
## 4                                   A
## 5                                   D
## 6                                   E
## 7     color                      blue
## 8                                 red
## 9                               green
## 10        y                       100