13 How to make a simple data dictionary
Inspired by Dania M. Rodriguez (https://cran.r-project.org/web/packages/dataMeta/vignettes/dataMeta_Vignette.html), I wrote an R function data_dic_builder() for making simple data dictionaries.
Explanations:
- Output
- the output is a dataframe having two columns
- column 1 has the names of data variables
- column 2 has, for each variable, either the number of its unique values (when the variable is marked as not of interest) or the unique values themselves, one per row (when the variable is marked as of interest)
- Arguments
- ‘df’ is a dataframe which contains the data
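- ‘variable_type’ is a vector of 0s and 1s with one entry per column of ‘df’, in column order. 0 means we are not interested in that variable, so only its number of unique values is reported; 1 means we are interested in it, so its unique values are listed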
Example:
# a function for making simple data dictionaries --------------------------
# Build a simple data dictionary for a data frame.
#
# Arguments:
#   df            - a data frame (or anything as.data.frame() can convert).
#   variable_type - a vector of 0s and 1s, one entry per column of df, in
#                   column order. 0: report only the number of unique values
#                   of that column; 1: list the unique values themselves.
#
# Value:
#   A data frame with two character columns:
#   Var_name                  - the column name on the first row belonging to
#                               a variable, "" on its continuation rows.
#   Unique_n_or_unique_values - the number of unique values (type 0), or one
#                               unique value per row (type 1). A type-1
#                               variable with no observations gets a single
#                               NA row so that it still appears.
#
# Stops with an error if variable_type does not hold exactly one 0/1 entry
# per column of df.
data_dic_builder <- function(df, variable_type) {
  df <- as.data.frame(df) # make sure df is a dataframe
  n_vars <- ncol(df)

  if (length(variable_type) != n_vars) {
    stop("'variable_type' must have one entry per column of 'df'",
         call. = FALSE)
  }
  if (!all(variable_type %in% c(0, 1))) {
    stop("'variable_type' must contain only 0 or 1", call. = FALSE)
  }

  var_names <- names(df)
  list_values <- variable_type == 1

  # One character vector per variable for the second output column.
  # The branch is chosen from variable_type itself, so a variable of interest
  # with a single unique value shows that value rather than the count "1".
  value_blocks <- lapply(seq_len(n_vars), function(i) {
    uniq_values <- unique(df[[i]])
    if (!list_values[i]) {
      as.character(length(uniq_values))
    } else if (length(uniq_values) == 0L) {
      NA_character_
    } else {
      as.character(uniq_values)
    }
  })

  # The variable name goes on the first row of its block; the remaining rows
  # of the block are left blank.
  name_blocks <- lapply(seq_len(n_vars), function(i) {
    c(var_names[i], rep("", length(value_blocks[[i]]) - 1L))
  })

  # as.character() keeps both columns character even when df has no columns
  # (unlist() of an empty list is NULL).
  data.frame(
    Var_name = as.character(unlist(name_blocks, use.names = FALSE)),
    Unique_n_or_unique_values =
      as.character(unlist(value_blocks, use.names = FALSE)),
    stringsAsFactors = FALSE
  )
}
# a test example ----------------------------------------------------------
## create a fake data set
## (no seed is set, so the random draws, and with them the order in which the
## unique values are printed below, change from run to run)
color_set <- c("red", "green", "blue")
fk_data <- data.frame(x = rnorm(100),
                      type = sample(LETTERS[1:5], 100, replace = TRUE),
                      color = sample(color_set, 100, replace = TRUE),
                      y = runif(100))
## variable_type follows the column order x, type, color, y:
## x and y are flagged 0 (report only the number of unique values),
## type and color are flagged 1 (list the unique values)
my_dic <- data_dic_builder(df = fk_data, variable_type = c(0, 1, 1, 0))
print(my_dic)
## Var_name Unique_n_or_unique_values
## 1 x 100
## 2 type C
## 3 B
## 4 A
## 5 D
## 6 E
## 7 color blue
## 8 red
## 9 green
## 10 y 100