41.4 Error Identification

Catch anomalies early: impossible values, data entry glitches, or KPI mismatches.

  1. Logical constraints

# Flag rows that break basic sign constraints (none of these measures can be
# negative), then count the flagged rows per rule. NA comparisons are ignored
# by the count rather than treated as violations.
violations <- tx %>%
  mutate(
    v_sales_negative     = sales < 0,
    v_volume_negative    = volume < 0,
    v_listings_negative  = listings < 0,
    v_inventory_negative = inventory < 0
  ) %>%
  summarise(
    across(
      starts_with("v_"),
      function(flag) sum(flag, na.rm = TRUE)
    )
  )
violations
#> # A tibble: 1 × 4
#>   v_sales_negative v_volume_negative v_listings_negative v_inventory_negative
#>              <int>             <int>               <int>                <int>
#> 1                0                 0                   0                    0
  2. Price coherence: median vs. average price (avg_price = volume / sales)

Large gaps are not necessarily errors, but extreme ratios can flag data issues.

# Compare the reported median price with the implied average price
# (volume / sales) and summarise the distribution of their ratio.
coherence <- tx %>%
  mutate(
    # Average price is only defined when at least one sale was recorded.
    avg_price = ifelse(sales > 0, volume / sales, NA_real_),
    ratio = median / avg_price
  ) %>%
  # is.finite() is FALSE for NA, NaN and +/-Inf, so one test drops them all.
  filter(is.finite(ratio)) %>%
  summarise(
    n = n(),
    ratio_p01 = quantile(ratio, probs = 0.01),
    ratio_p99 = quantile(ratio, probs = 0.99),
    # Rows where median and average price differ by more than a factor of 4.
    extreme = sum(ratio > 4 | ratio < 0.25, na.rm = TRUE)
  )
coherence
#> # A tibble: 1 × 4
#>       n ratio_p01 ratio_p99 extreme
#>   <int>     <dbl>     <dbl>   <int>
#> 1  7985     0.682     0.992       0
  3. Outliers (univariate) with dlookr
# Per-variable outlier diagnostics for the numeric measures of interest.
out_summary <- tx %>%
  select(sales, volume, median, listings, inventory, avg_price) %>%
  diagnose_outlier()

# Show the variables with the most outliers first.
out_summary %>%
  arrange(desc(outliers_cnt)) %>%
  head(10)
#> # A tibble: 6 × 6
#>   variables outliers_cnt outliers_ratio outliers_mean    with_mean without_mean
#>   <chr>            <int>          <dbl>         <dbl>        <dbl>        <dbl>
#> 1 volume            1116          13.0    563924698.  106858621.    33125498.  
#> 2 sales              899          10.5         3057.        550.         234.  
#> 3 listings           742           8.63       17522.       3217.        1568.  
#> 4 inventory          437           5.08          20.8         7.17         6.29
#> 5 median             147           1.71      248251.     128131.      125879.  
#> 6 avg_price          143           1.66      300786.     153202.      150528.
# Full, human-readable data quality report (HTML) with dataReporter
# install.packages("dataReporter")
library(dataReporter)
# Request HTML explicitly: with the default `output = NULL`, makeDataReport()
# chooses the format from the system (PDF when LaTeX is installed, otherwise
# Word on Windows), so the result is not guaranteed to be HTML.
# The report is written to the working directory unless `file` is supplied.
makeDataReport(tx, output = "html")