41.4 Error Identification
Catch anomalies early: impossible values, data entry glitches, or KPI mismatches.
- Logical constraints
# Flag negative values in the count/amount columns, then count the flagged
# rows per column. Each v_* column is TRUE where the value is below zero;
# summing the flags (NAs ignored) gives the number of violations.
violations <- tx %>%
  mutate(
    v_sales_negative     = sales < 0,
    v_volume_negative    = volume < 0,
    v_listings_negative  = listings < 0,
    v_inventory_negative = inventory < 0
  ) %>%
  summarise(
    across(starts_with("v_"), function(flag) sum(flag, na.rm = TRUE))
  )

violations
#> # A tibble: 1 × 4
#> v_sales_negative v_volume_negative v_listings_negative v_inventory_negative
#> <int> <int> <int> <int>
#> 1 0 0 0 0
- Price coherence: median vs. average price (avg_price = volume/sales)
Large gaps are not necessarily errors, but extreme ratios can flag data issues.
# Compare the reported median price with the implied average price
# (volume / sales) and summarise the spread of their ratio.
coherence <- tx %>%
  mutate(
    # Only divide where sales are positive; otherwise leave the average NA.
    avg_price = ifelse(sales > 0, volume / sales, NA_real_),
    ratio     = median / avg_price
  ) %>%
  # is.finite() is FALSE for NA, NaN and +/-Inf, so one test drops them all.
  filter(is.finite(ratio)) %>%
  summarise(
    n         = n(),
    ratio_p01 = quantile(ratio, 0.01),
    ratio_p99 = quantile(ratio, 0.99),
    # Rows where median and average differ by more than a factor of four.
    extreme   = sum(ratio < 0.25 | ratio > 4)
  )

coherence
#> # A tibble: 1 × 4
#> n ratio_p01 ratio_p99 extreme
#> <int> <dbl> <dbl> <int>
#> 1 7985 0.682 0.992 0
- Outliers (univariate) with dlookr
# Univariate outlier diagnostics for the numeric measures.
# NOTE(review): `avg_price` must already be a column of `tx` at this point,
# otherwise select() fails — confirm it is added upstream.
out_summary <- diagnose_outlier(
  select(tx, sales, volume, median, listings, inventory, avg_price)
)

# Show the variables with the most flagged observations first.
out_summary %>%
  arrange(desc(outliers_cnt)) %>%
  head(10)
#> # A tibble: 6 × 6
#> variables outliers_cnt outliers_ratio outliers_mean with_mean without_mean
#> <chr> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 volume 1116 13.0 563924698. 106858621. 33125498.
#> 2 sales 899 10.5 3057. 550. 234.
#> 3 listings 742 8.63 17522. 3217. 1568.
#> 4 inventory 437 5.08 20.8 7.17 6.29
#> 5 median 147 1.71 248251. 128131. 125879.
#> 6 avg_price 143 1.66 300786. 153202. 150528.
# Full, human-readable data quality report (HTML) with dataReporter
# install.packages("dataReporter")
library(dataReporter)

# Request HTML explicitly: when `output` is left unset, makeDataReport() picks
# the format from what is installed (PDF if LaTeX is available), so the result
# is not guaranteed to be HTML. The report files are written to the working
# directory; by default the call stops rather than overwrite an existing
# report (pass `replace = TRUE` to regenerate).
makeDataReport(tx, output = "html")