Chapter 33 Reading Data

What You’ll Learn:

  • Reading CSV and text files
  • Excel files
  • Common import errors
  • Encoding issues
  • File path problems
  • Data type detection

Key Errors Covered: 25+ import errors

Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced

33.1 Introduction

Reading data is the first step in any analysis:

library(readr)
library(dplyr)

# Reading CSV
# data <- read_csv("file.csv")

But file reading has many pitfalls. Let’s master them!

33.2 Reading CSV Files

💡 Key Insight: Base R vs readr

# Base R
data1 <- read.csv("file.csv")
# - Converts strings to factors by default
# - Uses row names
# - Slower for large files

# readr (tidyverse)
data2 <- read_csv("file.csv")
# - Keeps strings as character
# - No row names
# - Much faster
# - Better type guessing
# - Progress bar

33.3 Error #1: cannot open file

⭐ BEGINNER 📁 PATH

33.3.1 The Error

data <- read_csv("nonexistent_file.csv")
#> Error: 'nonexistent_file.csv' does not exist in current working directory ('/Users/bioguo/Downloads/r_errors_book').

🔴 ERROR

Error: 'nonexistent_file.csv' does not exist in current working directory

33.3.2 What It Means

File not found - wrong path or filename.

33.3.3 Common Causes

# Wrong filename
# read_csv("daat.csv")  # Typo

# Wrong directory
# read_csv("data/file.csv")  # data/ doesn't exist

# Case sensitivity (Linux/Mac)
# read_csv("File.csv")  # file is actually "file.csv"

33.3.4 Solutions

SOLUTION 1: Check Working Directory

# Check current directory
getwd()
#> [1] "/Users/bioguo/Downloads/r_errors_book"

# List files
list.files()
#>   [1] "_bookdown_files"                    
#>   [2] "_bookdown.yml"                      
#>   [3] "_output.yml"                        
#>   [4] "{39..51}-chapter.Rmd"               
#>   [5] "01-installation-environment.Rmd"    
#>   [6] "02-working-directory-paths.Rmd"     
#>   [7] "03-objects-variables.Rmd"           
#>   [8] "04-type-mismatch.Rmd"               
#>   [9] "05-vectors-recycling.Rmd"           
#>  [10] "06-na-null-nan-inf.Rmd"             
#>  [11] "07-subscript-dimension.Rmd"         
#>  [12] "08-dollar-double-bracket.Rmd"       
#>  [13] "09-dataframe-construction.Rmd"      
#>  [14] "10-column-manipulation.Rmd"         
#>  [15] "11-matrix-operations.Rmd"           
#>  [16] "12-factor-creation-levels.Rmd"      
#>  [17] "13-factor-operations.Rmd"           
#>  [18] "14-string-basics.Rmd"               
#>  [19] "15-string-pattern-matching.Rmd"     
#>  [20] "16-function-creation.Rmd"           
#>  [21] "17-scoping-environments.Rmd"        
#>  [22] "18-control-flow.Rmd"                
#>  [23] "19-error-handling.Rmd"              
#>  [24] "20-dplyr-intro.Rmd"                 
#>  [25] "21-dplyr-grouping.Rmd"              
#>  [26] "22-dplyr-joins.Rmd"                 
#>  [27] "23-tidyr-intro.Rmd"                 
#>  [28] "24-dplyr-advanced.Rmd"              
#>  [29] "25-dplyr-best-practices.Rmd"        
#>  [30] "26-ggplot2-intro.Rmd"               
#>  [31] "27-ggplot2-advanced.Rmd"            
#>  [32] "28-ggplot2-extensions.Rmd"          
#>  [33] "29-ggplot2-troubleshooting.Rmd"     
#>  [34] "30-statistical-tests.Rmd"           
#>  [35] "31-regression-anova.Rmd"            
#>  [36] "32-stats-best-practices.Rmd"        
#>  [37] "33-reading-data.Rmd"                
#>  [38] "34-writing-data.Rmd"                
#>  [39] "35-database-connections.Rmd"        
#>  [40] "36-apply-family.Rmd"                
#>  [41] "37-purrr-package.Rmd"               
#>  [42] "38-iteration-best-practices.Rmd"    
#>  [43] "39-package-basics.Rmd"              
#>  [44] "40-documentation-testing.Rmd"       
#>  [45] "404.html"                           
#>  [46] "41-package-best-practices.Rmd"      
#>  [47] "42-profiling.Rmd"                   
#>  [48] "43-optimization.Rmd"                
#>  [49] "44-memory.Rmd"                      
#>  [50] "45-parallel.Rmd"                    
#>  [51] "46-advanced-databases.Rmd"          
#>  [52] "47-web-scraping.Rmd"                
#>  [53] "48-shiny-basics.Rmd"                
#>  [54] "49-advanced-programming.Rmd"        
#>  [55] "50-production-code.Rmd"             
#>  [56] "51-final-best-practices.Rmd"        
#>  [57] "52-s3-oop.Rmd"                      
#>  [58] "53-r6-classes.Rmd"                  
#>  [59] "53-r6-s4-oop.Rmd"                   
#>  [60] "54-nse-metaprogramming.Rmd"         
#>  [61] "54-s4-system.Rmd"                   
#>  [62] "55-metaprogramming.Rmd"             
#>  [63] "55-rcpp-basics.Rmd"                 
#>  [64] "56-function-factories.Rmd"          
#>  [65] "57-rcpp-integration.Rmd"            
#>  [66] "appendix-A-error-dictionary.Rmd"    
#>  [67] "book.bib"                           
#>  [68] "build_pdf.R"                        
#>  [69] "build.R"                            
#>  [70] "chapter-template.Rmd"               
#>  [71] "column-manipulation.html"           
#>  [72] "control-flow.html"                  
#>  [73] "dataframe-construction.html"        
#>  [74] "docs"                               
#>  [75] "dollar-double-bracket.html"         
#>  [76] "error-handling.html"                
#>  [77] "factor-creation-levels.html"        
#>  [78] "factor-operations.html"             
#>  [79] "function-creation.html"             
#>  [80] "images"                             
#>  [81] "index.html"                         
#>  [82] "index.Rmd"                          
#>  [83] "installation-environment.html"      
#>  [84] "manifest.json"                      
#>  [85] "mastering-r-through-errors_files"   
#>  [86] "mastering-r-through-errors.html"    
#>  [87] "mastering-r-through-errors.log"     
#>  [88] "mastering-r-through-errors.Rmd"     
#>  [89] "mastering-r-through-errors.tex"     
#>  [90] "matrix-operations.html"             
#>  [91] "na-null-nan-inf.html"               
#>  [92] "objects-variables.html"             
#>  [93] "packages.bib"                       
#>  [94] "part-i-foundation-environment.html" 
#>  [95] "part-ii-data-types-coercion.html"   
#>  [96] "part-iii-indexing-subsetting.html"  
#>  [97] "part-iv-data-frames-matrices.html"  
#>  [98] "part-v-factors.html"                
#>  [99] "part-vi-strings-characters.html"    
#> [100] "part-vii-functions-programming.html"
#>  [ reached 'max' / getOption("max.print") -- omitted 15 entries ]
list.files(pattern = "\\.csv$")
#> character(0)

# Check if file exists
file.exists("mtcars.csv")
#> [1] FALSE

# Create example file for demonstration
write_csv(mtcars, "mtcars.csv")

# Now read it
if (file.exists("mtcars.csv")) {
  data <- read_csv("mtcars.csv", show_col_types = FALSE)
  cat("Successfully read", nrow(data), "rows\n")
}
#> Successfully read 32 rows

SOLUTION 2: Use Full Paths

# Absolute path (Windows)
data <- read_csv("C:/Users/username/Documents/data.csv")

# Absolute path (Mac/Linux)
data <- read_csv("/home/username/data/file.csv")

# Relative from project root (best with RStudio projects)
data <- read_csv("data/file.csv")

# Use here package for project-relative paths
library(here)
data <- read_csv(here("data", "file.csv"))

SOLUTION 3: Interactive File Selection

# Choose file interactively
file_path <- file.choose()
data <- read_csv(file_path)

# Or with safe checking
safe_read_csv <- function(path = NULL) {
  # Read a CSV with a safety net: if no path is supplied, let the user
  # pick one interactively; otherwise verify the file exists before
  # handing it to read_csv().
  #
  # path : optional path to a CSV file (NULL opens a file chooser)
  # Returns the tibble produced by readr::read_csv().

  # Resolve the target, falling back to the interactive chooser.
  target <- if (is.null(path)) file.choose() else path

  # Fail fast with a clear message instead of a cryptic read error.
  if (!file.exists(target)) {
    stop("File does not exist: ", target)
  }

  read_csv(target)
}

33.4 Error #2: Encoding Issues

⭐⭐ INTERMEDIATE 🔤 ENCODING

33.4.1 The Problem

# File with special characters appears garbled
data <- read_csv("international_data.csv")
# Names show as: "Müller" → "MÃ¼ller"

33.4.2 What Happened

Wrong character encoding - file is in different encoding than assumed.

33.4.3 Solutions

SOLUTION 1: Specify Encoding

# Common encodings
data <- read_csv("file.csv", locale = locale(encoding = "UTF-8"))
data <- read_csv("file.csv", locale = locale(encoding = "latin1"))
data <- read_csv("file.csv", locale = locale(encoding = "Windows-1252"))

# Guess encoding
library(readr)
guess_encoding("file.csv")

# Use guessed encoding
encoding <- guess_encoding("file.csv")$encoding[1]
data <- read_csv("file.csv", locale = locale(encoding = encoding))

SOLUTION 2: Handle in readr

# readr handles UTF-8 by default
data <- read_csv("file.csv")

# For other encodings
data <- read_csv("file.csv", 
                 locale = locale(encoding = "ISO-8859-1"))

33.5 Column Type Problems

💡 Key Insight: Type Detection

# Create sample data
sample_data <- tibble(
  id = 1:5,
  value = c("1.5", "2.3", "3.7", "NA", "5.2"),
  date = c("2024-01-01", "2024-01-02", "2024-01-03", 
           "2024-01-04", "2024-01-05")
)

write_csv(sample_data, "sample.csv")

# Read with automatic type detection
data1 <- read_csv("sample.csv", show_col_types = FALSE)
str(data1)
#> spc_tbl_ [5 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#>  $ id   : num [1:5] 1 2 3 4 5
#>  $ value: num [1:5] 1.5 2.3 3.7 NA 5.2
#>  $ date : Date[1:5], format: "2024-01-01" "2024-01-02" ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   id = col_double(),
#>   ..   value = col_double(),
#>   ..   date = col_date(format = "")
#>   .. )
#>  - attr(*, "problems")=<externalptr>

# Specify column types
data2 <- read_csv("sample.csv",
                  col_types = cols(
                    id = col_integer(),
                    value = col_double(),
                    date = col_date()
                  ))
str(data2)
#> spc_tbl_ [5 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#>  $ id   : int [1:5] 1 2 3 4 5
#>  $ value: num [1:5] 1.5 2.3 3.7 NA 5.2
#>  $ date : Date[1:5], format: "2024-01-01" "2024-01-02" ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   id = col_integer(),
#>   ..   value = col_double(),
#>   ..   date = col_date(format = "")
#>   .. )
#>  - attr(*, "problems")=<externalptr>

# Shorthand
data3 <- read_csv("sample.csv",
                  col_types = "iDd")  # integer, double, date
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)

33.6 Error #3: One or more parsing failures

⭐⭐ INTERMEDIATE 🔧 PARSE

33.6.1 The Warning

# Create problematic data
bad_data <- tibble(
  id = 1:5,
  value = c("1.5", "2.3", "not_a_number", "4.1", "5.2")
)

write_csv(bad_data, "bad_data.csv")

# Read with type specification
data <- read_csv("bad_data.csv", 
                 col_types = cols(value = col_double()),
                 show_col_types = FALSE)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)

⚠️ WARNING

Warning: One or more parsing failures

33.6.2 What It Means

Data doesn’t match expected type.

33.6.3 Solutions

SOLUTION 1: Check Problems

# See what failed
problems(data)
#> # A tibble: 1 × 5
#>     row   col expected actual       file                                        
#>   <int> <int> <chr>    <chr>        <chr>                                       
#> 1     4     2 a double not_a_number /Users/bioguo/Downloads/r_errors_book/bad_d…

# Read as character first
data_char <- read_csv("bad_data.csv", 
                      col_types = cols(value = col_character()))

# Manually handle conversion
data_clean <- data_char %>%
  mutate(value_num = as.numeric(value),
         value_clean = if_else(is.na(value_num), 0, value_num))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `value_num = as.numeric(value)`.
#> Caused by warning:
#> ! NAs introduced by coercion

head(data_clean)
#> # A tibble: 5 × 4
#>      id value        value_num value_clean
#>   <dbl> <chr>            <dbl>       <dbl>
#> 1     1 1.5                1.5         1.5
#> 2     2 2.3                2.3         2.3
#> 3     3 not_a_number      NA           0  
#> 4     4 4.1                4.1         4.1
#> 5     5 5.2                5.2         5.2

SOLUTION 2: Use col_guess()

# Let readr guess
data_guess <- read_csv("bad_data.csv", show_col_types = FALSE)

# Check what it guessed
spec(data_guess)
#> cols(
#>   id = col_double(),
#>   value = col_character()
#> )

33.7 Reading Excel Files

💡 Key Insight: readxl Package

library(readxl)

# Create example Excel file
library(writexl)
write_xlsx(list(Sheet1 = mtcars), "example.xlsx")

# Read Excel
data <- read_excel("example.xlsx")
head(data)
#> # A tibble: 6 × 12
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb cyl_factor
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>     
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4 6         
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4 6         
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1 4         
#> 4  21.4     6   258   110  3.08  3.22  19.4     1     0     3     1 6         
#> 5  18.7     8   360   175  3.15  3.44  17.0     0     0     3     2 8         
#> 6  18.1     6   225   105  2.76  3.46  20.2     1     0     3     1 6

# Specify sheet
data <- read_excel("example.xlsx", sheet = "Sheet1")
# Or by number
data <- read_excel("example.xlsx", sheet = 1)

# List sheets
excel_sheets("example.xlsx")
#> [1] "Sheet1"

# Read specific range
data <- read_excel("example.xlsx", range = "A1:D10")

# Skip rows
data <- read_excel("example.xlsx", skip = 2)

# Specify column types
data <- read_excel("example.xlsx",
                   col_types = c("numeric", "text", "date"))
#> Error: Sheet 1 has 12 columns, but `col_types` has length 3.

33.8 Error #4: Expecting numeric

⭐⭐ INTERMEDIATE 🔢 TYPE

33.8.1 The Error

# Excel file with mixed types in column
data <- read_excel("mixed_types.xlsx",
                   col_types = "numeric")

⚠️ WARNING

Warning: Expecting numeric in A2 / R2C1: got 'text'

33.8.2 Solutions

SOLUTION: Read as Text First

# Read everything as text
data <- read_excel("file.xlsx", col_types = "text")

# Then convert as needed
data_clean <- data %>%
  mutate(
    numeric_col = as.numeric(numeric_col),
    date_col = as.Date(date_col)
  )

33.9 File Paths Best Practices

🎯 Best Practice: Portable File Paths

# ❌ Bad: Absolute paths
read_csv("C:/Users/John/Documents/data.csv")

# ❌ Bad: Backslashes (Windows only)
read_csv("data\\file.csv")

# ✅ Good: Forward slashes (cross-platform)
read_csv("data/file.csv")

# ✅ Better: here package
library(here)
read_csv(here("data", "file.csv"))

# ✅ Best: RStudio projects + here
# In project root:
read_csv(here("data", "raw", "file.csv"))

# Check paths
here()                           # Project root
here("data", "file.csv")        # Build path
file.exists(here("data"))       # Check if exists

33.10 Reading Other Formats

💡 Key Insight: Format-Specific Packages

# SPSS, SAS, Stata
library(haven)
data <- read_spss("file.sav")
data <- read_sas("file.sas7bdat")
data <- read_stata("file.dta")

# JSON
library(jsonlite)
data <- fromJSON("file.json")

# XML
library(xml2)
doc <- read_xml("file.xml")

# Fixed-width files
data <- read_fwf("file.txt",
                 fwf_widths(c(10, 8, 5)))

# Delimited by other characters
data <- read_delim("file.txt", delim = "|")
data <- read_tsv("file.txt")  # Tab-separated

33.11 Large Files

🎯 Best Practice: Handling Large Files

# Read in chunks
library(readr)

# Specify rows to read
data_sample <- read_csv("large_file.csv", n_max = 1000)

# Read columns selectively
data <- read_csv("large_file.csv",
                 col_select = c(id, value, date))

# Use data.table for speed
library(data.table)
data <- fread("large_file.csv")

# Use vroom for very large files
library(vroom)
data <- vroom("large_file.csv")

# Read in chunks with callback
library(readr)
chunk_size <- 10000
read_csv_chunked("large_file.csv",
                 callback = DataFrameCallback$new(function(chunk, pos) {
                   # Process chunk
                   processed <- chunk %>% filter(value > 0)
                   # Return to accumulate
                   processed
                 }),
                 chunk_size = chunk_size)

33.12 Summary

Key Takeaways:

  1. Check file exists - Use file.exists()
  2. Use readr, not base R - Faster and better
  3. Specify encoding - When dealing with international characters
  4. Check column types - Use col_types or read problems()
  5. Use here package - For portable paths
  6. Handle large files - Read in chunks or use data.table
  7. Excel with readxl - Specify sheets and ranges

Quick Reference:

Error Cause Fix
cannot open file Wrong path Check getwd(), use here()
Encoding garbled Wrong encoding Use locale(encoding = …)
Parsing failures Type mismatch Check problems(), read as text
Sheet not found Wrong sheet name Use excel_sheets()

Reading Functions:

# CSV/text
read_csv("file.csv")              # readr
read_csv2("file.csv")             # European format (;)
read_tsv("file.txt")              # Tab-separated
read_delim("file.txt", "|")       # Custom delimiter

# Excel
library(readxl)
read_excel("file.xlsx")
read_excel("file.xlsx", sheet = 2)
read_excel("file.xlsx", range = "A1:D10")

# Other formats
library(haven)
read_spss("file.sav")
read_sas("file.sas7bdat")
read_stata("file.dta")

# JSON
library(jsonlite)
fromJSON("file.json")

Best Practices:

# ✅ Good
Use here() for paths
Specify col_types for reliability
Check file.exists() before reading
Use read_csv() not read.csv()
Handle encoding explicitly
Read large files in chunks

# ❌ Avoid
Absolute paths
Assuming default encoding
Ignoring parsing warnings
Reading entire large files
Using read.csv() for new code

33.13 Exercises

📝 Exercise 1: Safe File Reader

Write safe_read_csv() that: 1. Checks if file exists 2. Handles errors gracefully 3. Reports file info 4. Returns data or informative error

📝 Exercise 2: Excel Multi-Sheet Reader

Write function to: 1. Read all sheets from Excel file 2. Return named list of data frames 3. Handle empty sheets 4. Report sheet names and dimensions

📝 Exercise 3: Data Inspector

Create inspect_file() that: 1. Checks encoding 2. Detects delimiter 3. Previews first few rows 4. Reports column types 5. Identifies problems

33.14 Exercise Answers

Click to see answers

Exercise 1:

safe_read_csv <- function(path, ...) {
  # Read a CSV defensively: verify the file exists, report its
  # metadata, then read it and summarise what came back.
  #
  # path : path to the CSV file
  # ...  : forwarded to readr::read_csv()
  # Returns the parsed tibble; stops with an informative message on
  # failure, and warns (printing details) if parsing issues are found.

  # Guard clause: refuse to continue without a real file.
  if (!file.exists(path)) {
    stop("File does not exist: ", path)
  }

  # Report basic metadata before attempting the read.
  meta <- file.info(path)
  cat("File:", basename(path), "\n")
  cat("Size:", round(meta$size / 1024, 2), "KB\n")
  cat("Modified:", format(meta$mtime, "%Y-%m-%d %H:%M"), "\n\n")

  tryCatch({
    parsed <- read_csv(path, ..., show_col_types = FALSE)

    cat("Successfully read:\n")
    cat("  Rows:", nrow(parsed), "\n")
    cat("  Columns:", ncol(parsed), "\n")
    cat("  Column names:", paste(names(parsed), collapse = ", "), "\n")

    # Surface any parsing issues readr recorded on the tibble.
    issues <- problems(parsed)
    if (nrow(issues) > 0) {
      warning("Found ", nrow(issues), " parsing issues")
      print(issues)
    }

    parsed
  }, error = function(e) {
    # Re-raise with context so the caller sees which step failed.
    stop("Failed to read file: ", e$message)
  })
}

# Test
write_csv(mtcars, "test.csv")
data <- safe_read_csv("test.csv")
#> File: test.csv 
#> Size: 1.32 KB
#> Modified: 2025-10-26 15:47 
#> 
#> Successfully read:
#>   Rows: 32 
#>   Columns: 12 
#>   Column names: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor

Exercise 2:

library(readxl)

read_all_sheets <- function(path, ...) {
  # Read every sheet of an Excel workbook into a named list of data
  # frames, skipping empty or unreadable sheets and reporting progress.
  #
  # path : path to an .xlsx/.xls file
  # ...  : forwarded to readxl::read_excel()
  # Returns a named list with one element per non-empty, readable sheet.

  # Check file exists
  if (!file.exists(path)) {
    stop("File does not exist: ", path)
  }

  # Get sheet names
  sheets <- excel_sheets(path)

  cat("Reading Excel file:", basename(path), "\n")
  cat("Sheets found:", length(sheets), "\n\n")

  # Read all sheets
  result <- list()

  for (sheet in sheets) {
    cat("Reading sheet:", sheet, "... ")

    # Capture tryCatch's VALUE rather than assigning to `result` inside
    # the error handler: `result[[sheet]] <- NULL` in a handler function
    # only creates a local copy of `result` in the handler's own
    # environment and never touches the outer list (a latent scoping bug
    # in the original).
    sheet_data <- tryCatch({
      data <- read_excel(path, sheet = sheet, ...)

      # Check if empty
      if (nrow(data) == 0) {
        cat("EMPTY\n")
        NULL
      } else {
        cat("OK (", nrow(data), " rows, ", ncol(data), " cols)\n", sep = "")
        data
      }
    }, error = function(e) {
      cat("ERROR:", e$message, "\n")
      NULL
    })

    # Assigning NULL to a list element drops it, so failed/empty sheets
    # are simply omitted from the result.
    result[[sheet]] <- sheet_data
  }

  cat("\nSuccessfully read", length(result), "sheets\n")

  result
}

# Test
library(writexl)
write_xlsx(list(
  Cars = mtcars[1:10, ],
  Iris = iris[1:10, ],
  Empty = data.frame()
), "multi_sheet.xlsx")

all_data <- read_all_sheets("multi_sheet.xlsx")
#> Reading Excel file: multi_sheet.xlsx 
#> Sheets found: 3 
#> 
#> Reading sheet: Cars ... OK (10 rows, 12 cols)
#> Reading sheet: Iris ... OK (10 rows, 5 cols)
#> Reading sheet: Empty ... EMPTY
#> 
#> Successfully read 2 sheets
names(all_data)
#> [1] "Cars" "Iris"

Exercise 3:

inspect_file <- function(path, n_preview = 5) {
  # Inspect a delimited text file before committing to a full read:
  # report file metadata, guess the encoding, detect the delimiter,
  # preview the first lines, then trial-read up to 100 rows and report
  # column types and any parsing problems.
  #
  # path      : path to the file to inspect
  # n_preview : number of raw lines to show in the preview
  # Invisibly returns the trial-read tibble.

  cat("=== File Inspection ===\n\n")

  # Check exists
  if (!file.exists(path)) {
    stop("File does not exist: ", path)
  }

  # File info
  info <- file.info(path)
  cat("File:", path, "\n")
  cat("Size:", round(info$size / 1024, 2), "KB\n")
  cat("Modified:", format(info$mtime, "%Y-%m-%d %H:%M"), "\n\n")

  # Guess encoding (readr samples up to n_max lines)
  cat("Encoding:\n")
  enc <- guess_encoding(path, n_max = 1000)
  print(enc)
  cat("\n")

  # Try to detect structure
  cat("Structure Detection:\n")

  # Read first few lines
  lines <- readLines(path, n = 10)

  # Detect delimiter by counting OCCURRENCES of each candidate in the
  # header line. The original used sum(grepl(...)), which on a single
  # line only yields 0 or 1 per delimiter, so which.max() almost always
  # tied and fell back to the first candidate (comma).
  delims <- c(",", ";", "\t", "|")
  delim_counts <- vapply(delims, function(d) {
    hits <- gregexpr(d, lines[1], fixed = TRUE)[[1]]
    # gregexpr returns -1 when there is no match
    if (hits[1] == -1L) 0L else length(hits)
  }, integer(1))

  likely_delim <- delims[which.max(delim_counts)]
  cat("Likely delimiter:", 
      switch(likely_delim,
             "," = "comma",
             ";" = "semicolon",
             "\t" = "tab",
             "|" = "pipe"), "\n\n")

  # Preview
  cat("Preview (first", n_preview, "lines):\n")
  cat(paste(head(lines, n_preview), collapse = "\n"), "\n\n")

  # Trial read: cap at 100 rows so large files stay cheap to inspect
  cat("Reading data...\n")
  data <- read_csv(path, show_col_types = FALSE, n_max = 100)

  cat("Dimensions:", nrow(data), "rows (preview) x", ncol(data), "columns\n\n")

  cat("Column types:\n")
  print(spec(data))

  # Check for problems readr recorded during parsing
  probs <- problems(data)
  if (nrow(probs) > 0) {
    cat("\nProblems found:\n")
    print(probs)
  } else {
    cat("\nNo parsing problems detected\n")
  }

  invisible(data)
}

# Test
write_csv(mtcars, "inspect_test.csv")
inspect_file("inspect_test.csv")
#> === File Inspection ===
#> 
#> File: inspect_test.csv 
#> Size: 1.32 KB
#> Modified: 2025-10-26 15:47 
#> 
#> Encoding:
#> # A tibble: 1 × 2
#>   encoding confidence
#>   <chr>         <dbl>
#> 1 ASCII             1
#> 
#> Structure Detection:
#> Likely delimiter: comma 
#> 
#> Preview (first 5 lines):
#> mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,cyl_factor
#> 21,6,160,110,3.9,2.62,16.46,0,1,4,4,6
#> 21,6,160,110,3.9,2.875,17.02,0,1,4,4,6
#> 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1,4
#> 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1,6 
#> 
#> Reading data...
#> Dimensions: 32 rows (preview) x 12 columns
#> 
#> Column types:
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double(),
#>   cyl_factor = col_double()
#> )
#> 
#> No parsing problems detected

33.15 Cleanup