Chapter 33 Reading Data
What You’ll Learn:
- Reading CSV and text files
- Excel files
- Common import errors
- Encoding issues
- File path problems
- Data type detection
Key Errors Covered: 25+ import errors
Difficulty: ⭐⭐ Intermediate to ⭐⭐⭐ Advanced
33.1 Introduction
Reading data is the first step in any analysis:
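For example, a typical session starts by pulling a CSV into a data frame with readr (the file name here is hypothetical):
library(readr)
data <- read_csv("data/measurements.csv", show_col_types = FALSE)
head(data)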
But file reading has many pitfalls. Let’s master them!
33.3 Error #1: cannot open file
⭐ BEGINNER 📁 PATH
33.3.1 The Error
data <- read_csv("nonexistent_file.csv")
#> Error: 'nonexistent_file.csv' does not exist in current working directory ('/Users/bioguo/Downloads/r_errors_book').
🔴 ERROR
Error: 'nonexistent_file.csv' does not exist in current working directory
33.3.4 Solutions
✅ SOLUTION 1: Check Working Directory
# Check current directory
getwd()
#> [1] "/Users/bioguo/Downloads/r_errors_book"
# List files
list.files()
#> [1] "_bookdown_files"
#> [2] "_bookdown.yml"
#> [3] "_output.yml"
#> [4] "{39..51}-chapter.Rmd"
#> [5] "01-installation-environment.Rmd"
#> [6] "02-working-directory-paths.Rmd"
#> [7] "03-objects-variables.Rmd"
#> [8] "04-type-mismatch.Rmd"
#> [9] "05-vectors-recycling.Rmd"
#> [10] "06-na-null-nan-inf.Rmd"
#> [11] "07-subscript-dimension.Rmd"
#> [12] "08-dollar-double-bracket.Rmd"
#> [13] "09-dataframe-construction.Rmd"
#> [14] "10-column-manipulation.Rmd"
#> [15] "11-matrix-operations.Rmd"
#> [16] "12-factor-creation-levels.Rmd"
#> [17] "13-factor-operations.Rmd"
#> [18] "14-string-basics.Rmd"
#> [19] "15-string-pattern-matching.Rmd"
#> [20] "16-function-creation.Rmd"
#> [21] "17-scoping-environments.Rmd"
#> [22] "18-control-flow.Rmd"
#> [23] "19-error-handling.Rmd"
#> [24] "20-dplyr-intro.Rmd"
#> [25] "21-dplyr-grouping.Rmd"
#> [26] "22-dplyr-joins.Rmd"
#> [27] "23-tidyr-intro.Rmd"
#> [28] "24-dplyr-advanced.Rmd"
#> [29] "25-dplyr-best-practices.Rmd"
#> [30] "26-ggplot2-intro.Rmd"
#> [31] "27-ggplot2-advanced.Rmd"
#> [32] "28-ggplot2-extensions.Rmd"
#> [33] "29-ggplot2-troubleshooting.Rmd"
#> [34] "30-statistical-tests.Rmd"
#> [35] "31-regression-anova.Rmd"
#> [36] "32-stats-best-practices.Rmd"
#> [37] "33-reading-data.Rmd"
#> [38] "34-writing-data.Rmd"
#> [39] "35-database-connections.Rmd"
#> [40] "36-apply-family.Rmd"
#> [41] "37-purrr-package.Rmd"
#> [42] "38-iteration-best-practices.Rmd"
#> [43] "39-package-basics.Rmd"
#> [44] "40-documentation-testing.Rmd"
#> [45] "404.html"
#> [46] "41-package-best-practices.Rmd"
#> [47] "42-profiling.Rmd"
#> [48] "43-optimization.Rmd"
#> [49] "44-memory.Rmd"
#> [50] "45-parallel.Rmd"
#> [51] "46-advanced-databases.Rmd"
#> [52] "47-web-scraping.Rmd"
#> [53] "48-shiny-basics.Rmd"
#> [54] "49-advanced-programming.Rmd"
#> [55] "50-production-code.Rmd"
#> [56] "51-final-best-practices.Rmd"
#> [57] "52-s3-oop.Rmd"
#> [58] "53-r6-classes.Rmd"
#> [59] "53-r6-s4-oop.Rmd"
#> [60] "54-nse-metaprogramming.Rmd"
#> [61] "54-s4-system.Rmd"
#> [62] "55-metaprogramming.Rmd"
#> [63] "55-rcpp-basics.Rmd"
#> [64] "56-function-factories.Rmd"
#> [65] "57-rcpp-integration.Rmd"
#> [66] "appendix-A-error-dictionary.Rmd"
#> [67] "book.bib"
#> [68] "build_pdf.R"
#> [69] "build.R"
#> [70] "chapter-template.Rmd"
#> [71] "column-manipulation.html"
#> [72] "control-flow.html"
#> [73] "dataframe-construction.html"
#> [74] "docs"
#> [75] "dollar-double-bracket.html"
#> [76] "error-handling.html"
#> [77] "factor-creation-levels.html"
#> [78] "factor-operations.html"
#> [79] "function-creation.html"
#> [80] "images"
#> [81] "index.html"
#> [82] "index.Rmd"
#> [83] "installation-environment.html"
#> [84] "manifest.json"
#> [85] "mastering-r-through-errors_files"
#> [86] "mastering-r-through-errors.html"
#> [87] "mastering-r-through-errors.log"
#> [88] "mastering-r-through-errors.Rmd"
#> [89] "mastering-r-through-errors.tex"
#> [90] "matrix-operations.html"
#> [91] "na-null-nan-inf.html"
#> [92] "objects-variables.html"
#> [93] "packages.bib"
#> [94] "part-i-foundation-environment.html"
#> [95] "part-ii-data-types-coercion.html"
#> [96] "part-iii-indexing-subsetting.html"
#> [97] "part-iv-data-frames-matrices.html"
#> [98] "part-v-factors.html"
#> [99] "part-vi-strings-characters.html"
#> [100] "part-vii-functions-programming.html"
#> [ reached 'max' / getOption("max.print") -- omitted 15 entries ]
list.files(pattern = "\\.csv$")
#> character(0)
# Check if file exists
file.exists("mtcars.csv")
#> [1] FALSE
# Create example file for demonstration
write_csv(mtcars, "mtcars.csv")
# Now read it
if (file.exists("mtcars.csv")) {
data <- read_csv("mtcars.csv", show_col_types = FALSE)
cat("Successfully read", nrow(data), "rows\n")
}
#> Successfully read 32 rows
✅ SOLUTION 2: Use Full Paths
# Absolute path (Windows)
data <- read_csv("C:/Users/username/Documents/data.csv")
# Absolute path (Mac/Linux)
data <- read_csv("/home/username/data/file.csv")
# Relative from project root (best with RStudio projects)
data <- read_csv("data/file.csv")
# Use here package for project-relative paths
library(here)
data <- read_csv(here("data", "file.csv"))
✅ SOLUTION 3: Interactive File Selection
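Base R's file.choose() opens a system file-picker dialog. It is handy for quick interactive exploration, but the chosen path is not recorded anywhere, so avoid it in scripts you need to re-run:
# Opens an interactive dialog (interactive sessions only)
path <- file.choose()
data <- read_csv(path, show_col_types = FALSE)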
33.4 Error #2: Encoding Issues
⭐⭐ INTERMEDIATE 🔤 ENCODING
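33.4.1 The Error
A wrong encoding usually shows up as mojibake ("José" turning into "JosÃ©" or similar) or as warnings about invalid characters. A minimal sketch that reproduces the mismatch (the demo file name is hypothetical):
# Write a Latin-1 encoded file
con <- file("latin1_demo.csv", open = "w", encoding = "latin1")
writeLines(c("name", "Jos\u00e9"), con)
close(con)
# Reading with the default UTF-8 locale garbles (or flags) the accented
# character; declaring the true encoding, as shown below, fixes it
data <- read_csv("latin1_demo.csv", show_col_types = FALSE)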
33.4.3 Solutions
✅ SOLUTION 1: Specify Encoding
# Common encodings
data <- read_csv("file.csv", locale = locale(encoding = "UTF-8"))
data <- read_csv("file.csv", locale = locale(encoding = "latin1"))
data <- read_csv("file.csv", locale = locale(encoding = "Windows-1252"))
# Guess encoding
library(readr)
guess_encoding("file.csv")
# Use guessed encoding
encoding <- guess_encoding("file.csv")$encoding[1]
data <- read_csv("file.csv", locale = locale(encoding = encoding))33.5 Column Type Problems
💡 Key Insight: Type Detection
# Create sample data
sample_data <- tibble(
id = 1:5,
value = c("1.5", "2.3", "3.7", "NA", "5.2"),
date = c("2024-01-01", "2024-01-02", "2024-01-03",
"2024-01-04", "2024-01-05")
)
write_csv(sample_data, "sample.csv")
# Read with automatic type detection
data1 <- read_csv("sample.csv", show_col_types = FALSE)
str(data1)
#> spc_tbl_ [5 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ id : num [1:5] 1 2 3 4 5
#> $ value: num [1:5] 1.5 2.3 3.7 NA 5.2
#> $ date : Date[1:5], format: "2024-01-01" "2024-01-02" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. id = col_double(),
#> .. value = col_double(),
#> .. date = col_date(format = "")
#> .. )
#> - attr(*, "problems")=<externalptr>
# Specify column types
data2 <- read_csv("sample.csv",
col_types = cols(
id = col_integer(),
value = col_double(),
date = col_date()
))
str(data2)
#> spc_tbl_ [5 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ id : int [1:5] 1 2 3 4 5
#> $ value: num [1:5] 1.5 2.3 3.7 NA 5.2
#> $ date : Date[1:5], format: "2024-01-01" "2024-01-02" ...
#> - attr(*, "spec")=
#> .. cols(
#> .. id = col_integer(),
#> .. value = col_double(),
#> .. date = col_date(format = "")
#> .. )
#> - attr(*, "problems")=<externalptr>
# Shorthand: one letter per column, in column order
data3 <- read_csv("sample.csv",
col_types = "idD") # i = integer, d = double, D = Date
33.6 Error #3: One or more parsing failures
⭐⭐ INTERMEDIATE 🔧 PARSE
33.6.1 The Warning
# Create problematic data
bad_data <- tibble(
id = 1:5,
value = c("1.5", "2.3", "not_a_number", "4.1", "5.2")
)
write_csv(bad_data, "bad_data.csv")
# Read with type specification
data <- read_csv("bad_data.csv",
col_types = cols(value = col_double()),
show_col_types = FALSE)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
⚠️ WARNING
Warning: One or more parsing failures
33.6.3 Solutions
✅ SOLUTION 1: Check Problems
# See what failed
problems(data)
#> # A tibble: 1 × 5
#> row col expected actual file
#> <int> <int> <chr> <chr> <chr>
#> 1 4 2 a double not_a_number /Users/bioguo/Downloads/r_errors_book/bad_d…
# Read as character first
data_char <- read_csv("bad_data.csv",
col_types = cols(value = col_character()))
# Manually handle conversion
data_clean <- data_char %>%
mutate(value_num = as.numeric(value),
value_clean = if_else(is.na(value_num), 0, value_num))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `value_num = as.numeric(value)`.
#> Caused by warning:
#> ! NAs introduced by coercion
head(data_clean)
#> # A tibble: 5 × 4
#> id value value_num value_clean
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 1.5 1.5 1.5
#> 2 2 2.3 2.3 2.3
#> 3 3 not_a_number NA 0
#> 4 4 4.1 4.1 4.1
#> 5 5 5.2 5.2 5.2
33.7 Reading Excel Files
💡 Key Insight: readxl Package
library(readxl)
# Create example Excel file
library(writexl)
write_xlsx(list(Sheet1 = mtcars), "example.xlsx")
# Read Excel
data <- read_excel("example.xlsx")
head(data)
#> # A tibble: 6 × 12
#> mpg cyl disp hp drat wt qsec vs am gear carb cyl_factor
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 6
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 6
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 4
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 6
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 8
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 6
# Specify sheet
data <- read_excel("example.xlsx", sheet = "Sheet1")
# Or by number
data <- read_excel("example.xlsx", sheet = 1)
# List sheets
excel_sheets("example.xlsx")
#> [1] "Sheet1"
# Read specific range
data <- read_excel("example.xlsx", range = "A1:D10")
# Skip rows
data <- read_excel("example.xlsx", skip = 2)
# Specify column types (one type per column; this sheet has 12 columns,
# so a length-3 vector fails)
data <- read_excel("example.xlsx",
col_types = c("numeric", "text", "date"))
#> Error: Sheet 1 has 12 columns, but `col_types` has length 3.
33.8 Error #4: Expecting numeric
⭐⭐ INTERMEDIATE 🔢 TYPE
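readxl emits this warning when a cell cannot be parsed as the declared (or guessed) column type; the offending cell becomes NA. A minimal sketch, assuming writexl is installed (file and column names are hypothetical):
library(writexl)
library(readxl)
# A column that mixes numbers stored as text with a stray label
write_xlsx(data.frame(x = c("1.5", "2.3", "oops")), "mixed.xlsx")
# Forcing numeric: unparseable cells trigger "Expecting numeric ..."
# warnings and come back as NA
data <- read_excel("mixed.xlsx", col_types = "numeric")
# Safer: read as text, inspect, then convert deliberately
data <- read_excel("mixed.xlsx", col_types = "text")
data$x_num <- as.numeric(data$x) # "oops" becomes NA, with a warning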
33.9 File Paths Best Practices
🎯 Best Practice: Portable File Paths
# ❌ Bad: Absolute paths
read_csv("C:/Users/John/Documents/data.csv")
# ❌ Bad: Backslashes (Windows only)
read_csv("data\\file.csv")
# ✅ Good: Forward slashes (cross-platform)
read_csv("data/file.csv")
# ✅ Better: here package
library(here)
read_csv(here("data", "file.csv"))
# ✅ Best: RStudio projects + here
# In project root:
read_csv(here("data", "raw", "file.csv"))
# Check paths
here() # Project root
here("data", "file.csv") # Build path
file.exists(here("data")) # Check if exists
33.10 Reading Other Formats
💡 Key Insight: Format-Specific Packages
# SPSS, SAS, Stata
library(haven)
data <- read_spss("file.sav")
data <- read_sas("file.sas7bdat")
data <- read_stata("file.dta")
# JSON
library(jsonlite)
data <- fromJSON("file.json")
# XML
library(xml2)
doc <- read_xml("file.xml")
# Fixed-width files
data <- read_fwf("file.txt",
fwf_widths(c(10, 8, 5)))
# Delimited by other characters
data <- read_delim("file.txt", delim = "|")
data <- read_tsv("file.txt") # Tab-separated
33.11 Large Files
🎯 Best Practice: Handling Large Files
# Read in chunks
library(readr)
# Specify rows to read
data_sample <- read_csv("large_file.csv", n_max = 1000)
# Read columns selectively
data <- read_csv("large_file.csv",
col_select = c(id, value, date))
# Use data.table for speed
library(data.table)
data <- fread("large_file.csv")
# Use vroom for very large files
library(vroom)
data <- vroom("large_file.csv")
# Read in chunks with a callback; DataFrameCallback$new() row-binds
# whatever each call returns into a single data frame
chunk_size <- 10000
filtered <- read_csv_chunked("large_file.csv",
callback = DataFrameCallback$new(function(chunk, pos) {
# Process each chunk; the returned rows are accumulated
chunk %>% filter(value > 0)
}),
chunk_size = chunk_size)
33.12 Summary
Key Takeaways:
- Check that the file exists - use file.exists() before reading
- Prefer readr over base R - faster, with better defaults
- Specify encoding - when dealing with international characters
- Pin down column types - use col_types and inspect problems()
- Use the here package - for portable, project-relative paths
- Handle large files deliberately - read in chunks, or use data.table/vroom
- Read Excel with readxl - specify sheets and ranges
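Putting these together, a minimal defensive-reading template (paths and column names are hypothetical):
library(here)
library(readr)
path <- here("data", "file.csv")
stopifnot(file.exists(path)) # fail early, with an obvious cause
data <- read_csv(path,
col_types = cols(id = col_integer(), value = col_double()),
locale = locale(encoding = "UTF-8"),
show_col_types = FALSE)
stopifnot(nrow(problems(data)) == 0) # never silently ignore parsing issues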
Quick Reference:
| Error | Cause | Fix |
|---|---|---|
| cannot open file | Wrong path | Check getwd(), use here() |
| Encoding garbled | Wrong encoding | Use locale(encoding = …) |
| Parsing failures | Type mismatch | Check problems(), read as text |
| Sheet not found | Wrong sheet name | Use excel_sheets() |
Reading Functions:
# CSV/text
read_csv("file.csv") # readr
read_csv2("file.csv") # European format (;)
read_tsv("file.txt") # Tab-separated
read_delim("file.txt", "|") # Custom delimiter
# Excel
library(readxl)
read_excel("file.xlsx")
read_excel("file.xlsx", sheet = 2)
read_excel("file.xlsx", range = "A1:D10")
# Other formats
library(haven)
read_spss("file.sav")
read_sas("file.sas7bdat")
read_stata("file.dta")
# JSON
library(jsonlite)
fromJSON("file.json")Best Practices:
✅ Good:
- Use here() for paths
- Specify col_types for reliability
- Check file.exists() before reading
- Use read_csv(), not read.csv()
- Handle encoding explicitly
- Read large files in chunks
❌ Avoid:
- Absolute paths
- Assuming the default encoding
- Ignoring parsing warnings
- Reading entire large files when a sample or a column subset will do
- Using read.csv() for new code
33.13 Exercises
📝 Exercise 1: Safe File Reader
Write safe_read_csv() that:
1. Checks if file exists
2. Handles errors gracefully
3. Reports file info
4. Returns data or informative error
📝 Exercise 2: Excel Multi-Sheet Reader
Write a function to:
1. Read all sheets from an Excel file
2. Return a named list of data frames
3. Handle empty sheets
4. Report sheet names and dimensions
📝 Exercise 3: Data Inspector
Create inspect_file() that:
1. Checks encoding
2. Detects delimiter
3. Previews first few rows
4. Reports column types
5. Identifies problems
33.14 Exercise Answers
Click to see answers
Exercise 1:
safe_read_csv <- function(path, ...) {
# Check file exists
if (!file.exists(path)) {
stop("File does not exist: ", path)
}
# Get file info
info <- file.info(path)
cat("File:", basename(path), "\n")
cat("Size:", round(info$size / 1024, 2), "KB\n")
cat("Modified:", format(info$mtime, "%Y-%m-%d %H:%M"), "\n\n")
# Try to read
tryCatch({
data <- read_csv(path, ..., show_col_types = FALSE)
cat("Successfully read:\n")
cat(" Rows:", nrow(data), "\n")
cat(" Columns:", ncol(data), "\n")
cat(" Column names:", paste(names(data), collapse = ", "), "\n")
# Check for problems
probs <- problems(data)
if (nrow(probs) > 0) {
warning("Found ", nrow(probs), " parsing issues")
print(probs)
}
return(data)
}, error = function(e) {
stop("Failed to read file: ", e$message)
})
}
# Test
write_csv(mtcars, "test.csv")
data <- safe_read_csv("test.csv")
#> File: test.csv
#> Size: 1.32 KB
#> Modified: 2025-10-26 15:47
#>
#> Successfully read:
#> Rows: 32
#> Columns: 12
#> Column names: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, cyl_factor
Exercise 2:
library(readxl)
read_all_sheets <- function(path, ...) {
# Check file exists
if (!file.exists(path)) {
stop("File does not exist: ", path)
}
# Get sheet names
sheets <- excel_sheets(path)
cat("Reading Excel file:", basename(path), "\n")
cat("Sheets found:", length(sheets), "\n\n")
# Read all sheets
result <- list()
for (sheet in sheets) {
cat("Reading sheet:", sheet, "... ")
tryCatch({
data <- read_excel(path, sheet = sheet, ...)
# Check if empty
if (nrow(data) == 0) {
cat("EMPTY\n")
# Assigning NULL leaves the element out, so empty sheets are skipped
result[[sheet]] <- NULL
} else {
cat("OK (", nrow(data), " rows, ", ncol(data), " cols)\n", sep = "")
result[[sheet]] <- data
}
}, error = function(e) {
cat("ERROR:", e$message, "\n")
# Nothing is added to `result`, so failed sheets are simply skipped
})
}
cat("\nSuccessfully read", length(result), "sheets\n")
return(result)
}
# Test
library(writexl)
write_xlsx(list(
Cars = mtcars[1:10, ],
Iris = iris[1:10, ],
Empty = data.frame()
), "multi_sheet.xlsx")
all_data <- read_all_sheets("multi_sheet.xlsx")
#> Reading Excel file: multi_sheet.xlsx
#> Sheets found: 3
#>
#> Reading sheet: Cars ... OK (10 rows, 12 cols)
#> Reading sheet: Iris ... OK (10 rows, 5 cols)
#> Reading sheet: Empty ... EMPTY
#>
#> Successfully read 2 sheets
names(all_data)
#> [1] "Cars" "Iris"Exercise 3:
inspect_file <- function(path, n_preview = 5) {
cat("=== File Inspection ===\n\n")
# Check exists
if (!file.exists(path)) {
stop("File does not exist: ", path)
}
# File info
info <- file.info(path)
cat("File:", path, "\n")
cat("Size:", round(info$size / 1024, 2), "KB\n")
cat("Modified:", format(info$mtime, "%Y-%m-%d %H:%M"), "\n\n")
# Guess encoding
cat("Encoding:\n")
enc <- guess_encoding(path, n_max = 1000)
print(enc)
cat("\n")
# Try to detect structure
cat("Structure Detection:\n")
# Read first few lines
lines <- readLines(path, n = 10)
# Detect delimiter
delims <- c(",", ";", "\t", "|")
delim_counts <- sapply(delims, function(d) {
sum(grepl(d, lines[1], fixed = TRUE))
})
likely_delim <- delims[which.max(delim_counts)]
cat("Likely delimiter:",
switch(likely_delim,
"," = "comma",
";" = "semicolon",
"\t" = "tab",
"|" = "pipe"), "\n\n")
# Preview
cat("Preview (first", n_preview, "lines):\n")
cat(paste(head(lines, n_preview), collapse = "\n"), "\n\n")
# Try to read
cat("Reading data...\n")
data <- read_csv(path, show_col_types = FALSE, n_max = 100)
cat("Dimensions:", nrow(data), "rows (preview) x", ncol(data), "columns\n\n")
cat("Column types:\n")
print(spec(data))
# Check for problems
probs <- problems(data)
if (nrow(probs) > 0) {
cat("\nProblems found:\n")
print(probs)
} else {
cat("\nNo parsing problems detected\n")
}
invisible(data)
}
# Test
write_csv(mtcars, "inspect_test.csv")
inspect_file("inspect_test.csv")
#> === File Inspection ===
#>
#> File: inspect_test.csv
#> Size: 1.32 KB
#> Modified: 2025-10-26 15:47
#>
#> Encoding:
#> # A tibble: 1 × 2
#> encoding confidence
#> <chr> <dbl>
#> 1 ASCII 1
#>
#> Structure Detection:
#> Likely delimiter: comma
#>
#> Preview (first 5 lines):
#> mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,cyl_factor
#> 21,6,160,110,3.9,2.62,16.46,0,1,4,4,6
#> 21,6,160,110,3.9,2.875,17.02,0,1,4,4,6
#> 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1,4
#> 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1,6
#>
#> Reading data...
#> Dimensions: 32 rows (preview) x 12 columns
#>
#> Column types:
#> cols(
#> mpg = col_double(),
#> cyl = col_double(),
#> disp = col_double(),
#> hp = col_double(),
#> drat = col_double(),
#> wt = col_double(),
#> qsec = col_double(),
#> vs = col_double(),
#> am = col_double(),
#> gear = col_double(),
#> carb = col_double(),
#> cyl_factor = col_double()
#> )
#>
#> No parsing problems detected