5.3 Importing data
Next we want to read in the data we’ll be working with. The function read_csv takes as its first argument the path to the file.
This can be an absolute path, a path relative to the working directory, or a URL. With read_csv you can also specify which strings should be treated as missing values (the na argument) and the data types of the columns (the col_types argument).
Knowledge check: How would you find out more information about read_csv() and the arguments it can take?
# Read the measles CSV file and save it to a data frame.
# Cells containing the string "NA" are treated as missing values.
measles_us <- read_csv("data/tycho_measles.csv", na = "NA")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ConditionName = col_character(),
## CountryName = col_character(),
## Admin1Name = col_character(),
## Admin1ISO = col_character(),
## Admin2Name = col_logical(),
## CityName = col_logical(),
## PeriodStartDate = col_character(),
## PeriodEndDate = col_character(),
## PartOfCumulativeCountSeries = col_double(),
## SourceName = col_character(),
## CountValue = col_double()
## )
## Warning: 447894 parsing failures.
## row col expected actual file
## 113302 Admin2Name 1/0/T/F/TRUE/FALSE PHILADELPHIA COUNTY 'data/tycho_measles.csv'
## 113302 CityName 1/0/T/F/TRUE/FALSE PHILADELPHIA 'data/tycho_measles.csv'
## 113303 Admin2Name 1/0/T/F/TRUE/FALSE PHILADELPHIA COUNTY 'data/tycho_measles.csv'
## 113303 CityName 1/0/T/F/TRUE/FALSE PHILADELPHIA 'data/tycho_measles.csv'
## 113304 Admin2Name 1/0/T/F/TRUE/FALSE PHILADELPHIA COUNTY 'data/tycho_measles.csv'
## ...... .......... .................. ................... ........................
## See problems(...) for more details.
You will see some warnings about parsing failures in the Admin2Name and CityName columns. This happens because read_csv guesses each column's type from the first rows of the file (1,000 by default), and in these two columns those rows are all NA, so the columns are read as logical type. Later rows that contain text, such as PHILADELPHIA, then fail to parse.
One way of fixing this is to use the IDE to import your data and select the data type for each column. You could also add a col_types argument to read_csv, as is done below.
# Re-read the file, forcing the two columns that were guessed as logical
# (Admin2Name and CityName) to be read as character instead.
measles_us <- read_csv(
  "data/tycho_measles.csv",
  na = "NA",
  col_types = cols(Admin2Name = col_character(), CityName = col_character())
)
After reading the data, you will typically want to inspect it and make sure everything looks okay. There are several ways of doing this.
# Inspect the data
# Spreadsheet-style view
View(measles_us)
# Summary of the columns and their first few entries
glimpse(measles_us)
## Rows: 422,051
## Columns: 11
## $ ConditionName <chr> "Measles", "Measles", "Measles", "Measles"…
## $ CountryName <chr> "UNITED STATES OF AMERICA", "UNITED STATES…
## $ Admin1Name <chr> "WISCONSIN", "WISCONSIN", "WISCONSIN", "WI…
## $ Admin1ISO <chr> "US-WI", "US-WI", "US-WI", "US-WI", "US-WI…
## $ Admin2Name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ CityName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ PeriodStartDate <chr> "11/20/1927", "11/27/1927", "12/4/1927", "…
## $ PeriodEndDate <chr> "11/26/1927", "12/3/1927", "12/10/1927", "…
## $ PartOfCumulativeCountSeries <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ SourceName <chr> "US Nationally Notifiable Disease Surveill…
## $ CountValue <dbl> 85, 120, 84, 106, 39, 45, 28, 140, 48, 85,…
# First n rows (n defaults to 6)
head(measles_us)
## # A tibble: 6 x 11
## ConditionName CountryName Admin1Name Admin1ISO Admin2Name CityName
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## 2 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## 3 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## 4 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## 5 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## 6 Measles UNITED STATES OF AMERI… WISCONSIN US-WI <NA> <NA>
## # … with 5 more variables: PeriodStartDate <chr>, PeriodEndDate <chr>,
## # PartOfCumulativeCountSeries <dbl>, SourceName <chr>, CountValue <dbl>
# Print the tibble in the console
measles_us
## # A tibble: 422,051 x 11
## ConditionName CountryName Admin1Name Admin1ISO Admin2Name CityName
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 2 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 3 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 4 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 5 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 6 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 7 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 8 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 9 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## 10 Measles UNITED STATES OF AMER… WISCONSIN US-WI <NA> <NA>
## # … with 422,041 more rows, and 5 more variables: PeriodStartDate <chr>,
## # PeriodEndDate <chr>, PartOfCumulativeCountSeries <dbl>, SourceName <chr>,
## # CountValue <dbl>