# Part 3 Week 1 Asynchronous
# Getting Started with Data Visualization in R > Week 1
# 3.1 Installing R and Getting Started
# 3.2 R basics
# 3.2.1 Basic R
####Do Basic Math
2 + 2
#> [1] 4
2 - 2
#> [1] 0
2 / 2
#> [1] 1
2 * 2
#> [1] 4
#### Save the result of a calculation to an object called `a` (nothing is printed)
a <- 2 + 2
####Run each line by either (1) setting the cursor at the end of the line and hitting control+enter on a PC or cmd+enter on Mac or clicking the run button in the menu bar above or (2) selecting the line(s) of code you want to execute and then using the keyboard shortcut or clicking run
##### You can use pound signs/hashtags to tell R to ignore lines of code
#2+2
2 + 2
#> [1] 4
#############
#START FOLLOWING ALONG WITH THE VIDEO HERE
#############
##### If you tell R to do something it doesn't understand, it will throw an error. R is very picky about punctuation, spelling etc.
######these commands will not do anything
#2234r*S&F(SD&F234)
#c(c))
#### R can also do logical tests using logical operators.
####Is equal?
2 == 2
#> [1] TRUE
2 == 3
#> [1] FALSE
#####Is not equal?
2 != 2
#> [1] FALSE
2 != 3
#> [1] TRUE
####Greater than and less than
2 > 1
#> [1] TRUE
2 < 1
#> [1] FALSE
####Greater than or equal to, less than or equal to
3 >= 1
#> [1] TRUE
3 <= 1
#> [1] FALSE
#### R can work with character strings
"apple"
#> [1] "apple"
#### It is ok to use spaces in character strings.
"an apple"
#> [1] "an apple"
###You can use logical operators to see whether character strings exactly match each other
"apple" == "apple"
#> [1] TRUE
"apple" == "appla"
#> [1] FALSE
"apple" == "orange"
#> [1] FALSE
"apple" != "orange"
#> [1] TRUE
#### If you use inequalities with characters, R compares the strings in alphabetical (collation) order, character by character -- NOT by length. "apple" sorts before "apples" because it is a prefix of the longer string.
"apple" < "apples"
#> [1] TRUE
"apple" > "apples"
#> [1] FALSE
####save the output of a command to an object
#### Object names may contain letters, digits, underscores and dots
a <- 2 + 2
my_a <- 2 + 2
my.a <- 2 + 2
######## Don't do this. It won't work: a name cannot start with a digit or contain a space
#9a <- 2+2
#my object <- 2+2
#### See what is in the object by "running" the object
a
#> [1] 4
####you can save series of numbers or strings and put them into vectors using the combine function, c().
numbers <- c(1, 2, 3)
numbers
#> [1] 1 2 3
fruits <- c("apple", "orange")
fruits
#> [1] "apple" "orange"
#### 4:6 is shorthand for the sequence 4, 5, 6
numbers2 <- c(4:6)
numbers2
#> [1] 4 5 6
true_false <- c(TRUE, FALSE, TRUE)
true_false
#> [1] TRUE FALSE TRUE
#### You can combine vectors together
numbers3 <- c(7:9)
all_numbers <- c(numbers, numbers2, numbers3)
all_numbers
#> [1] 1 2 3 4 5 6 7 8 9
#####You can select certain elements of a vector
x <- c(-1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
### By position in the vector
x[4] #The fourth element.
#> [1] 12
x[-4] #All but the fourth.
#> [1] -1 10 11 13 14 15 16 17 18 19
x[2:4] #Elements two to four.
#> [1] 10 11 12
x[-(2:4)] #All elements except two to four.
#> [1] -1 13 14 15 16 17 18 19
x[c(1, 5)] #Elements one and five.
#> [1] -1 13
### By Value
x[x == 10] # Elements which are equal to 10.
#> [1] 10
x[x < 0] #all elements less than zero.
#> [1] -1
x[x %in% c(10, 12, 15)] #Elements in the set 10, 12, 15.
#> [1] 10 12 15
#### When you save different kinds of data, that data is given a "class" that describes what kind of data are in the vector
class(numbers)
#> [1] "numeric"
class(fruits)
#> [1] "character"
class(true_false)
#> [1] "logical"
#### If you combine numbers and character vectors together, the numbers will convert to characters
fruits_numbers <- c(numbers, fruits)
fruits_numbers
#> [1] "1" "2" "3" "apple" "orange"
#### Generally for data visualization purposes, it is good to not mix characters and numbers in the same vector.
##### You can change the class of a vector using as.logical, as.numeric, as.character, and as.factor
####here's an example with 1s and 0s
my_vector <- c(1, 0, 1, 0)
my_vector_character <- as.character(my_vector)
my_vector_character
#> [1] "1" "0" "1" "0"
class(my_vector_character)
#> [1] "character"
#### as.logical() turns 0 into FALSE and 1 into TRUE
my_vector_logical <- as.logical(my_vector)
my_vector_logical
#> [1] TRUE FALSE TRUE FALSE
class(my_vector_logical)
#> [1] "logical"
my_vector_factor <- as.factor(my_vector)
my_vector_factor
#> [1] 1 0 1 0
#> Levels: 0 1
class(my_vector_factor)
#> [1] "factor"
#### Character strings that look like numbers can be converted back to numeric
my_vector_numeric_again <- as.numeric(my_vector_character)
my_vector_numeric_again
#> [1] 1 0 1 0
class(my_vector_numeric_again)
#> [1] "numeric"
# 3.2.2 Functions in R
####Functions
#### add(): takes two numbers (or numeric vectors) x and y and returns x + y
add <- function(x, y) {
  x + y
}
add(2, 3)
#> [1] 5
####The last expression is returned
#### add_and_multiply_version1(): only the final expression (x * y) becomes the return value
add_and_multiply_version1 <- function(x, y) {
  x + y # evaluated, but the result is discarded
  x * y # last expression, so this is what the function returns
}
add_and_multiply_version1(2, 3)
#> [1] 6
####you can force R to return a specific object
#### add_and_multiply_version2(): returns a vector c(x + y, x * y)
add_and_multiply_version2 <- function(x, y) {
  total <- x + y
  product <- x * y
  total_and_product <- c(total, product)
  # Computed but never returned: return() below decides what the caller gets
  subtract <- x - y
  return(total_and_product)
}
add_and_multiply_version2(2, 3)
#> [1] 5 6
##### R has many basic mathematical functions already built in that can be applied to numbers and vectors of numbers
###add two or more numbers
sum(c(2, 3, 5))
#> [1] 10
####add all the numbers in two vectors
sum(c(1, 2, 3), c(4, 5))
#> [1] 15
#####this does the same thing, just saving the vector to an object
my_vector <- c(1, 2, 3, 4, 5)
sum(my_vector)
#> [1] 15
##### There are many other functions for doing basic math and descriptive statistics
max(my_vector)
#> [1] 5
min(my_vector)
#> [1] 1
median(my_vector)
#> [1] 3
mean(my_vector)
#> [1] 3
sd(my_vector) ###standard deviation
#> [1] 1.581
summary(my_vector) ### a five number summary
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1 2 3 3 4 5
##### Other functions will sort vectors or tell you information about the vector
my_vector <- c(2, 2, 1, 3, 5)
sort(my_vector)
#> [1] 1 2 2 3 5
rev(my_vector)
#> [1] 5 3 1 2 2
table(my_vector)
#> my_vector
#> 1 2 3 5
#> 1 2 1 1
unique(my_vector)
#> [1] 2 1 3 5
length(my_vector)
#> [1] 5
####create data fast
seq(1, 5)
#> [1] 1 2 3 4 5
seq(1, 9, by = 2)
#> [1] 1 3 5 7 9
rep("a", 5)
#> [1] "a" "a" "a" "a" "a"
rep(10, 5)
#> [1] 10 10 10 10 10
# 3.3 Dataframe
#####creating dataframes
variable1 <- c(1, 2, 3, 4, 5)
variable2 <- c(6, 7, 8, 9, 10)
#### By default the columns are named after the vectors
data.frame(variable1, variable2)
#> variable1 variable2
#> 1 1 6
#> 2 2 7
#> 3 3 8
#> 4 4 9
#> 5 5 10
#### You can give the columns your own names
my_dat <- data.frame("height" = variable1, "weight" = variable2)
my_dat
#> height weight
#> 1 1 6
#> 2 2 7
#> 3 3 8
#> 4 4 9
#> 5 5 10
#####get a single column of a dataframe
my_dat$weight
#> [1] 6 7 8 9 10
####use a column in a function
mean(my_dat$weight)
#> [1] 8
####save a column to another object
my_weights <- my_dat$weight
my_weights
#> [1] 6 7 8 9 10
####selecting data from dataframes
#####remember you specify row, then column in the brackets
###first row, all columns
my_dat[1, ]
#> height weight
#> 1 1 6
###first column, all rows
my_dat[, 1]
#> [1] 1 2 3 4 5
###first column, first row,
my_dat[1, 1]
#> [1] 1
###first column, first three rows
my_dat[1:3, 1]
#> [1] 1 2 3
#####creating new columns
####single value repeated
my_dat$variable3 <- 100
my_dat
#> height weight variable3
#> 1 1 6 100
#> 2 2 7 100
#> 3 3 8 100
#> 4 4 9 100
#> 5 5 10 100
####add vector with same length
my_dat$variable4 <- c("apple", "orange", "grape", "cherry", "melon")
my_dat
#> height weight variable3 variable4
#> 1 1 6 100 apple
#> 2 2 7 100 orange
#> 3 3 8 100 grape
#> 4 4 9 100 cherry
#> 5 5 10 100 melon
####this won't work because the vector is too short - needs to be same length as other vectors/columns in the dataframe
#my_dat$variable5 <- c("banana","mango")
#### get the dimensions of a dataframe (number of rows, then number of columns)
dim(my_dat)
#> [1] 5 4
#### get more information about the structure of a data frame: each column's name, type, and first few values
str(my_dat)
#> 'data.frame': 5 obs. of 4 variables:
#> $ height : num 1 2 3 4 5
#> $ weight : num 6 7 8 9 10
#> $ variable3: num 100 100 100 100 100
#> $ variable4: chr "apple" "orange" "grape" "cherry" ...
#### number of rows and number of columns on their own
nrow(my_dat)
#> [1] 5
ncol(my_dat)
#> [1] 4
### get the column headers of the data frame
names(my_dat)
#> [1] "height" "weight" "variable3" "variable4"
### if you have a big dataframe, use head() to see the first few rows or tail() to see the last few
### (my_dat has only five rows, so both calls below print the whole dataframe)
head(my_dat)
#> height weight variable3 variable4
#> 1 1 6 100 apple
#> 2 2 7 100 orange
#> 3 3 8 100 grape
#> 4 4 9 100 cherry
#> 5 5 10 100 melon
tail(my_dat)
#> height weight variable3 variable4
#> 1 1 6 100 apple
#> 2 2 7 100 orange
#> 3 3 8 100 grape
#> 4 4 9 100 cherry
#> 5 5 10 100 melon
#### Spreadsheet View (left commented out here; uncomment to open the viewer)
# View(my_dat)
#####Missing data causes lots of problems. Some functions "break" and throw errors if you include missing data
#### Others, like sum(), return NA as soon as any value is missing
with_missing <- c(1, 2, 3, NA)
sum(with_missing)
#> [1] NA
#### You should look at the documentation for a function to understand how it handles missing data. Sometimes you can use an argument with a function to tell it how to deal with the missing data, often telling the function to ignore the missing cells.
# ?sum
sum(with_missing, na.rm = TRUE)
#> [1] 6
#####combining two dataframes with cbind
my_dat2 <- data.frame("variable4" = 400:499, "variable5" = 500:599)
#### my_dat has 5 rows and my_dat2 has 100, so the rows of my_dat are repeated to fill all 100 rows (see row 6 below).
#### Both dataframes have a column called variable4, so the result has two columns with that name.
all_dat <- cbind(my_dat, my_dat2)
head(all_dat)
#> height weight variable3 variable4 variable4 variable5
#> 1 1 6 100 apple 400 500
#> 2 2 7 100 orange 401 501
#> 3 3 8 100 grape 402 502
#> 4 4 9 100 cherry 403 503
#> 5 5 10 100 melon 404 504
#> 6 1 6 100 apple 405 505
#### adding a row
#### new_row has only 4 values for 6 columns, so its values are reused from the start (see row 101 below)
new_row <- c(1000, 2000, 3000, 4000)
all_dat_plus_new_row <- rbind(all_dat, new_row)
tail(all_dat_plus_new_row)
#> height weight variable3 variable4 variable4 variable5
#> 96 1 6 100 apple 495 595
#> 97 2 7 100 orange 496 596
#> 98 3 8 100 grape 497 597
#> 99 4 9 100 cherry 498 598
#> 100 5 10 100 melon 499 599
#> 101 1000 2000 3000 4000 1000 2000
#### combining two dataframes with rbind
#####you have to make sure the two dataframes have the same column names and order
#######this won't work
# my_dat <- data.frame("variable1"=1:100,"variable2"=200:299)
# my_dat2 <- data.frame("variable4"=400:499,"variable5"=500:599)
# rbind(my_dat,my_dat2)
######this will work
my_dat <- data.frame("variable1" = 1:100, "variable2" = 200:299)
my_dat2 <- data.frame("variable1" = 400:499, "variable2" = 500:599)
rbind(my_dat, my_dat2)
#> variable1 variable2
#> 1 1 200
#> 2 2 201
#> 3 3 202
#> 4 4 203
#> 5 5 204
#> 6 6 205
#> 7 7 206
#> 8 8 207
#> 9 9 208
#> 10 10 209
#> 11 11 210
#> 12 12 211
#> 13 13 212
#> 14 14 213
#> 15 15 214
#> 16 16 215
#> 17 17 216
#> 18 18 217
#> 19 19 218
#> 20 20 219
#> 21 21 220
#> 22 22 221
#> 23 23 222
#> 24 24 223
#> 25 25 224
#> 26 26 225
#> 27 27 226
#> 28 28 227
#> 29 29 228
#> 30 30 229
#> 31 31 230
#> 32 32 231
#> 33 33 232
#> 34 34 233
#> 35 35 234
#> 36 36 235
#> 37 37 236
#> 38 38 237
#> 39 39 238
#> 40 40 239
#> 41 41 240
#> 42 42 241
#> 43 43 242
#> 44 44 243
#> 45 45 244
#> 46 46 245
#> 47 47 246
#> 48 48 247
#> 49 49 248
#> 50 50 249
#> 51 51 250
#> 52 52 251
#> 53 53 252
#> 54 54 253
#> 55 55 254
#> 56 56 255
#> 57 57 256
#> 58 58 257
#> 59 59 258
#> 60 60 259
#> 61 61 260
#> 62 62 261
#> 63 63 262
#> 64 64 263
#> 65 65 264
#> 66 66 265
#> 67 67 266
#> 68 68 267
#> 69 69 268
#> 70 70 269
#> 71 71 270
#> 72 72 271
#> 73 73 272
#> 74 74 273
#> 75 75 274
#> 76 76 275
#> 77 77 276
#> 78 78 277
#> 79 79 278
#> 80 80 279
#> 81 81 280
#> 82 82 281
#> 83 83 282
#> 84 84 283
#> 85 85 284
#> 86 86 285
#> 87 87 286
#> 88 88 287
#> 89 89 288
#> 90 90 289
#> 91 91 290
#> 92 92 291
#> 93 93 292
#> 94 94 293
#> 95 95 294
#> 96 96 295
#> 97 97 296
#> 98 98 297
#> 99 99 298
#> 100 100 299
#> 101 400 500
#> 102 401 501
#> 103 402 502
#> 104 403 503
#> 105 404 504
#> 106 405 505
#> 107 406 506
#> 108 407 507
#> 109 408 508
#> 110 409 509
#> 111 410 510
#> 112 411 511
#> 113 412 512
#> 114 413 513
#> 115 414 514
#> 116 415 515
#> 117 416 516
#> 118 417 517
#> 119 418 518
#> 120 419 519
#> 121 420 520
#> 122 421 521
#> 123 422 522
#> 124 423 523
#> 125 424 524
#> 126 425 525
#> 127 426 526
#> 128 427 527
#> 129 428 528
#> 130 429 529
#> 131 430 530
#> 132 431 531
#> 133 432 532
#> 134 433 533
#> 135 434 534
#> 136 435 535
#> 137 436 536
#> 138 437 537
#> 139 438 538
#> 140 439 539
#> 141 440 540
#> 142 441 541
#> 143 442 542
#> 144 443 543
#> 145 444 544
#> 146 445 545
#> 147 446 546
#> 148 447 547
#> 149 448 548
#> 150 449 549
#> 151 450 550
#> 152 451 551
#> 153 452 552
#> 154 453 553
#> 155 454 554
#> 156 455 555
#> 157 456 556
#> 158 457 557
#> 159 458 558
#> 160 459 559
#> 161 460 560
#> 162 461 561
#> 163 462 562
#> 164 463 563
#> 165 464 564
#> 166 465 565
#> 167 466 566
#> 168 467 567
#> 169 468 568
#> 170 469 569
#> 171 470 570
#> 172 471 571
#> 173 472 572
#> 174 473 573
#> 175 474 574
#> 176 475 575
#> 177 476 576
#> 178 477 577
#> 179 478 578
#> 180 479 579
#> 181 480 580
#> 182 481 581
#> 183 482 582
#> 184 483 583
#> 185 484 584
#> 186 485 585
#> 187 486 586
#> 188 487 587
#> 189 488 588
#> 190 489 589
#> 191 490 590
#> 192 491 591
#> 193 492 592
#> 194 493 593
#> 195 494 594
#> 196 495 595
#> 197 496 596
#> 198 497 597
#> 199 498 598
#> 200 499 599
# 3.4 Basics of Importing Data into R
####Importing Data in R
####Import CSV
#### NOTE: the absolute paths below are specific to the original author's machine; replace them with the location of the file on your own computer
cces_sample <-
  read.csv("/Users/haoqiwang/Desktop/2021REUDataScience/week1/cces_sample_coursera.csv")
####Write CSV
write.csv(cces_sample,
          "/Users/haoqiwang/Desktop/2021REUDataScience/week1/test.csv")
####type in your directory path in setwd() or use the Session-->Set Working Directory menu options
getwd()
#> [1] "/Users/haoqiwang/Desktop/2021REUDataScience"
setwd("/Users/haoqiwang/Desktop/2021REUDataScience/week1")
#### Don't need the whole file path now
cces_sample <- read.csv("cces_sample_coursera.csv")
class(cces_sample)
#> [1] "data.frame"
#### na.rm = TRUE ignores missing values; spell out TRUE rather than T, because T can be reassigned
median(cces_sample$pew_religimp, na.rm = TRUE)
#> [1] 2
table(cces_sample$race)
#>
#> 1 2 3 4 5 6 7 8
#> 794 81 67 27 6 14 10 1
# 3.5 Base R Visualizations
#####Visualizations with Base R
####Univariate Statistics
numbers1 <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6)
#### A histogram and a boxplot of a single vector
hist(numbers1)
boxplot(numbers1)
numbers2 <- c(5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9)
#### Passing two vectors draws the two boxplots side by side
boxplot(numbers1, numbers2)
######Bivariate Statistics
#####make some fake data to create a scatterplot
#rnorm - normal distribution
#rpois - poisson
#rbinom - binomial
#runif - uniform
#### runif(50, 0, 100) draws 50 random values between 0 and 100
variable1 <- runif(50, 0, 100)
variable2 <- runif(50, 0, 100)
my_dat <- data.frame(variable1, variable2)
#####this builds the same kind of dataframe as the previous three lines (with freshly drawn random values)
my_dat <-
  data.frame("variable1" = runif(50, 0, 100),
             "variable2" = runif(50, 0, 100))
#### Scatterplot
plot(my_dat$variable1, my_dat$variable2)
#####this does the same thing
plot(x = my_dat$variable1, y = my_dat$variable2)
###Base R has some limited graphics customization ability
plot(
  my_dat$variable1,
  my_dat$variable2,
  main = "My First Plot",
  ###add main title
  xlab = "Variable 1",
  ### x axis label
  ylab = "Variable 2",
  ### y axis label
  ylim = c(0, 150),
  xlim = c(0, 150)
  ### set the length of the x and y axes.
)
# ? plot
# 3.6 Practices
# Problem 1
# Create a data frame that includes two columns, one named "Animals" and the other named "Foods". The first column should be this vector (note the intentional repeated values): Dog, Cat, Fish, Fish, Lizard
#The second column should be this vector: Bread, Orange, Chocolate, Carrots, Milk
#### Write your code below:
Animals <- c("Dog", "Cat", "Fish", "Fish", "Lizard")
Foods <- c("Bread", "Orange", "Chocolate", "Carrots", "Milk")
# The columns take their names from the vectors passed to data.frame()
df <- data.frame(Animals, Foods)
df
#> Animals Foods
#> 1 Dog Bread
#> 2 Cat Orange
#> 3 Fish Chocolate
#> 4 Fish Carrots
#> 5 Lizard Milk
# Problem 2
# Using the data frame created in Problem 1, use the table() command to create a frequency table for the column called "Animals".
#### Write your code below:
table(df$Animals)
#>
#> Cat Dog Fish Lizard
#> 1 1 2 1
# Problem 3
# Use read.csv() to import the survey data included in this assignment. Using that data, make a histogram of the column called "pid7".
#### Write your code below:
# NOTE: this relative path is resolved against the current working directory (see getwd())
df2 <- read.csv("week1/cces_sample_coursera.csv")
hist(df2$pid7)