Practical 5 Solutions

In this practical we consider two datasets for analysis by polynomial and step-wise function regression. Firstly the seatpos dataset analysed in Practical 4, followed by the Boston data analysed in practical demonstration Section 7.4.

Q1 `seatpos` Data

Load libraries MASS and faraway.

library("MASS")
library("faraway")

We will analyse the effects of predictor variable Ht on the response variable hipcenter. Assign the hipcenter values to a vector y, and the Ht variable values to a vector x.

y = seatpos$hipcenter
x = seatpos$Ht

Plot variable Ht and hipcenter against each other. Remember to include suitable axis labels. From visual inspection of this plot, what sort of polynomial might be appropriate for this data?

y.lab = 'hip center (mm)'
x.lab = 'Height (bare foot) in cm'
plot( x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
      main = "", bty = 'l', pch = 16 )

# mean trend in the data looks almost linear - perhaps only a 
# first-order polynomial model is needed.

Fit a first and second order polynomial to the data using the commands lm and poly. Look at the corresponding summary objects. Do these back up your answer to part (c)?

poly1 = lm(y ~ poly(x,  1))
summary(poly1)

## 
## Call:
## lm(formula = y ~ poly(x, 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -99.956 -27.850   5.656  20.883  72.066 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -164.88       5.90  -27.95  < 2e-16 ***
## poly(x, 1)   -289.87      36.37   -7.97 1.83e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.37 on 36 degrees of freedom
## Multiple R-squared:  0.6383, Adjusted R-squared:  0.6282 
## F-statistic: 63.53 on 1 and 36 DF,  p-value: 1.831e-09

poly2 = lm(y ~ poly(x,  2))
summary(poly2)

## 
## Call:
## lm(formula = y ~ poly(x, 2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -96.068 -25.018   4.418  22.790  75.322 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -164.885      5.919 -27.855  < 2e-16 ***
## poly(x, 2)1 -289.868     36.490  -7.944 2.42e-09 ***
## poly(x, 2)2   31.822     36.490   0.872    0.389    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.49 on 35 degrees of freedom
## Multiple R-squared:  0.646,  Adjusted R-squared:  0.6257 
## F-statistic: 31.93 on 2 and 35 DF,  p-value: 1.282e-08

# p-value for second order term is high, suggesting that it is not 
# necessary in the model.  This backs up our conclusion to part (c).

Plot the first and second polynomial fits to the data, along with $\pm2$ standard deviation confidence intervals, similar to those generated in practical demonstration Section 7.4. What do you notice about the degree-2 polynomial plot?

sort.x = sort(x) # sorted values of x.

# Predicted values.
pred1 = predict(poly1, newdata = list(x = sort.x), se = TRUE)
pred2 = predict(poly2, newdata = list(x = sort.x), se = TRUE)

# Confidence interval bands.
se.bands1 = cbind( pred1$fit - 2*pred1$se.fit, pred1$fit + 2*pred1$se.fit )
se.bands2 = cbind( pred2$fit - 2*pred2$se.fit, pred2$fit + 2*pred2$se.fit )

# Plot both plots on a single graphics device.
par(mfrow = c(1,2))

# Degree-1 polynomial plot.
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
     main = "Degree-1 polynomial", bty = 'l')
lines(sort.x, pred1$fit, lwd = 2, col = "red")
matlines(sort.x, se.bands1, lwd = 2, col = "red", lty = 3)

# Degree-2 polynomial plot.
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
     main = "Degree-2 polynomial", bty = 'l')
lines(sort.x, pred2$fit, lwd = 2, col = "red")
matlines(sort.x, se.bands2, lwd = 2, col = "red", lty = 3)

# Degree-2 polynomial plot looks relatively close to a straight line as
# well - perhaps this suggests the second order component isn't necessary?

Perform an analysis of variance to confirm whether higher order degrees of polynomial are useful for modelling hipcenter based on Ht.

# Degree-3 and 4 polynomials.
poly3 = lm(y ~ poly(x,  3))
poly4 = lm(y ~ poly(x,  4))
anova(poly1, poly2, poly3, poly4)

## Analysis of Variance Table
## 
## Model 1: y ~ poly(x, 1)
## Model 2: y ~ poly(x, 2)
## Model 3: y ~ poly(x, 3)
## Model 4: y ~ poly(x, 4)
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1     36 47616                           
## 2     35 46603  1   1012.64 0.7250 0.4006
## 3     34 46445  1    157.91 0.1131 0.7388
## 4     33 46093  1    352.27 0.2522 0.6189

# Results suggest that use of the first-order polynomial is sufficient, 
# which confirms the results of the previous parts of this question.

Use step function regression with 5 cut-points to model hipcenter based on Ht. Plot the results.

# Remember to define the number of intervals (one more than the number of 
# cut-points).
step6 = lm(y ~ cut(x, 6))
pred6 = predict(step6, newdata = list(x = sort(x)), se = TRUE)
se.bands6 = cbind(pred6$fit + 2*pred6$se.fit, pred6$fit-2*pred6$se.fit)

# Plot the results.
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
     main = "5 cutpoints", bty = 'l')
lines(sort(x), pred6$fit, lwd = 2, col = "red")
matlines(sort(x), se.bands6, lwd = 1.4, col = "red", lty = 3)

# Note that the slopes in this plot are an artifact of the way in which we 
# have plotted the results.  
# Use summary to see where the different intervals start and finish.
summary(step6)

## 
## Call:
## lm(formula = y ~ cut(x, 6))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.209 -25.615   0.936  22.425  70.623 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -87.76      15.25  -5.753 2.22e-06 ***
## cut(x, 6)(158,166]   -44.11      19.29  -2.286    0.029 *  
## cut(x, 6)(166,174]  -102.35      19.69  -5.197 1.12e-05 ***
## cut(x, 6)(174,182]  -100.91      20.18  -5.001 1.98e-05 ***
## cut(x, 6)(182,190]  -142.44      24.12  -5.906 1.43e-06 ***
## cut(x, 6)(190,198]  -191.39      40.36  -4.742 4.20e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.36 on 32 degrees of freedom
## Multiple R-squared:  0.6606, Adjusted R-squared:  0.6076 
## F-statistic: 12.46 on 5 and 32 DF,  p-value: 9.372e-07

# The fitted model can be more accurately fitted over the data as follows:
newx <- seq(from = min(x), to = max(x), length = 100)
pred6 = predict(step6, newdata = list(x = newx), se = TRUE)
se.bands6 = cbind(pred6$fit + 2*pred6$se.fit, pred6$fit-2*pred6$se.fit)
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
     main = "5 cutpoints", bty = 'l')
lines(newx, pred6$fit, lwd = 2, col = "red")
matlines(newx, se.bands6, lwd = 1.4, col = "red", lty = 3)

# Whilst there is still some sloping artifact to the plot, it is more 
# negligible now - that is, the plot of the fitted values more accurately 
# represents the step-wise nature of the model.  
# Note that this was also present in the examples in 
# Section 8.1 and 8.2 - but less noticable due to the amount of data.  
# Change length = 1000 in the command seq for newx and you will see the 
# steps become more step-like still!

Q2 `Boston` Data

The relationship between predictor indus and response medv of the Boston data also looks interesting. Predictor indus is the proportion of non-retail business acres per town. First plot the variables to see how the relationship between indus and medv looks like. Then analyse the data. Feel free to select between polynomials and step functions (of course you can experiment with both if you have time). If you use step functions you can also choose to define the intervals manually if you so wish. Use ANOVA to decide which model fits the data best.

library(MASS)
?Boston
sum(is.na(Boston))

## [1] 0

names(Boston)

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"     "dis"    
##  [9] "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

head(Boston)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat medv
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98 24.0
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14 21.6
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03 34.7
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94 33.4
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33 36.2
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21 28.7

y = Boston$medv
x = Boston$indus
y.lab = 'Median Property Value'
x.lab = 'Non-retail business acres per town'

# Plot the data for initial visual analysis.
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "", bty = 'l')

# Fit a second-order polynomial model.
poly2 = lm(y ~ poly(x,  2))

sort.x = sort(x)
sort.x[1:10]     # the first 10 sorted values of x

##  [1] 0.46 0.74 1.21 1.22 1.25 1.25 1.32 1.38 1.47 1.47

pred2 = predict(poly2, newdata = list(x = sort.x), se = TRUE)
names(pred2)

## [1] "fit"            "se.fit"         "df"             "residual.scale"

# plus/minus 2 standard deviation confidence intervals.
se.bands2 = cbind(pred2$fit + 2 * pred2$se.fit, pred2$fit - 2 * pred2$se.fit)
pred2$fit[1:10]  # the first 10 fitted values of the curve

##        1        2        3        4        5        6        7        8        9 
## 33.37033 32.90299 32.13412 32.11798 32.06959 32.06959 31.95699 31.86083 31.71718 
##       10 
## 31.71718

se.bands2[1:10,] # the first 10 confidence intervals of the curve

##        [,1]     [,2]
## 1  35.43113 31.30952
## 2  34.85860 30.94738
## 3  33.92209 30.34615
## 4  33.90251 30.33345
## 5  33.84383 30.29535
## 6  33.84383 30.29535
## 7  33.70740 30.20658
## 8  33.59102 30.13063
## 9  33.41742 30.01694
## 10 33.41742 30.01694

# Plot the fitted degree-2 polynomial along with 2 standard deviation 
# confidence interval errors.
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "Degree-2 polynomial", bty = 'l')
lines(sort.x, pred2$fit, lwd = 2, col = "red")
matlines(sort.x, se.bands2, lwd = 1.4, col = "red", lty = 3)

# Repeat the above process for polynomials of degree 3, 4 and 5.
poly3 = lm(y ~ poly(x,  3))
poly4 = lm(y ~ poly(x,  4))
poly5 = lm(y ~ poly(x, 5))

pred3 = predict(poly3, newdata = list(x = sort.x), se = TRUE)
pred4 = predict(poly4, newdata = list(x = sort.x), se = TRUE)
pred5 = predict(poly5, newdata = list(x = sort.x), se = TRUE)

se.bands3 = cbind(pred3$fit + 2*pred3$se.fit, pred3$fit-2*pred3$se.fit)
se.bands4 = cbind(pred4$fit + 2*pred4$se.fit, pred4$fit-2*pred4$se.fit)
se.bands5 = cbind(pred5$fit + 2*pred5$se.fit, pred5$fit-2*pred5$se.fit)

# Display all on one graphics device.
par(mfrow = c(2,2))
# Degree-2
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "Degree-2 polynomial", bty = 'l')
lines(sort.x, pred2$fit, lwd = 2, col = "red")
matlines(sort.x, se.bands2, lwd = 2, col = "red", lty = 3)

# Degree-3
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "Degree-3 polynomial", bty = 'l')
lines(sort.x, pred3$fit, lwd = 2, col = "darkviolet")
matlines(sort.x, se.bands3, lwd = 2, col = "darkviolet", lty = 3)

# Degree-4
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "Degree-4 polynomial", bty = 'l')
lines(sort.x, pred4$fit, lwd = 2, col = "blue")
matlines(sort.x, se.bands4, lwd = 2, col = "blue", lty = 3)

# Degree-5
plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "Degree-5 polynomial", bty = 'l')
lines(sort.x, pred5$fit, lwd = 2, col = "black")
matlines(sort.x, se.bands5, lwd = 2, col = "black", lty = 3)

# Lets add in degree 1 and 6 polynomials and perform an ANOVA.
poly1 = lm(y ~ x)
poly6 = lm(y ~ poly(x, 6))
anova(poly1, poly2, poly3, poly4, poly5, poly6)

## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ poly(x, 2)
## Model 3: y ~ poly(x, 3)
## Model 4: y ~ poly(x, 4)
## Model 5: y ~ poly(x, 5)
## Model 6: y ~ poly(x, 6)
##   Res.Df   RSS Df Sum of Sq       F    Pr(>F)    
## 1    504 32721                                   
## 2    503 31237  1   1483.67 24.1885 1.188e-06 ***
## 3    502 30891  1    346.48  5.6487   0.01784 *  
## 4    501 30803  1     87.50  1.4265   0.23291    
## 5    500 30790  1     13.15  0.2143   0.64358    
## 6    499 30608  1    182.74  2.9793   0.08495 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# anova selects the degree-3 polynomial!

# Step Functions
step2 = lm(y ~ cut(x, 2))
step3 = lm(y ~ cut(x, 3))
step4 = lm(y ~ cut(x, 4))
step5 = lm(y ~ cut(x, 5))

pred2 = predict(step2, newdata = list(x = sort(x)), se = TRUE)
pred3 = predict(step3, newdata = list(x = sort(x)), se = TRUE)
pred4 = predict(step4, newdata = list(x = sort(x)), se = TRUE)
pred5 = predict(step5, newdata = list(x = sort(x)), se = TRUE)

se.bands2 = cbind(pred2$fit + 2*pred2$se.fit, pred2$fit-2*pred2$se.fit)
se.bands3 = cbind(pred3$fit + 2*pred3$se.fit, pred3$fit-2*pred3$se.fit)
se.bands4 = cbind(pred4$fit + 2*pred4$se.fit, pred4$fit-2*pred4$se.fit)
se.bands5 = cbind(pred5$fit + 2*pred5$se.fit, pred5$fit-2*pred5$se.fit)

par(mfrow = c(2,2))

plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "1 cutoff", bty = 'l')
lines(sort(x), pred2$fit, lwd = 2, col = "red")
matlines(sort(x), se.bands2, lwd = 1.4, col = "red", lty = 3)

plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "2 cutoffs", bty = 'l')
lines(sort(x), pred3$fit, lwd = 2, col = "darkviolet")
matlines(sort(x), se.bands3, lwd = 1.4, col = "darkviolet", lty = 3)

plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab,
     main = "3 cutoffs", bty = 'l')
lines(sort(x), pred4$fit, lwd = 2, col = "blue")
matlines(sort(x), se.bands4, lwd = 1.4, col = "blue", lty = 3)

plot(x, y, cex.lab = 1.1, col="darkgrey", xlab = x.lab, ylab = y.lab, 
     main = "4 cutoffs", bty = 'l')
lines(sort(x), pred5$fit, lwd = 2, col = "black")
matlines(sort(x), se.bands5, lwd = 1.4, col = "black", lty = 3)

Practical 5 Solutions

Q1 seatpos Data

Q2 Boston Data

Q1 `seatpos` Data

Q2 `Boston` Data