library(MASS)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3

Introduction

This analysis investigates the socioeconomic and structural determinants of residential property values in the Boston metropolitan area. The dataset contains 506 neighbourhood-level observations across 13 predictors and one response variable — median home value (medv) in thousands of dollars.

Research Question:

Which factors significantly predict median residential property values, how large are their effects, and how reliably can we quantify them?


Data Exploration

# Dataset dimensions: 506 neighbourhood observations by 14 columns
# (13 predictors plus the response medv).
dim(Boston)
## [1] 506  14
# Variable names; medv (median home value in $1,000s) is the response.
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
# Five-number summary plus mean for every column. Note the heavy right
# skew of crim (mean 3.61 vs median 0.26) and the 0/1 coding of chas.
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
# Verify there are no missing values anywhere in the data frame.
cat("Missing values:", sum(is.na(Boston)), "\n")
## Missing values: 0

Correlation with Response Variable

# Rank every variable's Pearson correlation with the response. The full
# 14x14 correlation matrix is computed and its "medv" column extracted;
# medv itself appears first with correlation 1.
cat("Correlations with medv (ranked):\n")
## Correlations with medv (ranked):
sort(cor(Boston)[, "medv"], decreasing = TRUE)
##       medv         rm         zn      black        dis       chas        age 
##  1.0000000  0.6953599  0.3604453  0.3334608  0.2499287  0.1752602 -0.3769546 
##        rad       crim        nox        tax      indus    ptratio      lstat 
## -0.3816262 -0.3883046 -0.4273208 -0.4685359 -0.4837252 -0.5077867 -0.7376627

Pairwise Scatterplot Matrix

# Full pairwise scatterplot matrix of all 14 variables, drawn with small
# gray points so the 506-observation panels stay readable.
pairs(
  x    = Boston,
  cex  = 0.4,
  col  = "darkgray",
  pch  = 20,
  main = "Boston Housing — Pairwise Scatterplot Matrix"
)


Simple Linear Regression

We begin with the strongest single predictor — lstat, the percentage of lower status households.

# Baseline simple linear regression of medv on the single strongest
# predictor, lstat (% lower-status households).
lm.simple <- lm(medv ~ lstat, data = Boston)
# Coefficient table, residual summary, R-squared, and the overall F-test.
summary(lm.simple)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
# Point estimates: medv falls by roughly $950 per one-point rise in lstat
# (medv is measured in $1,000s).
coef(lm.simple)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
# 95% confidence intervals for both coefficients.
confint(lm.simple)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
# Headline fit statistics pulled individually from the summary object.
cat("R-squared  :", round(summary(lm.simple)$r.squared,    4), "\n")
## R-squared  : 0.5441
cat("RSE        :", round(summary(lm.simple)$sigma,         4), "\n")
## RSE        : 6.2158
cat("F-statistic:", round(summary(lm.simple)$fstatistic[1], 2), "\n")
## F-statistic: 601.62

Scatter Plot with Regression Line

# Scatter of medv against lstat with the fitted least-squares line overlaid.
plot(
  medv ~ lstat,
  data = Boston,
  main = "Simple Linear Regression: medv ~ lstat",
  xlab = "Lower Status Households (%)",
  ylab = "Median Home Value ($1,000s)",
  pch  = 20,
  col  = "darkgray"
)
abline(lm.simple, lwd = 2, col = "red")
legend(
  "topright",
  legend = c("Observed", "Fitted Line"),
  col    = c("darkgray", "red"),
  pch    = c(20, NA),
  lty    = c(NA, 1),
  lwd    = c(NA, 2)
)

Diagnostic Plots — Simple Model

# Standard 2x2 lattice of lm diagnostic plots for the simple model; the
# previous graphics state is restored afterwards.
op <- par(mfrow = c(2, 2))
plot(lm.simple)
par(op)

# Studentized residuals against fitted values; points beyond the dashed
# red lines at +/-3 are potential outliers.
plot(predict(lm.simple), rstudent(lm.simple),
     main = "Studentized Residuals — Simple Model",
     xlab = "Fitted Values",
     ylab = "Studentized Residuals",
     pch  = 20, col = "darkgray")
for (cutoff in c(3, -3)) {
  abline(h = cutoff, lty = 2, col = "red")
}
abline(h = 0, lty = 3, col = "gray")


Polynomial Regression

The residual plot above shows a U-shaped pattern — evidence of non-linearity. We add a quadratic term to address this.

# Quadratic model: I() shields lstat^2 from the formula language so it is
# treated as an arithmetic square rather than a formula operator.
lm.poly <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(lm.poly)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
# Nested-model F-test: does the quadratic term significantly improve on
# the purely linear fit? (F = 135.2, p < 2.2e-16: yes.)
anova(lm.simple, lm.poly)
## Analysis of Variance Table
## 
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
##   Res.Df   RSS Df Sum of Sq     F    Pr(>F)    
## 1    504 19472                                 
## 2    503 15347  1    4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Side-by-side fit statistics: R-squared rises and RSE falls with the
# quadratic term.
cat("Simple    R-squared:", round(summary(lm.simple)$r.squared, 4), "\n")
## Simple    R-squared: 0.5441
cat("Quadratic R-squared:", round(summary(lm.poly)$r.squared,   4), "\n")
## Quadratic R-squared: 0.6407
cat("Simple    RSE      :", round(summary(lm.simple)$sigma,      4), "\n")
## Simple    RSE      : 6.2158
cat("Quadratic RSE      :", round(summary(lm.poly)$sigma,        4), "\n")
## Quadratic RSE      : 5.5237

Fifth Degree Polynomial

# Degree-5 polynomial in lstat. poly() generates orthogonal polynomial
# terms by default (hence the identical standard errors below); every
# term up to degree 5 is significant, but gains over the quadratic are
# modest (R-squared 0.6817 vs 0.6407).
lm.poly5 <- lm(medv ~ poly(lstat, 5), data = Boston)
summary(lm.poly5)
## 
## Call:
## lm(formula = medv ~ poly(lstat, 5), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5433  -3.1039  -0.7052   2.0844  27.1153 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       22.5328     0.2318  97.197  < 2e-16 ***
## poly(lstat, 5)1 -152.4595     5.2148 -29.236  < 2e-16 ***
## poly(lstat, 5)2   64.2272     5.2148  12.316  < 2e-16 ***
## poly(lstat, 5)3  -27.0511     5.2148  -5.187 3.10e-07 ***
## poly(lstat, 5)4   25.4517     5.2148   4.881 1.42e-06 ***
## poly(lstat, 5)5  -19.2524     5.2148  -3.692 0.000247 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared:  0.6817, Adjusted R-squared:  0.6785 
## F-statistic: 214.2 on 5 and 500 DF,  p-value: < 2.2e-16

Log Transformation

# Alternative non-linear specification: log-transform the predictor.
# A single log term (R-squared 0.6649) outperforms the two-term
# quadratic (0.6407) with one fewer parameter.
lm.log <- lm(medv ~ log(lstat), data = Boston)
summary(lm.log)
## 
## Call:
## lm(formula = medv ~ log(lstat), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.4599  -3.5006  -0.6686   2.1688  26.0129 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  52.1248     0.9652   54.00   <2e-16 ***
## log(lstat)  -12.4810     0.3946  -31.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.329 on 504 degrees of freedom
## Multiple R-squared:  0.6649, Adjusted R-squared:  0.6643 
## F-statistic:  1000 on 1 and 504 DF,  p-value: < 2.2e-16
cat("Log R-squared:", round(summary(lm.log)$r.squared, 4), "\n")
## Log R-squared: 0.6649

Linear vs Quadratic Fit

# Overlay the straight-line and quadratic fits on the raw scatter.
plot(
  medv ~ lstat,
  data = Boston,
  main = "Linear vs Quadratic Fit: medv ~ lstat",
  xlab = "Lower Status Households (%)",
  ylab = "Median Home Value ($1,000s)",
  pch  = 20, col = "darkgray"
)
abline(lm.simple, lwd = 2, col = "red")
# Evaluate the quadratic fit on a fine 300-point grid so the curve is smooth.
grid.x <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 300)
grid.y <- predict(lm.poly, newdata = data.frame(lstat = grid.x))
lines(grid.x, grid.y, lwd = 2, col = "blue")
legend(
  "topright",
  legend = c("Observed", "Linear Fit", "Quadratic Fit"),
  col    = c("darkgray", "red", "blue"),
  pch    = c(20, NA, NA),
  lty    = c(NA, 1, 1),
  lwd    = c(NA, 2, 2)
)

Diagnostic Plots — Quadratic Model

# 2x2 panel of standard lm diagnostics for the quadratic model; the
# previous graphics state is restored afterwards.
op <- par(mfrow = c(2, 2))
plot(lm.poly)

par(op)

Multiple Linear Regression

# Full multiple regression: "." expands to all 13 other columns of Boston.
lm.full <- lm(medv ~ ., data = Boston)
summary(lm.full)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
# Headline fit statistics. Only indus (p = 0.738) and age (p = 0.958)
# fail to reach significance.
cat("R-squared         :", round(summary(lm.full)$r.squared,     4), "\n")
## R-squared         : 0.7406
cat("Adjusted R-squared:", round(summary(lm.full)$adj.r.squared, 4), "\n")
## Adjusted R-squared: 0.7338
cat("RSE               :", round(summary(lm.full)$sigma,          4), "\n")
## RSE               : 4.7453
cat("F-statistic       :", round(summary(lm.full)$fstatistic[1],  2), "\n")
## F-statistic       : 108.08
# 95% confidence intervals for all 14 coefficients; the indus and age
# intervals straddle zero.
confint(lm.full)
##                     2.5 %        97.5 %
## (Intercept)  26.432226009  46.486750761
## crim         -0.172584412  -0.043438304
## zn            0.019448778   0.073392139
## indus        -0.100267941   0.141385193
## chas          0.993904193   4.379563446
## nox         -25.271633564 -10.261588893
## rm            2.988726773   4.631003640
## age          -0.025262320   0.026646769
## dis          -1.867454981  -1.083678710
## rad           0.175692169   0.436406789
## tax          -0.019723286  -0.004945902
## ptratio      -1.209795296  -0.695699168
## black         0.004034306   0.014589060
## lstat        -0.624403622  -0.425113133

Model Refinement

age and indus are not significant. We remove them and confirm with a formal F-test.

# Refined model: the full model minus age and indus, the two predictors
# that were not significant above.
lm.refined <- lm(medv ~ crim + zn + chas + nox + rm +
                         dis + rad + tax + ptratio +
                         black + lstat,
                 data = Boston)
summary(lm.refined)
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## zn            0.045845   0.013523   3.390 0.000754 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## black         0.009291   0.002674   3.475 0.000557 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16
# Joint F-test for dropping age and indus together: F = 0.0573,
# p = 0.9443, so removing them loses essentially no explanatory power.
anova(lm.refined, lm.full)
## Analysis of Variance Table
## 
## Model 1: medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
##     black + lstat
## Model 2: medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
##     tax + ptratio + black + lstat
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    494 11081                           
## 2    492 11079  2    2.5794 0.0573 0.9443
# Adjusted R-squared rises slightly and RSE falls after the drop.
cat("Full    Adj R-squared:", round(summary(lm.full)$adj.r.squared,    4), "\n")
## Full    Adj R-squared: 0.7338
cat("Refined Adj R-squared:", round(summary(lm.refined)$adj.r.squared, 4), "\n")
## Refined Adj R-squared: 0.7348
cat("Full    RSE           :", round(summary(lm.full)$sigma,            4), "\n")
## Full    RSE           : 4.7453
cat("Refined RSE           :", round(summary(lm.refined)$sigma,         4), "\n")
## Refined RSE           : 4.7362
# 95% confidence intervals for the refined coefficients; none straddle zero.
confint(lm.refined)
##                     2.5 %       97.5 %
## (Intercept)  26.384649126  46.29764088
## crim         -0.172817670  -0.04400902
## zn            0.019275889   0.07241397
## chas          1.040324913   4.39710769
## nox         -24.321990312 -10.43005655
## rm            3.003258393   4.59989929
## dis          -1.857631161  -1.12779176
## rad           0.175037411   0.42417950
## tax          -0.018403857  -0.00515209
## ptratio      -1.200109823  -0.69293932
## black         0.004037216   0.01454447
## lstat        -0.615731781  -0.42937513

Interaction Effects

lstat x age

# Interaction model: lstat * age expands to lstat + age + lstat:age,
# so both main effects are retained alongside the product term.
lm.inter.age <- lm(medv ~ lstat * age, data = Boston)
summary(lm.inter.age)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

lstat x rm

# Interaction of socioeconomic status with dwelling size: all three
# terms are highly significant, and the negative lstat:rm coefficient
# indicates the premium for extra rooms shrinks as lstat rises.
lm.inter.rm <- lm(medv ~ lstat * rm, data = Boston)
summary(lm.inter.rm)
## 
## Call:
## lm(formula = medv ~ lstat * rm, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.2349  -2.6897  -0.6158   1.9663  31.6141 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -29.12452    3.34250  -8.713   <2e-16 ***
## lstat         2.19398    0.20570  10.666   <2e-16 ***
## rm            9.70126    0.50023  19.393   <2e-16 ***
## lstat:rm     -0.48494    0.03459 -14.018   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.701 on 502 degrees of freedom
## Multiple R-squared:  0.7402, Adjusted R-squared:  0.7387 
## F-statistic: 476.9 on 3 and 502 DF,  p-value: < 2.2e-16

rm x ptratio

# Interaction of dwelling size with pupil-teacher ratio (a school-quality
# proxy); the product term is significant here as well.
lm.inter.school <- lm(medv ~ rm * ptratio, data = Boston)
summary(lm.inter.school)
## 
## Call:
## lm(formula = medv ~ rm * ptratio, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.980  -2.781   0.358   2.595  36.638 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -132.1375    17.7214  -7.456 3.93e-13 ***
## rm            27.7924     2.7046  10.276  < 2e-16 ***
## ptratio        5.9650     0.9723   6.135 1.73e-09 ***
## rm:ptratio    -1.1268     0.1502  -7.503 2.85e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.794 on 502 degrees of freedom
## Multiple R-squared:  0.6055, Adjusted R-squared:  0.6031 
## F-statistic: 256.8 on 3 and 502 DF,  p-value: < 2.2e-16
# Compare explanatory power across the three interaction models;
# lstat * rm fits best (R-squared 0.7402).
cat("lstat * age  R-squared:", round(summary(lm.inter.age)$r.squared,    4), "\n")
## lstat * age  R-squared: 0.5557
cat("lstat * rm   R-squared:", round(summary(lm.inter.rm)$r.squared,     4), "\n")
## lstat * rm   R-squared: 0.7402
cat("rm * ptratio R-squared:", round(summary(lm.inter.school)$r.squared, 4), "\n")
## rm * ptratio R-squared: 0.6055

Collinearity Diagnostics

# Variance inflation factors (car::vif). By the common rule of thumb,
# values above roughly 5-10 signal problematic collinearity; rad (7.48)
# and tax (9.01) are the only predictors near that range.
cat("VIF — Full Model:\n")
## VIF — Full Model:
vif(lm.full)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945 
##      rad      tax  ptratio    black    lstat 
## 7.484496 9.008554 1.799084 1.348521 2.941491
# Dropping age and indus lowers every remaining VIF slightly.
cat("VIF — Refined Model:\n")
## VIF — Refined Model:
vif(lm.refined)
##     crim       zn     chas      nox       rm      dis      rad      tax 
## 1.789704 2.239229 1.059819 3.778011 1.834806 3.443420 6.861126 7.272386 
##  ptratio    black    lstat 
## 1.757681 1.341559 2.581984

Outlier and Leverage Analysis

# Side-by-side outlier (left) and leverage (right) diagnostics for the
# refined model; the previous graphics state is restored afterwards.
op <- par(mfrow = c(1, 2))

# Left panel: studentized residuals vs fitted values; points outside the
# dashed red lines at +/-3 are potential outliers.
plot(predict(lm.refined), rstudent(lm.refined),
     main = "Studentized Residuals",
     xlab = "Fitted Values",
     ylab = "Studentized Residuals",
     pch  = 20, col = "darkgray")
for (cutoff in c(3, -3)) {
  abline(h = cutoff, lwd = 2, lty = 2, col = "red")
}
abline(h = 0, lty = 3, col = "gray")

# Right panel: hat values by observation; the dashed red reference line
# sits at twice the mean leverage.
hats <- hatvalues(lm.refined)
plot(hats,
     main = "Leverage Statistics",
     xlab = "Observation Index",
     ylab = "Hat Values",
     pch  = 20, col = "darkgray")
abline(h = 2 * mean(hats), lwd = 2, lty = 2, col = "red")

par(op)

# Combined view: plot.lm's residuals-vs-leverage panel (which = 5).
plot(lm.refined, which = 5,
     main = "Residuals vs Leverage — Refined Model")

# Single most influential observation by hat value.
cat("Highest Leverage Observation:\n")
## Highest Leverage Observation:
which.max(hatvalues(lm.refined))
## 381 
## 381
# Candidate outliers: studentized residuals beyond +/-3 (eight points).
cat("\nObservations with |Studentized Residual| > 3:\n")
## 
## Observations with |Studentized Residual| > 3:
which(abs(rstudent(lm.refined)) > 3)
## 187 365 369 370 371 372 373 413 
## 187 365 369 370 371 372 373 413

Simple vs Multiple Regression Comparison

This plot shows why multiple regression matters — simple regression coefficients are distorted by confounding between predictors.

# Slope of each single-predictor regression medv ~ x, collected into a
# named numeric vector (same order as the columns of Boston).
#
# Building each formula with reformulate() replaces thirteen copy-pasted
# lm() calls, and extracting with [[2]] (rather than [2]) drops the
# coefficient's own name. The original `c(crim = coef(...)[2], ...)`
# combined outer and inner names, yielding names like "crim.crim" that
# were then drawn as plot labels; vapply() over the predictor names
# produces the intended clean names ("crim", "zn", ...).
predictors <- setdiff(names(Boston), "medv")
simple.coeffs <- vapply(
  predictors,
  function(p) coef(lm(reformulate(p, response = "medv"), data = Boston))[[2]],
  numeric(1)
)

# Multiple-regression slopes (intercept dropped) plotted against the
# corresponding simple-regression slopes for the same predictors.
multiple.coeffs <- coef(lm.full)[-1]

plot(
  x    = simple.coeffs,
  y    = multiple.coeffs,
  main = "Simple vs Multiple Regression Coefficients",
  xlab = "Simple Regression Coefficient",
  ylab = "Multiple Regression Coefficient",
  pch  = 20, cex = 1.5, col = "steelblue"
)
# Label each point with its predictor name, just above the marker.
text(
  simple.coeffs, multiple.coeffs,
  labels = names(simple.coeffs),
  pos = 3, cex = 0.75, col = "darkred"
)
# Dashed reference axes through the origin.
abline(h = 0, col = "gray", lty = 2)
abline(v = 0, col = "gray", lty = 2)


Prediction

Three hypothetical neighbourhood profiles — high, mid, and low value areas.

# Three hypothetical neighbourhood profiles spanning the value spectrum.
# Columns match the predictors of lm.refined exactly (age and indus are
# deliberately absent).
new.data <- data.frame(
  crim    = c(0.1,  0.5,  5.0),
  zn      = c(20,   10,   0),
  chas    = c(0,    0,    0),
  nox     = c(0.4,  0.5,  0.7),
  rm      = c(7.0,  6.0,  5.0),
  dis     = c(5.0,  4.0,  2.0),
  rad     = c(4,    5,    8),
  tax     = c(300,  400,  600),
  ptratio = c(14,   17,   20),
  black   = c(390,  350,  200),
  lstat   = c(5,    12,   25)
)
rownames(new.data) <- c("High Value Area", "Mid Value Area", "Low Value Area")
# Confidence intervals: uncertainty about the MEAN response at each profile.
cat("Confidence Intervals (Average Response):\n")
## Confidence Intervals (Average Response):
predict(lm.refined,
        newdata  = new.data,
        interval = "confidence",
        level    = 0.95)
##                       fit       lwr       upr
## High Value Area 34.868647 33.535390 36.201905
## Mid Value Area  22.573092 21.706591 23.439594
## Low Value Area   4.852258  3.215587  6.488929
# Prediction intervals: much wider, since they also carry the variance of
# an individual observation around the mean.
cat("Prediction Intervals (Individual Home):\n")
## Prediction Intervals (Individual Home):
predict(lm.refined,
        newdata  = new.data,
        interval = "prediction",
        level    = 0.95)
##                       fit       lwr      upr
## High Value Area 34.868647 25.467975 44.26932
## Mid Value Area  22.573092 13.227190 31.91899
## Low Value Area   4.852258 -4.596221 14.30074
# Note the Low Value Area prediction interval dips below zero — a
# limitation of the linear model at the extremes of the predictor space.

Final Model Summary

# Console banner summarising the final (refined) model's fit statistics.
cat("============================================================\n")
## ============================================================
cat("FINAL MODEL SUMMARY — Richmond Osei\n")
## FINAL MODEL SUMMARY — Richmond Osei
cat("============================================================\n")
## ============================================================
cat("Model         : medv ~ all predictors - age - indus\n")
## Model         : medv ~ all predictors - age - indus
cat("Observations  :", nobs(lm.refined),                             "\n")
## Observations  : 506
cat("Predictors    :", length(coef(lm.refined)) - 1,                "\n")
## Predictors    : 11
cat("R-squared     :", round(summary(lm.refined)$r.squared,     4), "\n")
## R-squared     : 0.7406
cat("Adj R-squared :", round(summary(lm.refined)$adj.r.squared, 4), "\n")
## Adj R-squared : 0.7348
cat("RSE           :", round(summary(lm.refined)$sigma,          4), "\n")
## RSE           : 4.7362
cat("F-statistic   :", round(summary(lm.refined)$fstatistic[1],  2), "\n")
## F-statistic   : 128.21
# Hard-coded because summary() only reports the F p-value as "< 2.2e-16".
cat("F p-value     : < 2.2e-16\n")
## F p-value     : < 2.2e-16
cat("============================================================\n")
## ============================================================

Key Economic Findings

  1. lstat is the dominant predictor — socioeconomic status strongly drives property values with a confirmed non-linear relationship. The quadratic model improves R-squared from 54.4% to 64.1%.

  2. rm and ptratio are the next most important predictors — dwelling size and school quality are significantly priced into the Boston housing market.

  3. tax and rad show elevated collinearity (VIF > 7) — their individual coefficients should be interpreted with caution.

  4. age and indus are not statistically significant once all other factors are controlled — their joint removal is confirmed by a nested-model F-test (p = 0.944).

  5. The refined model explains 73.5% of the variation in median home values (adjusted R-squared) across 506 Boston neighbourhoods.


Analysis performed in R | Boston Housing Dataset (MASS package)