library(MASS)
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
This analysis investigates the socioeconomic and structural
determinants of residential property values in the Boston metropolitan
area. The dataset contains 506 neighbourhood-level observations across
13 predictors and one response variable — median home value
(medv) in thousands of dollars.
Research Question:
Which factors significantly predict median residential property values, how large are their effects, and how reliably can we quantify them?
dim(Boston)
## [1] 506 14
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
cat("Missing values:", sum(is.na(Boston)), "\n")
## Missing values: 0
cat("Correlations with medv (ranked):\n")
## Correlations with medv (ranked):
sort(cor(Boston)[, "medv"], decreasing = TRUE)
## medv rm zn black dis chas age
## 1.0000000 0.6953599 0.3604453 0.3334608 0.2499287 0.1752602 -0.3769546
## rad crim nox tax indus ptratio lstat
## -0.3816262 -0.3883046 -0.4273208 -0.4685359 -0.4837252 -0.5077867 -0.7376627
pairs(Boston,
main = "Boston Housing — Pairwise Scatterplot Matrix",
pch = 20,
col = "darkgray",
cex = 0.4)
We begin with the strongest single predictor — lstat,
the percentage of lower status households.
lm.simple <- lm(medv ~ lstat, data = Boston)
summary(lm.simple)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
coef(lm.simple)
## (Intercept) lstat
## 34.5538409 -0.9500494
confint(lm.simple)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
cat("R-squared :", round(summary(lm.simple)$r.squared, 4), "\n")
## R-squared : 0.5441
cat("RSE :", round(summary(lm.simple)$sigma, 4), "\n")
## RSE : 6.2158
cat("F-statistic:", round(summary(lm.simple)$fstatistic[1], 2), "\n")
## F-statistic: 601.62
plot(Boston$lstat, Boston$medv,
main = "Simple Linear Regression: medv ~ lstat",
xlab = "Lower Status Households (%)",
ylab = "Median Home Value ($1,000s)",
pch = 20,
col = "darkgray")
abline(lm.simple, col = "red", lwd = 2)
legend("topright",
legend = c("Observed", "Fitted Line"),
col = c("darkgray", "red"),
pch = c(20, NA),
lty = c(NA, 1),
lwd = c(NA, 2))
par(mfrow = c(2, 2))
plot(lm.simple)
par(mfrow = c(1, 1))
plot(predict(lm.simple), rstudent(lm.simple),
main = "Studentized Residuals — Simple Model",
xlab = "Fitted Values",
ylab = "Studentized Residuals",
pch = 20, col = "darkgray")
abline(h = 3, col = "red", lty = 2)
abline(h = -3, col = "red", lty = 2)
abline(h = 0, col = "gray", lty = 3)
The residual plot above shows a U-shaped pattern — evidence of non-linearity. We add a quadratic term to address this.
lm.poly <- lm(medv ~ lstat + I(lstat^2), data = Boston)
summary(lm.poly)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
anova(lm.simple, lm.poly)
## Analysis of Variance Table
##
## Model 1: medv ~ lstat
## Model 2: medv ~ lstat + I(lstat^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 504 19472
## 2 503 15347 1 4125.1 135.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Simple R-squared:", round(summary(lm.simple)$r.squared, 4), "\n")
## Simple R-squared: 0.5441
cat("Quadratic R-squared:", round(summary(lm.poly)$r.squared, 4), "\n")
## Quadratic R-squared: 0.6407
cat("Simple RSE :", round(summary(lm.simple)$sigma, 4), "\n")
## Simple RSE : 6.2158
cat("Quadratic RSE :", round(summary(lm.poly)$sigma, 4), "\n")
## Quadratic RSE : 5.5237
lm.poly5 <- lm(medv ~ poly(lstat, 5), data = Boston)
summary(lm.poly5)
##
## Call:
## lm(formula = medv ~ poly(lstat, 5), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5433 -3.1039 -0.7052 2.0844 27.1153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2318 97.197 < 2e-16 ***
## poly(lstat, 5)1 -152.4595 5.2148 -29.236 < 2e-16 ***
## poly(lstat, 5)2 64.2272 5.2148 12.316 < 2e-16 ***
## poly(lstat, 5)3 -27.0511 5.2148 -5.187 3.10e-07 ***
## poly(lstat, 5)4 25.4517 5.2148 4.881 1.42e-06 ***
## poly(lstat, 5)5 -19.2524 5.2148 -3.692 0.000247 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared: 0.6817, Adjusted R-squared: 0.6785
## F-statistic: 214.2 on 5 and 500 DF, p-value: < 2.2e-16
lm.log <- lm(medv ~ log(lstat), data = Boston)
summary(lm.log)
##
## Call:
## lm(formula = medv ~ log(lstat), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.4599 -3.5006 -0.6686 2.1688 26.0129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.1248 0.9652 54.00 <2e-16 ***
## log(lstat) -12.4810 0.3946 -31.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.329 on 504 degrees of freedom
## Multiple R-squared: 0.6649, Adjusted R-squared: 0.6643
## F-statistic: 1000 on 1 and 504 DF, p-value: < 2.2e-16
cat("Log R-squared:", round(summary(lm.log)$r.squared, 4), "\n")
## Log R-squared: 0.6649
plot(Boston$lstat, Boston$medv,
main = "Linear vs Quadratic Fit: medv ~ lstat",
xlab = "Lower Status Households (%)",
ylab = "Median Home Value ($1,000s)",
pch = 20, col = "darkgray")
abline(lm.simple, col = "red", lwd = 2)
lstat.seq <- seq(min(Boston$lstat), max(Boston$lstat), length.out = 300)
pred.quad <- predict(lm.poly, newdata = data.frame(lstat = lstat.seq))
lines(lstat.seq, pred.quad, col = "blue", lwd = 2)
legend("topright",
legend = c("Observed", "Linear Fit", "Quadratic Fit"),
col = c("darkgray", "red", "blue"),
pch = c(20, NA, NA),
lty = c(NA, 1, 1),
lwd = c(NA, 2, 2))
par(mfrow = c(2, 2))
plot(lm.poly)
par(mfrow = c(1, 1))
lm.full <- lm(medv ~ ., data = Boston)
summary(lm.full)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
cat("R-squared :", round(summary(lm.full)$r.squared, 4), "\n")
## R-squared : 0.7406
cat("Adjusted R-squared:", round(summary(lm.full)$adj.r.squared, 4), "\n")
## Adjusted R-squared: 0.7338
cat("RSE :", round(summary(lm.full)$sigma, 4), "\n")
## RSE : 4.7453
cat("F-statistic :", round(summary(lm.full)$fstatistic[1], 2), "\n")
## F-statistic : 108.08
confint(lm.full)
## 2.5 % 97.5 %
## (Intercept) 26.432226009 46.486750761
## crim -0.172584412 -0.043438304
## zn 0.019448778 0.073392139
## indus -0.100267941 0.141385193
## chas 0.993904193 4.379563446
## nox -25.271633564 -10.261588893
## rm 2.988726773 4.631003640
## age -0.025262320 0.026646769
## dis -1.867454981 -1.083678710
## rad 0.175692169 0.436406789
## tax -0.019723286 -0.004945902
## ptratio -1.209795296 -0.695699168
## black 0.004034306 0.014589060
## lstat -0.624403622 -0.425113133
age and indus are not significant. We
remove them and confirm with a formal F-test.
lm.refined <- lm(medv ~ crim + zn + chas + nox + rm +
dis + rad + tax + ptratio +
black + lstat,
data = Boston)
summary(lm.refined)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5984 -2.7386 -0.5046 1.7273 26.2373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
## crim -0.108413 0.032779 -3.307 0.001010 **
## zn 0.045845 0.013523 3.390 0.000754 ***
## chas 2.718716 0.854240 3.183 0.001551 **
## nox -17.376023 3.535243 -4.915 1.21e-06 ***
## rm 3.801579 0.406316 9.356 < 2e-16 ***
## dis -1.492711 0.185731 -8.037 6.84e-15 ***
## rad 0.299608 0.063402 4.726 3.00e-06 ***
## tax -0.011778 0.003372 -3.493 0.000521 ***
## ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
## black 0.009291 0.002674 3.475 0.000557 ***
## lstat -0.522553 0.047424 -11.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
## F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
anova(lm.refined, lm.full)
## Analysis of Variance Table
##
## Model 1: medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio +
## black + lstat
## Model 2: medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad +
## tax + ptratio + black + lstat
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 494 11081
## 2 492 11079 2 2.5794 0.0573 0.9443
cat("Full Adj R-squared:", round(summary(lm.full)$adj.r.squared, 4), "\n")
## Full Adj R-squared: 0.7338
cat("Refined Adj R-squared:", round(summary(lm.refined)$adj.r.squared, 4), "\n")
## Refined Adj R-squared: 0.7348
cat("Full RSE :", round(summary(lm.full)$sigma, 4), "\n")
## Full RSE : 4.7453
cat("Refined RSE :", round(summary(lm.refined)$sigma, 4), "\n")
## Refined RSE : 4.7362
confint(lm.refined)
## 2.5 % 97.5 %
## (Intercept) 26.384649126 46.29764088
## crim -0.172817670 -0.04400902
## zn 0.019275889 0.07241397
## chas 1.040324913 4.39710769
## nox -24.321990312 -10.43005655
## rm 3.003258393 4.59989929
## dis -1.857631161 -1.12779176
## rad 0.175037411 0.42417950
## tax -0.018403857 -0.00515209
## ptratio -1.200109823 -0.69293932
## black 0.004037216 0.01454447
## lstat -0.615731781 -0.42937513
lm.inter.age <- lm(medv ~ lstat * age, data = Boston)
summary(lm.inter.age)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
lm.inter.rm <- lm(medv ~ lstat * rm, data = Boston)
summary(lm.inter.rm)
##
## Call:
## lm(formula = medv ~ lstat * rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.2349 -2.6897 -0.6158 1.9663 31.6141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.12452 3.34250 -8.713 <2e-16 ***
## lstat 2.19398 0.20570 10.666 <2e-16 ***
## rm 9.70126 0.50023 19.393 <2e-16 ***
## lstat:rm -0.48494 0.03459 -14.018 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.701 on 502 degrees of freedom
## Multiple R-squared: 0.7402, Adjusted R-squared: 0.7387
## F-statistic: 476.9 on 3 and 502 DF, p-value: < 2.2e-16
lm.inter.school <- lm(medv ~ rm * ptratio, data = Boston)
summary(lm.inter.school)
##
## Call:
## lm(formula = medv ~ rm * ptratio, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.980 -2.781 0.358 2.595 36.638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -132.1375 17.7214 -7.456 3.93e-13 ***
## rm 27.7924 2.7046 10.276 < 2e-16 ***
## ptratio 5.9650 0.9723 6.135 1.73e-09 ***
## rm:ptratio -1.1268 0.1502 -7.503 2.85e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.794 on 502 degrees of freedom
## Multiple R-squared: 0.6055, Adjusted R-squared: 0.6031
## F-statistic: 256.8 on 3 and 502 DF, p-value: < 2.2e-16
cat("lstat * age R-squared:", round(summary(lm.inter.age)$r.squared, 4), "\n")
## lstat * age R-squared: 0.5557
cat("lstat * rm R-squared:", round(summary(lm.inter.rm)$r.squared, 4), "\n")
## lstat * rm R-squared: 0.7402
cat("rm * ptratio R-squared:", round(summary(lm.inter.school)$r.squared, 4), "\n")
## rm * ptratio R-squared: 0.6055
cat("VIF — Full Model:\n")
## VIF — Full Model:
vif(lm.full)
## crim zn indus chas nox rm age dis
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945
## rad tax ptratio black lstat
## 7.484496 9.008554 1.799084 1.348521 2.941491
cat("VIF — Refined Model:\n")
## VIF — Refined Model:
vif(lm.refined)
## crim zn chas nox rm dis rad tax
## 1.789704 2.239229 1.059819 3.778011 1.834806 3.443420 6.861126 7.272386
## ptratio black lstat
## 1.757681 1.341559 2.581984
par(mfrow = c(1, 2))
plot(predict(lm.refined), rstudent(lm.refined),
main = "Studentized Residuals",
xlab = "Fitted Values",
ylab = "Studentized Residuals",
pch = 20, col = "darkgray")
abline(h = 3, col = "red", lty = 2, lwd = 2)
abline(h = -3, col = "red", lty = 2, lwd = 2)
abline(h = 0, col = "gray", lty = 3)
plot(hatvalues(lm.refined),
main = "Leverage Statistics",
xlab = "Observation Index",
ylab = "Hat Values",
pch = 20, col = "darkgray")
abline(h = 2 * mean(hatvalues(lm.refined)),
col = "red", lty = 2, lwd = 2)
par(mfrow = c(1, 1))
plot(lm.refined, which = 5,
main = "Residuals vs Leverage — Refined Model")
cat("Highest Leverage Observation:\n")
## Highest Leverage Observation:
which.max(hatvalues(lm.refined))
## 381
## 381
cat("\nObservations with |Studentized Residual| > 3:\n")
##
## Observations with |Studentized Residual| > 3:
which(abs(rstudent(lm.refined)) > 3)
## 187 365 369 370 371 372 373 413
## 187 365 369 370 371 372 373 413
This plot shows why multiple regression matters — simple regression coefficients are distorted by confounding between predictors.
simple.coeffs <- c(
crim = coef(lm(medv ~ crim, data = Boston))[2],
zn = coef(lm(medv ~ zn, data = Boston))[2],
indus = coef(lm(medv ~ indus, data = Boston))[2],
chas = coef(lm(medv ~ chas, data = Boston))[2],
nox = coef(lm(medv ~ nox, data = Boston))[2],
rm = coef(lm(medv ~ rm, data = Boston))[2],
age = coef(lm(medv ~ age, data = Boston))[2],
dis = coef(lm(medv ~ dis, data = Boston))[2],
rad = coef(lm(medv ~ rad, data = Boston))[2],
tax = coef(lm(medv ~ tax, data = Boston))[2],
ptratio = coef(lm(medv ~ ptratio, data = Boston))[2],
black = coef(lm(medv ~ black, data = Boston))[2],
lstat = coef(lm(medv ~ lstat, data = Boston))[2]
)
multiple.coeffs <- coef(lm.full)[-1]
plot(simple.coeffs, multiple.coeffs,
main = "Simple vs Multiple Regression Coefficients",
xlab = "Simple Regression Coefficient",
ylab = "Multiple Regression Coefficient",
pch = 20, col = "steelblue", cex = 1.5)
text(simple.coeffs, multiple.coeffs,
labels = names(simple.coeffs),
pos = 3, cex = 0.75, col = "darkred")
abline(h = 0, lty = 2, col = "gray")
abline(v = 0, lty = 2, col = "gray")
Three hypothetical neighbourhood profiles — high, mid, and low value areas.
new.data <- data.frame(
crim = c(0.1, 0.5, 5.0),
zn = c(20, 10, 0),
chas = c(0, 0, 0),
nox = c(0.4, 0.5, 0.7),
rm = c(7.0, 6.0, 5.0),
dis = c(5.0, 4.0, 2.0),
rad = c(4, 5, 8),
tax = c(300, 400, 600),
ptratio = c(14, 17, 20),
black = c(390, 350, 200),
lstat = c(5, 12, 25)
)
rownames(new.data) <- c("High Value Area", "Mid Value Area", "Low Value Area")
cat("Confidence Intervals (Average Response):\n")
## Confidence Intervals (Average Response):
predict(lm.refined,
newdata = new.data,
interval = "confidence",
level = 0.95)
## fit lwr upr
## High Value Area 34.868647 33.535390 36.201905
## Mid Value Area 22.573092 21.706591 23.439594
## Low Value Area 4.852258 3.215587 6.488929
cat("Prediction Intervals (Individual Home):\n")
## Prediction Intervals (Individual Home):
predict(lm.refined,
newdata = new.data,
interval = "prediction",
level = 0.95)
## fit lwr upr
## High Value Area 34.868647 25.467975 44.26932
## Mid Value Area 22.573092 13.227190 31.91899
## Low Value Area 4.852258 -4.596221 14.30074
cat("============================================================\n")
## ============================================================
cat("FINAL MODEL SUMMARY — Richmond Osei\n")
## FINAL MODEL SUMMARY — Richmond Osei
cat("============================================================\n")
## ============================================================
cat("Model : medv ~ all predictors - age - indus\n")
## Model : medv ~ all predictors - age - indus
cat("Observations :", nobs(lm.refined), "\n")
## Observations : 506
cat("Predictors :", length(coef(lm.refined)) - 1, "\n")
## Predictors : 11
cat("R-squared :", round(summary(lm.refined)$r.squared, 4), "\n")
## R-squared : 0.7406
cat("Adj R-squared :", round(summary(lm.refined)$adj.r.squared, 4), "\n")
## Adj R-squared : 0.7348
cat("RSE :", round(summary(lm.refined)$sigma, 4), "\n")
## RSE : 4.7362
cat("F-statistic :", round(summary(lm.refined)$fstatistic[1], 2), "\n")
## F-statistic : 128.21
cat("F p-value : < 2.2e-16\n")
## F p-value : < 2.2e-16
cat("============================================================\n")
## ============================================================
lstat is the dominant predictor — socioeconomic status strongly drives property values with a confirmed non-linear relationship. The quadratic model improves R-squared from 54.4% to 64.1%.
rm and ptratio are the next most important predictors — dwelling size and school quality are significantly priced into the Boston housing market.
tax and rad show elevated collinearity (VIF > 7) — their individual coefficients should be interpreted with caution.
age and indus are not statistically significant once all other factors are controlled — removal confirmed by F-test (p = 0.958).
The refined model explains 73.5% of variation in median home values across 506 Boston neighbourhoods.
Analysis performed in R | Boston Housing Dataset (MASS package)