library(rpart)
library(readr)
library(ggplot2)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(corrplot)
## corrplot 0.95 loaded
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-10
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(fastDummies)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(FNN)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
library(dbarts)
library(xgboost)
library(MASS)
energy_data <- read_csv("DataSets/energydata_complete.csv", col_types = cols(
date = col_character(),
Appliances = col_integer(),
lights = col_integer()
# allowing other columns to be guessed
)
)
energy_data$date <- ymd_hms(energy_data$date, tz = "UTC")
# sum(is.na(energy_data$date))
#str(energy_data)
cols_to_drop <- c("rv1", "rv2")
existing <- intersect(cols_to_drop, names(energy_data))
#existing
energy_data[existing] <- NULL
#removing unnecessary variables
second_day <- function(x) {s <- hour(x)*3600+minute(x)*60+second(x)} # x is an object in posixct format
weekend_weekday <- function(x) {
val <- weekdays(x)
if (val == "Saturday" | val == "Sunday") {val2 = "Weekend"}
else {val2= "Weekday"}
return(val2)}
energy_data$NSM <- second_day(energy_data$date)
energy_data$WeekStatus <- unlist(lapply(energy_data$date,weekend_weekday))
energy_data$Day_of_week <-weekdays(energy_data$date)
#unique(energy_data$WeekStatus)
#unique(energy_data$Day_of_week)
#class(energy_data$Day_of_week)
energy_data$Day_of_week <-as.factor(energy_data$Day_of_week)
energy_data$WeekStatus <- as.factor(energy_data$WeekStatus)
#str(energy_data)
#dim(energy_data)
#names(energy_data)
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
plot1 <-qplot(energy_data$date,energy_data$Appliances,
xlab='Time',ylab='Appliances Wh', main = "Application Wattage Usage in Total",
geom="line")+theme_minimal()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot2 <-qplot(energy_data$date[1:1008],energy_data$Appliances[1:1008],
xlab='Time (1 week)',ylab='Appliances Wh', main = "Application Wattage Usage In One Week",
geom="line")+theme_minimal()
plot1
plot2
rmse_vec <- function(y_true, y_pred) {
sqrt(mean((y_true - y_pred)^2))
}
# for error and validation
# dummy variables
data_knn <- fastDummies::dummy_cols(
energy_data,
select_columns = c("WeekStatus", "Day_of_week"),
remove_first_dummy = FALSE,
remove_selected_columns = TRUE
)
#str(data_knn)
stopifnot("Appliances" %in% names(data_knn))
y_all <- data_knn$Appliances
numeric_cols <- sapply(data_knn, is.numeric)
pred_cols <- setdiff(names(data_knn)[numeric_cols], "Appliances") #numeric predictors
X_all <- as.matrix(data_knn[ , pred_cols])
set.seed(123) # 80/20 split
n <- nrow(X_all)
train_size <- floor(0.8 * n) #80
idx <- sample(n)
train_idx <- idx[1:train_size]
test_idx <- idx[(train_size + 1):n]
X_train <- X_all[train_idx, , drop = FALSE]
y_train <- y_all[train_idx]
X_test <- X_all[test_idx, , drop = FALSE]
y_test <- y_all[test_idx]
stopifnot(ncol(X_train) == ncol(X_test))
train_center <- colMeans(X_train)
train_scale <- apply(X_train, 2, sd)
X_train_scaled <- scale(X_train, center = train_center, scale = train_scale)
X_test_scaled <- scale(X_test, center = train_center, scale = train_scale)
## k-fold CV for one k
cv_knn_mse <- function(X, y, k, K=10) {
n <- nrow(X)
folds <- sample(rep(1:K, length.out=n))
mse_vals <- numeric(K)
for (fold in 1:K) {
val_idx <- which(folds==fold)
train_idx <- which(folds!=fold)
X_tr <- X[train_idx,,drop=FALSE]
y_tr <- y[train_idx]
X_val <- X[val_idx,,drop=FALSE]
y_val <- y[val_idx]
pred <- knn.reg(train=X_tr, test=X_val, y=y_tr, k=k)$pred
mse_vals[fold] <- mean((y_val - pred)^2)
}
mean(mse_vals) # RETURN AVERAGE CV MSE
}
k_grid <- seq(1, 10, by = 2)
cv_mse <- numeric(length(k_grid))
for (i in seq_along(k_grid)) {
k_val <- k_grid[i]
cv_mse[i] <- cv_knn_mse(X_train_scaled, y_train, k = k_val)
}
cv_results <- data.frame(k = k_grid, CV_MSE = cv_mse)
print(cv_results)
## k CV_MSE
## 1 1 6756.147
## 2 3 5547.036
## 3 5 5823.575
## 4 7 6035.613
## 5 9 6214.241
best_k <- cv_results$k[which.min(cv_results$CV_MSE)]
cat("\nBest k =", best_k, "\n")
##
## Best k = 3
cat("Best CV MSE =", min(cv_results$CV_MSE), "\n")
## Best CV MSE = 5547.036
knn_best <- knn.reg(
train = X_train_scaled,
test = X_test_scaled,
y = y_train,
k = best_k
)
pred_test <- knn.reg(train=X_train_scaled,
test=X_test_scaled,
y=y_train,
k=best_k)$pred
test_mse <- mean((y_test - pred_test)^2)
knn_test_mse <- mean((y_test - pred_test)^2)
knn_test_rmse <- sqrt(knn_test_mse)
cat("KNN Test MSE :", knn_test_mse, "\n")
## KNN Test MSE : 5740.326
cat("KNN Test RMSE :", knn_test_rmse, "\n")
## KNN Test RMSE : 75.76494
train_df <- as.data.frame(X_train)
train_df$Appliances <- y_train
test_df <- as.data.frame(X_test)
test_df$Appliances <- y_test
#str(train_df)
lm_full <- lm(Appliances ~ ., data = train_df)
summary(lm_full)
##
## Call:
## lm(formula = Appliances ~ ., data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -248.34 -42.16 -17.85 7.97 939.94
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.274e+01 1.051e+02 -0.883 0.377394
## lights 1.898e+00 1.059e-01 17.926 < 2e-16 ***
## T1 -3.200e+00 2.048e+00 -1.562 0.118203
## RH_1 1.452e+01 7.452e-01 19.490 < 2e-16 ***
## T2 -1.833e+01 1.794e+00 -10.220 < 2e-16 ***
## RH_2 -1.329e+01 8.453e-01 -15.723 < 2e-16 ***
## T3 2.488e+01 1.173e+00 21.211 < 2e-16 ***
## RH_3 6.126e+00 7.599e-01 8.062 8.03e-16 ***
## T4 -2.311e-01 1.178e+00 -0.196 0.844472
## RH_4 -1.006e+00 7.108e-01 -1.415 0.156989
## T5 -3.885e-01 1.294e+00 -0.300 0.764013
## RH_5 7.081e-02 9.653e-02 0.734 0.463193
## T6 6.991e+00 6.978e-01 10.017 < 2e-16 ***
## RH_6 3.006e-01 7.480e-02 4.019 5.88e-05 ***
## T7 1.591e+00 1.472e+00 1.081 0.279890
## RH_7 -1.119e+00 4.789e-01 -2.337 0.019448 *
## T8 7.741e+00 1.121e+00 6.907 5.14e-12 ***
## RH_8 -4.406e+00 4.360e-01 -10.106 < 2e-16 ***
## T9 -1.341e+01 1.981e+00 -6.770 1.33e-11 ***
## RH_9 -3.301e-01 4.675e-01 -0.706 0.480114
## T_out -9.151e+00 1.685e+00 -5.430 5.71e-08 ***
## Press_mm_hg 1.934e-01 1.180e-01 1.639 0.101285
## RH_out -7.508e-01 3.499e-01 -2.146 0.031924 *
## Windspeed 1.886e+00 3.868e-01 4.877 1.09e-06 ***
## Visibility 2.101e-01 6.372e-02 3.298 0.000975 ***
## Tdewpoint 3.202e+00 1.651e+00 1.940 0.052443 .
## NSM 2.867e-04 4.271e-05 6.712 1.99e-11 ***
## WeekStatus_Weekday -4.467e+00 2.857e+00 -1.563 0.117976
## WeekStatus_Weekend NA NA NA NA
## Day_of_week_Friday 1.830e+01 2.891e+00 6.331 2.50e-10 ***
## Day_of_week_Monday 8.546e+00 2.812e+00 3.040 0.002373 **
## Day_of_week_Saturday 1.590e+01 2.899e+00 5.483 4.24e-08 ***
## Day_of_week_Sunday NA NA NA NA
## Day_of_week_Thursday -1.855e+00 2.759e+00 -0.672 0.501359
## Day_of_week_Tuesday -4.158e+00 2.776e+00 -1.498 0.134196
## Day_of_week_Wednesday NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.22 on 15755 degrees of freedom
## Multiple R-squared: 0.1774, Adjusted R-squared: 0.1757
## F-statistic: 106.2 on 32 and 15755 DF, p-value: < 2.2e-16
par(mar = c(4, 4, 2, 1)) # default margins ran into margin too large error
par(mfrow = c(2, 2))
plot(lm_full)
par(mfrow = c(1, 1))
lm_summary <- summary(lm_full)
coef_table <- lm_summary$coefficients
# Show coefficients with p < 0.05
sig_coef <- coef_table[coef_table[,"Pr(>|t|)"] < 0.05, ]
sig_coef[order(sig_coef[,"Pr(>|t|)"]), ][1:10, ] # top 10 most significant
## Estimate Std. Error t value Pr(>|t|)
## T3 24.883257 1.1731193 21.211191 1.785557e-98
## RH_1 14.522927 0.7451536 19.489843 1.289948e-83
## lights 1.897750 0.1058670 17.925802 3.772821e-71
## RH_2 -13.290365 0.8452592 -15.723419 2.750788e-55
## T2 -18.329642 1.7935657 -10.219666 1.929791e-24
## RH_8 -4.406147 0.4360093 -10.105627 6.170295e-24
## T6 6.990535 0.6978458 10.017306 1.504821e-23
## RH_3 6.126008 0.7598530 8.062096 8.034622e-16
## T8 7.740911 1.1207366 6.906985 5.139989e-12
## T9 -13.408418 1.9806098 -6.769843 1.334746e-11
set.seed(123)
n_train <- nrow(train_df) # number of observations used to fit
step_bic <- stepAIC(
lm_full,
direction = "backward", # start from full model and remove predictors
k = log(n_train), # penalty
trace = TRUE
)
## Start: AIC=143140.3
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 +
## RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 +
## RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility +
## Tdewpoint + NSM + WeekStatus_Weekday + WeekStatus_Weekend +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Sunday + Day_of_week_Thursday + Day_of_week_Tuesday +
## Day_of_week_Wednesday
##
##
## Step: AIC=143140.3
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 +
## RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 +
## RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility +
## Tdewpoint + NSM + WeekStatus_Weekday + WeekStatus_Weekend +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Sunday + Day_of_week_Thursday + Day_of_week_Tuesday
##
##
## Step: AIC=143140.3
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 +
## RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 +
## RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility +
## Tdewpoint + NSM + WeekStatus_Weekday + WeekStatus_Weekend +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Thursday + Day_of_week_Tuesday
##
##
## Step: AIC=143140.3
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + T4 +
## RH_4 + T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 +
## RH_9 + T_out + Press_mm_hg + RH_out + Windspeed + Visibility +
## Tdewpoint + NSM + WeekStatus_Weekday + Day_of_week_Friday +
## Day_of_week_Monday + Day_of_week_Saturday + Day_of_week_Thursday +
## Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - T4 1 327 133979664 143131
## - T5 1 766 133980103 143131
## - Day_of_week_Thursday 1 3844 133983181 143131
## - RH_9 1 4240 133983577 143131
## - RH_5 1 4577 133983914 143131
## - T7 1 9930 133989267 143132
## - RH_4 1 17035 133996372 143133
## - Day_of_week_Tuesday 1 19079 133998416 143133
## - T1 1 20760 134000097 143133
## - WeekStatus_Weekday 1 20786 134000123 143133
## - Press_mm_hg 1 22837 134002174 143133
## - Tdewpoint 1 31993 134011330 143134
## - RH_out 1 39147 134018484 143135
## - RH_7 1 46447 134025784 143136
## - Day_of_week_Monday 1 78570 134057907 143140
## <none> 133979337 143140
## - Visibility 1 92506 134071843 143142
## - RH_6 1 137335 134116672 143147
## - Windspeed 1 202228 134181564 143154
## - T_out 1 250768 134230105 143160
## - Day_of_week_Saturday 1 255686 134235023 143161
## - Day_of_week_Friday 1 340841 134320178 143171
## - NSM 1 383077 134362414 143176
## - T9 1 389742 134369078 143176
## - T8 1 405692 134385029 143178
## - RH_3 1 552733 134532070 143196
## - T6 1 853338 134832675 143231
## - RH_8 1 868452 134847789 143233
## - T2 1 888163 134867500 143235
## - RH_2 1 2102390 136081727 143376
## - lights 1 2732604 136711941 143449
## - RH_1 1 3230250 137209587 143507
## - T3 1 3826040 137805377 143575
##
## Step: AIC=143130.6
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## T5 + RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 +
## T_out + Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint +
## NSM + WeekStatus_Weekday + Day_of_week_Friday + Day_of_week_Monday +
## Day_of_week_Saturday + Day_of_week_Thursday + Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - T5 1 875 133980540 143121
## - Day_of_week_Thursday 1 3838 133983503 143121
## - RH_9 1 4135 133983800 143121
## - RH_5 1 4533 133984197 143122
## - T7 1 9665 133989329 143122
## - RH_4 1 17667 133997331 143123
## - Day_of_week_Tuesday 1 19080 133998744 143123
## - WeekStatus_Weekday 1 22038 134001702 143124
## - Press_mm_hg 1 22510 134002175 143124
## - T1 1 22597 134002261 143124
## - Tdewpoint 1 31748 134011412 143125
## - RH_out 1 38875 134018539 143126
## - RH_7 1 47097 134026761 143127
## - Day_of_week_Monday 1 80415 134060079 143130
## <none> 133979664 143131
## - Visibility 1 93751 134073415 143132
## - RH_6 1 137464 134117128 143137
## - Windspeed 1 205227 134184892 143145
## - T_out 1 250617 134230281 143150
## - Day_of_week_Saturday 1 257262 134236926 143151
## - Day_of_week_Friday 1 353633 134333297 143163
## - NSM 1 384684 134364348 143166
## - T9 1 408881 134388545 143169
## - T8 1 413152 134392816 143170
## - RH_3 1 552481 134532146 143186
## - T6 1 857216 134836880 143222
## - RH_8 1 898197 134877861 143226
## - T2 1 923164 134902828 143229
## - RH_2 1 2122427 136102091 143369
## - lights 1 2972075 136951739 143467
## - RH_1 1 3234002 137213666 143498
## - T3 1 3827547 137807211 143566
##
## Step: AIC=143121.1
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + RH_9 + T_out +
## Press_mm_hg + RH_out + Windspeed + Visibility + Tdewpoint +
## NSM + WeekStatus_Weekday + Day_of_week_Friday + Day_of_week_Monday +
## Day_of_week_Saturday + Day_of_week_Thursday + Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - RH_9 1 3626 133984166 143112
## - Day_of_week_Thursday 1 3799 133984338 143112
## - RH_5 1 3829 133984368 143112
## - T7 1 9563 133990102 143113
## - RH_4 1 18593 133999133 143114
## - Day_of_week_Tuesday 1 19369 133999909 143114
## - WeekStatus_Weekday 1 22167 134002706 143114
## - Press_mm_hg 1 22307 134002847 143114
## - T1 1 24688 134005227 143114
## - Tdewpoint 1 33981 134014521 143115
## - RH_out 1 40706 134021245 143116
## - RH_7 1 48122 134028662 143117
## - Day_of_week_Monday 1 79996 134060536 143121
## <none> 133980540 143121
## - Visibility 1 93325 134073865 143122
## - RH_6 1 139588 134120127 143128
## - Windspeed 1 205844 134186384 143136
## - Day_of_week_Saturday 1 256644 134237184 143142
## - T_out 1 261117 134241657 143142
## - Day_of_week_Friday 1 356681 134337220 143153
## - NSM 1 383871 134364411 143157
## - T8 1 414048 134394587 143160
## - T9 1 493597 134474137 143169
## - RH_3 1 554036 134534576 143177
## - T6 1 878146 134858685 143215
## - RH_8 1 909868 134890408 143218
## - T2 1 930936 134911475 143221
## - RH_2 1 2141905 136122445 143362
## - lights 1 2971251 136951791 143458
## - RH_1 1 3248789 137229329 143490
## - T3 1 3852858 137833397 143559
##
## Step: AIC=143111.8
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg +
## RH_out + Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Thursday + Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - Day_of_week_Thursday 1 4202 133988368 143103
## - RH_5 1 4320 133988486 143103
## - T7 1 8230 133992396 143103
## - Day_of_week_Tuesday 1 19840 134004006 143105
## - Press_mm_hg 1 20276 134004442 143105
## - WeekStatus_Weekday 1 20592 134004758 143105
## - RH_4 1 23137 134007303 143105
## - T1 1 24033 134008199 143105
## - Tdewpoint 1 33966 134018132 143106
## - RH_out 1 40117 134024283 143107
## - RH_7 1 50074 134034240 143108
## - Day_of_week_Monday 1 78109 134062275 143111
## <none> 133984166 143112
## - Visibility 1 92990 134077156 143113
## - RH_6 1 136022 134120188 143118
## - Windspeed 1 203020 134187186 143126
## - T_out 1 261063 134245229 143133
## - Day_of_week_Saturday 1 261285 134245450 143133
## - Day_of_week_Friday 1 353964 134338130 143144
## - T8 1 424584 134408750 143152
## - NSM 1 426448 134410614 143152
## - T9 1 495302 134479468 143160
## - RH_3 1 561417 134545583 143168
## - T6 1 875982 134860148 143205
## - T2 1 931274 134915440 143212
## - RH_8 1 980309 134964475 143217
## - RH_2 1 2153730 136137896 143354
## - lights 1 2998943 136983109 143452
## - RH_1 1 3276067 137260233 143484
## - T3 1 3854205 137838371 143550
##
## Step: AIC=143102.7
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## RH_5 + T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg +
## RH_out + Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - RH_5 1 4276 133992644 143094
## - T7 1 7505 133995873 143094
## - Day_of_week_Tuesday 1 15678 134004046 143095
## - Press_mm_hg 1 20753 134009121 143095
## - RH_4 1 24186 134012554 143096
## - T1 1 24549 134012917 143096
## - Tdewpoint 1 34056 134022424 143097
## - WeekStatus_Weekday 1 39276 134027644 143098
## - RH_out 1 40312 134028680 143098
## - RH_7 1 49641 134038009 143099
## <none> 133988368 143103
## - Visibility 1 92200 134080568 143104
## - Day_of_week_Monday 1 126880 134115248 143108
## - RH_6 1 135336 134123704 143109
## - Windspeed 1 214916 134203284 143118
## - T_out 1 261043 134249411 143124
## - Day_of_week_Saturday 1 262011 134250379 143124
## - NSM 1 427882 134416250 143143
## - T8 1 430212 134418580 143144
## - T9 1 491553 134479921 143151
## - Day_of_week_Friday 1 523353 134511720 143155
## - RH_3 1 567311 134555679 143160
## - T6 1 873955 134862323 143196
## - T2 1 928998 134917366 143202
## - RH_8 1 980449 134968817 143208
## - RH_2 1 2149558 136137925 143344
## - lights 1 3000316 136988684 143443
## - RH_1 1 3271881 137260249 143474
## - T3 1 3850045 137838413 143540
##
## Step: AIC=143093.5
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## T6 + RH_6 + T7 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg +
## RH_out + Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - T7 1 7527 134000171 143085
## - Day_of_week_Tuesday 1 16120 134008764 143086
## - Press_mm_hg 1 20096 134012740 143086
## - RH_4 1 23790 134016434 143087
## - T1 1 25190 134017835 143087
## - Tdewpoint 1 33849 134026493 143088
## - WeekStatus_Weekday 1 39726 134032370 143089
## - RH_out 1 40818 134033463 143089
## - RH_7 1 49011 134041656 143090
## <none> 133992644 143094
## - Visibility 1 90768 134083412 143095
## - Day_of_week_Monday 1 127234 134119878 143099
## - RH_6 1 134231 134126876 143100
## - Windspeed 1 214322 134206967 143109
## - Day_of_week_Saturday 1 260644 134253288 143115
## - T_out 1 262134 134254778 143115
## - T8 1 438031 134430675 143135
## - NSM 1 451596 134444240 143137
## - T9 1 493463 134486107 143142
## - Day_of_week_Friday 1 523805 134516449 143145
## - RH_3 1 567374 134560019 143151
## - T6 1 870433 134863077 143186
## - T2 1 924924 134917568 143192
## - RH_8 1 983445 134976090 143199
## - RH_2 1 2145737 136138381 143335
## - lights 1 3022677 137015321 143436
## - RH_1 1 3267605 137260249 143464
## - T3 1 3847340 137839985 143531
##
## Step: AIC=143084.7
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## T6 + RH_6 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg +
## RH_out + Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday +
## Day_of_week_Tuesday
##
## Df Sum of Sq RSS AIC
## - Day_of_week_Tuesday 1 15225 134015397 143077
## - T1 1 22486 134022657 143078
## - Press_mm_hg 1 25068 134025239 143078
## - RH_4 1 27201 134027373 143078
## - Tdewpoint 1 31723 134031894 143079
## - WeekStatus_Weekday 1 38056 134038228 143080
## - RH_7 1 41610 134041782 143080
## - RH_out 1 41918 134042090 143080
## <none> 134000171 143085
## - Visibility 1 88985 134089156 143086
## - Day_of_week_Monday 1 129075 134129247 143090
## - RH_6 1 133407 134133579 143091
## - Windspeed 1 213662 134213834 143100
## - T_out 1 257909 134258080 143105
## - Day_of_week_Saturday 1 264348 134264519 143106
## - NSM 1 447392 134447563 143128
## - Day_of_week_Friday 1 536078 134536249 143138
## - T8 1 550208 134550379 143140
## - RH_3 1 579574 134579745 143143
## - T9 1 686536 134686707 143156
## - T6 1 863975 134864146 143177
## - T2 1 947787 134947958 143186
## - RH_8 1 1136278 135136450 143208
## - RH_2 1 2175831 136176003 143329
## - lights 1 3023649 137023821 143427
## - RH_1 1 3295918 137296090 143459
## - T3 1 3840955 137841127 143521
##
## Step: AIC=143076.9
## Appliances ~ lights + T1 + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 +
## T6 + RH_6 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg +
## RH_out + Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - T1 1 23580 134038977 143070
## - Press_mm_hg 1 23762 134039158 143070
## - RH_4 1 27535 134042932 143070
## - Tdewpoint 1 31193 134046589 143071
## - RH_7 1 39276 134054672 143072
## - RH_out 1 41612 134057009 143072
## - WeekStatus_Weekday 1 59968 134075364 143074
## <none> 134015397 143077
## - Visibility 1 87119 134102516 143077
## - RH_6 1 132157 134147554 143083
## - Day_of_week_Monday 1 177925 134193321 143088
## - Windspeed 1 208990 134224386 143092
## - T_out 1 258096 134273493 143098
## - Day_of_week_Saturday 1 266702 134282099 143099
## - NSM 1 448255 134463652 143120
## - T8 1 565307 134580703 143134
## - RH_3 1 609795 134625192 143139
## - Day_of_week_Friday 1 679459 134694856 143147
## - T9 1 683593 134698990 143148
## - T6 1 876630 134892026 143170
## - T2 1 956216 134971613 143179
## - RH_8 1 1146376 135161773 143202
## - RH_2 1 2186501 136201897 143323
## - lights 1 3036520 137051916 143421
## - RH_1 1 3280762 137296159 143449
## - T3 1 3832077 137847473 143512
##
## Step: AIC=143070
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 + T6 +
## RH_6 + RH_7 + T8 + RH_8 + T9 + T_out + Press_mm_hg + RH_out +
## Windspeed + Visibility + Tdewpoint + NSM + WeekStatus_Weekday +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - Press_mm_hg 1 20583 134059560 143063
## - RH_4 1 25351 134064327 143063
## - Tdewpoint 1 35404 134074381 143064
## - RH_7 1 42304 134081281 143065
## - RH_out 1 45835 134084812 143066
## - WeekStatus_Weekday 1 62068 134101044 143068
## <none> 134038977 143070
## - Visibility 1 86586 134125563 143070
## - RH_6 1 128304 134167280 143075
## - Day_of_week_Monday 1 178908 134217885 143081
## - Windspeed 1 198634 134237610 143084
## - Day_of_week_Saturday 1 260670 134299647 143091
## - T_out 1 290899 134329876 143095
## - NSM 1 428593 134467570 143111
## - T8 1 556199 134595176 143126
## - RH_3 1 612730 134651707 143132
## - Day_of_week_Friday 1 670316 134709292 143139
## - T9 1 670362 134709339 143139
## - T6 1 1061561 135100538 143185
## - RH_8 1 1134574 135173550 143193
## - T2 1 2259400 136298377 143324
## - lights 1 3013223 137052200 143411
## - RH_2 1 3039480 137078456 143414
## - T3 1 4347811 138386788 143564
## - RH_1 1 4626974 138665951 143596
##
## Step: AIC=143062.7
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + RH_4 + T6 +
## RH_6 + RH_7 + T8 + RH_8 + T9 + T_out + RH_out + Windspeed +
## Visibility + Tdewpoint + NSM + WeekStatus_Weekday + Day_of_week_Friday +
## Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - RH_4 1 21489 134081049 143056
## - Tdewpoint 1 38605 134098165 143058
## - RH_out 1 48869 134108429 143059
## - RH_7 1 49132 134108691 143059
## - WeekStatus_Weekday 1 66047 134125607 143061
## <none> 134059560 143063
## - Visibility 1 89309 134148868 143064
## - RH_6 1 115388 134174948 143067
## - Windspeed 1 178361 134237921 143074
## - Day_of_week_Monday 1 184717 134244276 143075
## - Day_of_week_Saturday 1 251803 134311363 143083
## - T_out 1 289811 134349371 143087
## - NSM 1 420224 134479784 143102
## - T8 1 535647 134595207 143116
## - RH_3 1 608476 134668036 143125
## - Day_of_week_Friday 1 671450 134731010 143132
## - T9 1 681966 134741526 143133
## - T6 1 1040987 135100547 143175
## - RH_8 1 1128900 135188460 143185
## - T2 1 2239904 136299464 143315
## - RH_2 1 3024437 137083997 143405
## - lights 1 3025535 137085095 143405
## - T3 1 4330220 138389779 143555
## - RH_1 1 4615154 138674714 143587
##
## Step: AIC=143055.6
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + RH_6 +
## RH_7 + T8 + RH_8 + T9 + T_out + RH_out + Windspeed + Visibility +
## Tdewpoint + NSM + WeekStatus_Weekday + Day_of_week_Friday +
## Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - Tdewpoint 1 35128 134116177 143050
## - RH_out 1 47489 134128538 143052
## - WeekStatus_Weekday 1 64643 134145692 143054
## - RH_7 1 67332 134148381 143054
## <none> 134081049 143056
## - Visibility 1 91238 134172287 143057
## - RH_6 1 104892 134185941 143058
## - Windspeed 1 163563 134244612 143065
## - Day_of_week_Monday 1 176858 134257907 143067
## - Day_of_week_Saturday 1 243800 134324849 143075
## - T_out 1 284358 134365407 143079
## - NSM 1 404492 134485541 143093
## - T8 1 547970 134629019 143110
## - RH_3 1 589460 134670509 143115
## - T9 1 661204 134742253 143124
## - Day_of_week_Friday 1 677699 134758748 143126
## - T6 1 1023020 135104069 143166
## - RH_8 1 1217924 135298973 143189
## - T2 1 2579699 136660748 143347
## - lights 1 3014805 137095854 143397
## - RH_2 1 3435651 137516700 143445
## - T3 1 4407689 138488738 143557
## - RH_1 1 4593860 138674909 143578
##
## Step: AIC=143050
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + RH_6 +
## RH_7 + T8 + RH_8 + T9 + T_out + RH_out + Windspeed + Visibility +
## NSM + WeekStatus_Weekday + Day_of_week_Friday + Day_of_week_Monday +
## Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - RH_out 1 14275 134130452 143042
## - RH_7 1 60706 134176883 143048
## - WeekStatus_Weekday 1 63386 134179563 143048
## <none> 134116177 143050
## - RH_6 1 82255 134198432 143050
## - Visibility 1 87066 134203243 143051
## - Day_of_week_Monday 1 177901 134294078 143061
## - Windspeed 1 207710 134323887 143065
## - Day_of_week_Saturday 1 244993 134361170 143069
## - NSM 1 433879 134550056 143091
## - T8 1 547536 134663713 143105
## - RH_3 1 575749 134691926 143108
## - T_out 1 608231 134724408 143112
## - T9 1 660342 134776519 143118
## - Day_of_week_Friday 1 700051 134816228 143123
## - T6 1 1018515 135134692 143160
## - RH_8 1 1224468 135340645 143184
## - T2 1 2575476 136691653 143341
## - lights 1 3013983 137130160 143391
## - RH_2 1 3401900 137518077 143436
## - T3 1 4378059 138494236 143548
## - RH_1 1 4720754 138836931 143587
##
## Step: AIC=143042.1
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + RH_6 +
## RH_7 + T8 + RH_8 + T9 + T_out + Windspeed + Visibility +
## NSM + WeekStatus_Weekday + Day_of_week_Friday + Day_of_week_Monday +
## Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - WeekStatus_Weekday 1 59395 134189847 143039
## - RH_6 1 68032 134198483 143040
## - RH_7 1 73464 134203916 143041
## <none> 134130452 143042
## - Visibility 1 83898 134214350 143042
## - Day_of_week_Monday 1 181869 134312320 143054
## - Windspeed 1 240346 134370797 143061
## - Day_of_week_Saturday 1 253123 134383574 143062
## - NSM 1 426760 134557212 143083
## - T8 1 555808 134686259 143098
## - T_out 1 672250 134802702 143111
## - RH_3 1 676236 134806687 143112
## - Day_of_week_Friday 1 692273 134822725 143114
## - T9 1 708052 134838504 143116
## - T6 1 1030841 135161292 143153
## - RH_8 1 1362133 135492585 143192
## - T2 1 2592387 136722839 143335
## - lights 1 2999708 137130160 143382
## - RH_2 1 3883414 138013865 143483
## - T3 1 4365216 138495668 143538
## - RH_1 1 4845830 138976282 143593
##
## Step: AIC=143039.4
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + RH_6 +
## RH_7 + T8 + RH_8 + T9 + T_out + Windspeed + Visibility +
## NSM + Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - RH_7 1 63171 134253019 143037
## - RH_6 1 72208 134262055 143038
## - Visibility 1 79887 134269734 143039
## <none> 134189847 143039
## - Day_of_week_Monday 1 141380 134331228 143046
## - Windspeed 1 263838 134453686 143061
## - NSM 1 415701 134605548 143079
## - Day_of_week_Friday 1 636240 134826088 143104
## - T8 1 667135 134856982 143108
## - Day_of_week_Saturday 1 670105 134859953 143108
## - T_out 1 670495 134860343 143108
## - RH_3 1 677281 134867129 143109
## - T9 1 761308 134951155 143119
## - T6 1 1045669 135235516 143152
## - RH_8 1 1383744 135573591 143192
## - T2 1 2621853 136811700 143335
## - lights 1 2993134 137182981 143378
## - RH_2 1 3904347 138094194 143483
## - T3 1 4357820 138547667 143534
## - RH_1 1 4826901 139016749 143588
##
## Step: AIC=143037.1
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + RH_6 +
## T8 + RH_8 + T9 + T_out + Windspeed + Visibility + NSM + Day_of_week_Friday +
## Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## - RH_6 1 53143 134306162 143034
## <none> 134253019 143037
## - Visibility 1 89847 134342865 143038
## - Day_of_week_Monday 1 135238 134388257 143043
## - Windspeed 1 273523 134526542 143060
## - NSM 1 417433 134670451 143076
## - RH_3 1 636379 134889397 143102
## - Day_of_week_Friday 1 645007 134898026 143103
## - Day_of_week_Saturday 1 700053 134953071 143110
## - T_out 1 701332 134954351 143110
## - T8 1 774245 135027264 143118
## - T9 1 796322 135049340 143121
## - T6 1 1024239 135277257 143147
## - RH_8 1 2402123 136655141 143307
## - T2 1 2954849 137207868 143371
## - lights 1 3008779 137261797 143377
## - T3 1 4450636 138703654 143542
## - RH_2 1 4476689 138729708 143545
## - RH_1 1 5018807 139271825 143607
##
## Step: AIC=143033.7
## Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 + T6 + T8 +
## RH_8 + T9 + T_out + Windspeed + Visibility + NSM + Day_of_week_Friday +
## Day_of_week_Monday + Day_of_week_Saturday
##
## Df Sum of Sq RSS AIC
## <none> 134306162 143034
## - Visibility 1 95622 134401783 143035
## - Day_of_week_Monday 1 134992 134441154 143040
## - Windspeed 1 276012 134582174 143056
## - NSM 1 411028 134717189 143072
## - Day_of_week_Friday 1 692818 134998979 143105
## - Day_of_week_Saturday 1 709714 135015876 143107
## - T_out 1 716155 135022317 143108
## - RH_3 1 724610 135030771 143109
## - T8 1 758974 135065136 143113
## - T9 1 903075 135209236 143130
## - T6 1 971276 135277437 143138
## - RH_8 1 2375116 136681278 143301
## - T2 1 2902263 137208424 143362
## - lights 1 3086102 137392263 143383
## - T3 1 4425742 138731903 143536
## - RH_2 1 4457873 138764035 143540
## - RH_1 1 4966675 139272837 143597
summary(step_bic)
##
## Call:
## lm(formula = Appliances ~ lights + RH_1 + T2 + RH_2 + T3 + RH_3 +
## T6 + T8 + RH_8 + T9 + T_out + Windspeed + Visibility + NSM +
## Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday,
## data = train_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -252.58 -42.45 -18.09 7.58 941.44
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.224e+00 1.908e+01 0.274 0.784237
## lights 1.902e+00 9.991e-02 19.036 < 2e-16 ***
## RH_1 1.517e+01 6.280e-01 24.149 < 2e-16 ***
## T2 -2.141e+01 1.160e+00 -18.460 < 2e-16 ***
## RH_2 -1.459e+01 6.377e-01 -22.879 < 2e-16 ***
## T3 2.335e+01 1.024e+00 22.796 < 2e-16 ***
## RH_3 6.148e+00 6.666e-01 9.224 < 2e-16 ***
## T6 6.491e+00 6.078e-01 10.679 < 2e-16 ***
## T8 8.553e+00 9.060e-01 9.440 < 2e-16 ***
## RH_8 -5.163e+00 3.092e-01 -16.700 < 2e-16 ***
## T9 -1.377e+01 1.337e+00 -10.297 < 2e-16 ***
## T_out -6.252e+00 6.818e-01 -9.170 < 2e-16 ***
## Windspeed 1.946e+00 3.418e-01 5.693 1.27e-08 ***
## Visibility 2.113e-01 6.307e-02 3.351 0.000808 ***
## NSM 2.773e-04 3.992e-05 6.947 3.87e-12 ***
## Day_of_week_Friday 2.012e+01 2.230e+00 9.019 < 2e-16 ***
## Day_of_week_Monday 8.824e+00 2.216e+00 3.981 6.89e-05 ***
## Day_of_week_Saturday 2.076e+01 2.275e+00 9.129 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.29 on 15770 degrees of freedom
## Multiple R-squared: 0.1754, Adjusted R-squared: 0.1745
## F-statistic: 197.3 on 17 and 15770 DF, p-value: < 2.2e-16
step_pred_test <- predict(step_bic, newdata = test_df)
step_test_mse <- mean((test_df$Appliances - step_pred_test)^2)
step_test_rmse <- rmse_vec(test_df$Appliances, step_pred_test)
cat("Test MSE :", step_test_mse, "\n")
## Test MSE : 9482.421
cat("Test RMSE :", step_test_rmse, "\n")
## Test RMSE : 97.37772
coef_step <- summary(step_bic)$coefficients
sig_step <- coef_step[coef_step[,"Pr(>|t|)"] < 0.05, ]
sig_step[order(sig_step[,"Pr(>|t|)"]), ][1:10, ] # top 10 most significant
## Estimate Std. Error t value Pr(>|t|)
## RH_1 15.165966 0.62801401 24.149089 1.499390e-126
## RH_2 -14.590765 0.63774391 -22.878720 5.391709e-114
## T3 23.346379 1.02413835 22.796118 3.359493e-113
## lights 1.901891 0.09991084 19.035881 6.761034e-80
## T2 -21.412988 1.15995474 -18.460193 2.679875e-75
## RH_8 -5.162801 0.30915419 -16.699762 4.490066e-62
## T6 6.490714 0.60778980 10.679209 1.569208e-26
## T9 -13.767167 1.33694923 -10.297450 8.669205e-25
## T8 8.552719 0.90598863 9.440205 4.229532e-21
## RH_3 6.148282 0.66655161 9.224015 3.217085e-20
# Prepare matrices for glmnet
X_train_mat <- as.matrix(train_df[ , setdiff(names(train_df), "Appliances") ])
y_train_vec <- train_df$Appliances
# Ridge (alpha = 0) and Lasso (alpha = 1)
set.seed(123)
ridge_cv <- cv.glmnet(
X_train_mat, y_train_vec,
alpha = 0, # ridge
nfolds = 10
)
lasso_cv <- cv.glmnet(
X_train_mat, y_train_vec,
alpha = 1, # lasso
nfolds = 10
)
# Best lambda values
ridge_lambda <- ridge_cv$lambda.min
lasso_lambda <- lasso_cv$lambda.min
ridge_lambda
## [1] 2.228168
lasso_lambda
## [1] 0.005649219
X_test_mat <- as.matrix(test_df[ , setdiff(names(test_df), "Appliances") ])
y_test_vec <- test_df$Appliances # test matrix
ridge_pred <- predict(ridge_cv, newx = X_test_mat, s = ridge_lambda)
lasso_pred <- predict(lasso_cv, newx = X_test_mat, s = lasso_lambda) # Predictions
ridge_mse <- mean((y_test_vec - ridge_pred)^2)
lasso_mse <- mean((y_test_vec - lasso_pred)^2)
ridge_mse
## [1] 9558.157
lasso_mse
## [1] 9483.875
## Use the same train/test split as before^^
# train_df, test_df already defined:
# train_df$Appliances, test_df$Appliances
# Choose a subset of key continuous variables for smooths
gam_model <- gam(
Appliances ~
s(T2, k = 10) +
s(T3, k = 10) +
s(T6, k = 10) +
s(T8, k = 10) +
s(T9, k = 10) +
s(RH_1, k = 10) +
s(RH_2, k = 10) +
s(RH_3, k = 10) +
s(RH_8, k = 10) +
s(T_out, k = 10) +
s(Windspeed, k = 10) +
s(Visibility, k = 10) +
s(NSM, k = 10) + # time of day
lights + # linear term
Day_of_week_Friday +
Day_of_week_Monday +
Day_of_week_Saturday,
data = train_df,
method = "REML" # smoothing parameter selection
)
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Appliances ~ s(T2, k = 10) + s(T3, k = 10) + s(T6, k = 10) +
## s(T8, k = 10) + s(T9, k = 10) + s(RH_1, k = 10) + s(RH_2,
## k = 10) + s(RH_3, k = 10) + s(RH_8, k = 10) + s(T_out, k = 10) +
## s(Windspeed, k = 10) + s(Visibility, k = 10) + s(NSM, k = 10) +
## lights + Day_of_week_Friday + Day_of_week_Monday + Day_of_week_Saturday
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.67163 1.03762 80.638 < 2e-16 ***
## lights 1.60105 0.09951 16.090 < 2e-16 ***
## Day_of_week_Friday 18.96683 2.26948 8.357 < 2e-16 ***
## Day_of_week_Monday 12.93701 2.22930 5.803 6.63e-09 ***
## Day_of_week_Saturday 20.02456 2.31666 8.644 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(T2) 6.313 7.440 35.577 < 2e-16 ***
## s(T3) 4.877 6.057 101.273 < 2e-16 ***
## s(T6) 8.429 8.864 8.458 < 2e-16 ***
## s(T8) 5.130 6.350 26.482 < 2e-16 ***
## s(T9) 8.310 8.830 21.367 < 2e-16 ***
## s(RH_1) 7.476 8.399 19.628 < 2e-16 ***
## s(RH_2) 4.698 5.915 48.511 < 2e-16 ***
## s(RH_3) 7.479 8.429 24.728 < 2e-16 ***
## s(RH_8) 7.778 8.612 12.196 < 2e-16 ***
## s(T_out) 7.223 8.286 6.016 < 2e-16 ***
## s(Windspeed) 3.855 4.786 7.056 4.12e-06 ***
## s(Visibility) 1.017 1.033 8.032 0.00434 **
## s(NSM) 8.876 8.995 125.904 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.256 Deviance explained = 26%
## -REML = 93110 Scale est. = 7675.8 n = 15788
## Predictions on the test set
gam_pred_test <- predict(gam_model, newdata = test_df)
gam_test_mse <- mean((test_df$Appliances - gam_pred_test)^2)
gam_test_rmse <- rmse_vec(test_df$Appliances, gam_pred_test)
cat("Test MSE :", gam_test_mse, "\n")
## Test MSE : 8645.021
cat("Test RMSE :", gam_test_rmse, "\n")
## Test RMSE : 92.97861
par(mfrow = c(2, 3))
plot(gam_model, select = 1, shade = TRUE, main = "s(T2)")
plot(gam_model, select = 2, shade = TRUE, main = "s(T3)")
plot(gam_model, select = 3, shade = TRUE, main = "s(T6)")
plot(gam_model, select = 4, shade = TRUE, main = "s(T8)")
plot(gam_model, select = 5, shade = TRUE, main = "s(T9)")
plot(gam_model, select = 6, shade = TRUE, main = "s(NSM)") # not showing up for some reason
par(mfrow = c(1, 1))
summary(gam_model)$s.table
## edf Ref.df F p-value
## s(T2) 6.312886 7.439505 35.577302 0.000000e+00
## s(T3) 4.876515 6.057422 101.272935 0.000000e+00
## s(T6) 8.429195 8.864192 8.457661 0.000000e+00
## s(T8) 5.129514 6.349713 26.481780 0.000000e+00
## s(T9) 8.310258 8.830235 21.367168 0.000000e+00
## s(RH_1) 7.476447 8.399206 19.627642 0.000000e+00
## s(RH_2) 4.697773 5.915236 48.510559 0.000000e+00
## s(RH_3) 7.479280 8.429234 24.727568 0.000000e+00
## s(RH_8) 7.778237 8.612006 12.195629 0.000000e+00
## s(T_out) 7.223058 8.286421 6.015937 -3.988996e-07
## s(Windspeed) 3.854557 4.786017 7.055898 4.116443e-06
## s(Visibility) 1.016840 1.033283 8.032116 4.338325e-03
## s(NSM) 8.876162 8.995190 125.904373 0.000000e+00
set.seed(123)
# Fit a fairly large regression tree first
tree_full <- rpart(
Appliances ~ .,
data = train_df,
method = "anova", # regression tree
control = rpart.control(cp = 0.001) # small cp so tree can grow
)
printcp(tree_full) # cross-validated error table
##
## Regression tree:
## rpart(formula = Appliances ~ ., data = train_df, method = "anova",
## control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] Day_of_week_Friday Day_of_week_Monday Day_of_week_Saturday
## [4] Day_of_week_Thursday Day_of_week_Tuesday lights
## [7] NSM Press_mm_hg RH_1
## [10] RH_2 RH_3 RH_4
## [13] RH_5 RH_6 RH_7
## [16] RH_8 RH_9 RH_out
## [19] T_out T1 T2
## [22] T3 T4 T5
## [25] T6 T7 T8
## [28] T9 Tdewpoint Visibility
## [31] Windspeed
##
## Root node error: 162871592/15788 = 10316
##
## n= 15788
##
## CP nsplit rel error xerror xstd
## 1 0.0870844 0 1.00000 1.00005 0.031983
## 2 0.0295618 1 0.91292 0.91385 0.029663
## 3 0.0227679 2 0.88335 0.88533 0.028731
## 4 0.0110660 3 0.86059 0.86288 0.027993
## 5 0.0107357 4 0.84952 0.85268 0.027800
## 6 0.0082624 7 0.81731 0.84271 0.027583
## 7 0.0063616 8 0.80905 0.83041 0.027134
## 8 0.0062898 10 0.79633 0.83133 0.027175
## 9 0.0055143 13 0.77746 0.82580 0.026980
## 10 0.0047455 14 0.77194 0.81370 0.026629
## 11 0.0039172 15 0.76720 0.79162 0.025988
## 12 0.0036326 16 0.76328 0.77149 0.025358
## 13 0.0034323 17 0.75965 0.77017 0.025239
## 14 0.0033259 18 0.75622 0.76870 0.025199
## 15 0.0032727 21 0.74624 0.76924 0.025227
## 16 0.0032579 22 0.74297 0.76970 0.025232
## 17 0.0032377 24 0.73645 0.76822 0.025162
## 18 0.0031917 25 0.73321 0.76961 0.025175
## 19 0.0029874 29 0.71948 0.77116 0.025184
## 20 0.0029258 31 0.71350 0.77528 0.025296
## 21 0.0029212 32 0.71058 0.77363 0.025291
## 22 0.0029157 34 0.70473 0.77363 0.025291
## 23 0.0028958 35 0.70182 0.77311 0.025288
## 24 0.0028342 37 0.69603 0.77100 0.025202
## 25 0.0027446 38 0.69319 0.76918 0.025138
## 26 0.0026992 46 0.67079 0.76874 0.025133
## 27 0.0026808 47 0.66809 0.77082 0.025197
## 28 0.0026700 48 0.66541 0.77112 0.025200
## 29 0.0024487 49 0.66274 0.76849 0.025136
## 30 0.0023376 51 0.65784 0.76582 0.024925
## 31 0.0023163 52 0.65550 0.76471 0.024875
## 32 0.0023025 54 0.65087 0.76370 0.024818
## 33 0.0022598 58 0.64166 0.76184 0.024795
## 34 0.0022478 64 0.62702 0.76200 0.024845
## 35 0.0022227 65 0.62477 0.75990 0.024810
## 36 0.0022216 66 0.62255 0.75634 0.024496
## 37 0.0021771 67 0.62033 0.75643 0.024504
## 38 0.0021029 68 0.61815 0.75701 0.024480
## 39 0.0020804 69 0.61605 0.75817 0.024500
## 40 0.0020779 70 0.61397 0.75805 0.024527
## 41 0.0020361 71 0.61189 0.75681 0.024505
## 42 0.0020078 73 0.60782 0.75718 0.024528
## 43 0.0019846 74 0.60581 0.75493 0.024477
## 44 0.0019362 75 0.60382 0.75813 0.024543
## 45 0.0019269 76 0.60189 0.75481 0.024399
## 46 0.0018823 79 0.59611 0.75241 0.024376
## 47 0.0018693 80 0.59422 0.75186 0.024360
## 48 0.0018666 81 0.59236 0.75000 0.024316
## 49 0.0017865 82 0.59049 0.74782 0.024309
## 50 0.0017837 84 0.58692 0.74599 0.024267
## 51 0.0017827 85 0.58513 0.74605 0.024271
## 52 0.0017463 88 0.57978 0.74611 0.024294
## 53 0.0017241 89 0.57804 0.74680 0.024317
## 54 0.0016993 90 0.57631 0.74400 0.024184
## 55 0.0016966 92 0.57291 0.74305 0.024185
## 56 0.0016311 93 0.57122 0.74485 0.024206
## 57 0.0015936 95 0.56796 0.74493 0.024201
## 58 0.0015759 96 0.56636 0.74530 0.024215
## 59 0.0015610 99 0.56163 0.74394 0.024171
## 60 0.0015600 102 0.55695 0.74420 0.024170
## 61 0.0015568 103 0.55539 0.74491 0.024179
## 62 0.0015396 106 0.55072 0.74380 0.024143
## 63 0.0015023 107 0.54918 0.74307 0.024148
## 64 0.0014538 108 0.54768 0.74037 0.024136
## 65 0.0014279 109 0.54623 0.74353 0.024220
## 66 0.0014162 111 0.54337 0.74386 0.024231
## 67 0.0014084 112 0.54195 0.74367 0.024223
## 68 0.0014073 113 0.54054 0.74387 0.024221
## 69 0.0013849 114 0.53914 0.74359 0.024149
## 70 0.0013692 118 0.53360 0.74483 0.024170
## 71 0.0013603 119 0.53223 0.74353 0.024135
## 72 0.0013469 120 0.53087 0.74278 0.024116
## 73 0.0013262 121 0.52952 0.74184 0.024114
## 74 0.0013226 122 0.52820 0.74052 0.024107
## 75 0.0013004 123 0.52687 0.74086 0.024127
## 76 0.0012884 124 0.52557 0.73923 0.024071
## 77 0.0012870 125 0.52428 0.74052 0.024113
## 78 0.0012852 126 0.52300 0.74052 0.024113
## 79 0.0012809 128 0.52043 0.74030 0.024113
## 80 0.0012786 129 0.51915 0.74080 0.024131
## 81 0.0012749 130 0.51787 0.74080 0.024131
## 82 0.0012602 133 0.51404 0.73943 0.024120
## 83 0.0012568 134 0.51278 0.73785 0.024070
## 84 0.0012434 136 0.51027 0.73734 0.024070
## 85 0.0012332 137 0.50903 0.73678 0.024067
## 86 0.0012235 138 0.50779 0.73704 0.024078
## 87 0.0012184 139 0.50657 0.73719 0.024091
## 88 0.0011619 142 0.50291 0.73696 0.024077
## 89 0.0011568 144 0.50059 0.73679 0.024096
## 90 0.0011324 145 0.49943 0.73578 0.024000
## 91 0.0011299 146 0.49830 0.73541 0.024005
## 92 0.0011229 152 0.49152 0.73604 0.024009
## 93 0.0011228 153 0.49040 0.73642 0.024045
## 94 0.0010969 154 0.48928 0.73636 0.024015
## 95 0.0010818 157 0.48598 0.73652 0.023991
## 96 0.0010806 161 0.48166 0.73478 0.023919
## 97 0.0010635 164 0.47842 0.73401 0.023926
## 98 0.0010238 165 0.47735 0.73433 0.023927
## 99 0.0010228 166 0.47633 0.73335 0.023910
## 100 0.0010223 167 0.47531 0.73335 0.023910
## 101 0.0010206 169 0.47326 0.73322 0.023910
## 102 0.0010202 170 0.47224 0.73322 0.023910
## 103 0.0010079 171 0.47122 0.73293 0.023908
## 104 0.0010008 172 0.47021 0.73112 0.023579
## 105 0.0010000 173 0.46921 0.73064 0.023571
plotcp(tree_full) # visual CP vs. xerror plot
##### Here I fit a regression tree using rpart, with Appliances as the
continuous response and all other variables in train_df as predictors.
The method = “anova” option specifies a regression tree rather than a
classification tree. I set a small initial complexity parameter cp =
0.001 so that the tree can grow reasonably deep before pruning. The
printcp() and plotcp() outputs show the cross-validated error (xerror)
for different tree sizes, which we use to choose the optimal amount of
pruning.
tree_full <- rpart(
Appliances ~ .,
data = train_df,
method = "anova",
control = rpart.control(
cp = 0.001,
maxdepth = 7, # limit depth
minsplit = 100, # need at least 100 obs to split
minbucket = 50 # leaf must have at least 50 obs
)
)
cptab <- tree_full$cptable
minrow <- which.min(cptab[, "xerror"])
xerr_min <- cptab[minrow, "xerror"]
xerr_se <- cptab[minrow, "xstd"] # sandard error
cp_1se <- cptab[which(cptab[, "xerror"] <= xerr_min + xerr_se)[1], "CP"]
cp_1se
## [1] 0.006289805
tree_pruned <- prune(tree_full, cp = cp_1se)
rpart.plot(
tree_pruned,
type = 2,
extra = 101,
fallen.leaves = TRUE,
tweak = 1.1,
main = "Regression Tree: Appliances Energy Use (Pruned, 1-SE Rule)"
)
rpart.plot(
tree_full,
type = 2,
extra = 101,
fallen.leaves = TRUE,
tweak = 1.1,
main = "Regression Tree: Appliances Energy Use (Constrained Full Tree)"
)
# Predictions on the held-out test set
tree_pred_test <- predict(tree_pruned, newdata = test_df)
tree_test_mse <- mean((test_df$Appliances - tree_pred_test)^2)
tree_test_rmse <- rmse_vec(test_df$Appliances, tree_pred_test)
cat("Decision Tree - Test MSE :", tree_test_mse, "\n")
## Decision Tree - Test MSE : 9662.111
cat("Decision Tree - Test RMSE:", tree_test_rmse, "\n")
## Decision Tree - Test RMSE: 98.29604
# Variable importance scores
tree_importance <- tree_pruned$variable.importance
tree_importance[order(tree_importance, decreasing = TRUE)][1:10]
## NSM lights RH_7 T9 RH_8 T8 RH_4 T7
## 18998355 3708244 3157707 2818289 2560720 2343179 1887990 1885982
## T3 RH_out
## 1802338 1553610
set.seed(123)
# Number of predictor columns (exclude response)
p <- ncol(train_df) - 1
bag_model <- randomForest(
Appliances ~ .,
data = train_df,
ntree = 200, # number of bootstrap trees
mtry = floor(p/2), # use ALL predictors at each split → pure bagging
importance = TRUE
)
bag_model
##
## Call:
## randomForest(formula = Appliances ~ ., data = train_df, ntree = 200, mtry = floor(p/2), importance = TRUE)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 17
##
## Mean of squared residuals: 4489.437
## % Var explained: 56.48
# Predictions on the held-out test set
bag_pred_test <- predict(bag_model, newdata = test_df)
bag_test_mse <- mean((test_df$Appliances - bag_pred_test)^2)
bag_test_rmse <- rmse_vec(test_df$Appliances, bag_pred_test)
cat("Bagging - Test MSE :", bag_test_mse, "\n")
## Bagging - Test MSE : 5043.248
cat("Bagging - Test RMSE:", bag_test_rmse, "\n")
## Bagging - Test RMSE: 71.01583
bag_importance <- importance(bag_model)
bag_importance[order(bag_importance[, "IncNodePurity"], decreasing = TRUE), ][1:10, ]
## %IncMSE IncNodePurity
## NSM 69.40215 23614816
## T3 23.08629 7570444
## RH_3 22.23678 7307082
## Press_mm_hg 26.01731 6519752
## RH_1 18.78242 6398624
## RH_5 23.28624 6291720
## RH_2 18.31191 6208646
## T8 20.51969 6057218
## RH_out 10.81572 5470730
## RH_8 15.75840 5453627
varImpPlot(bag_model, n.var = 10, main = "Bagging: Top 10 Important Predictors")
##### The bagged ensemble also provides variable importance measures
based on the total reduction in node impurity contributed by each
predictor across all trees. The variable-importance metrics highlight a
clear hierarchy of influential predictors. NSM (time of day) dominates
by a large margin, with a %IncMSE of 143.07 and the highest
IncNodePurity value (27.6 million). This indicates that interrupting the
daily cycle pattern dramatically worsens prediction accuracy, confirming
that appliance usage is strongly governed by consistent behavioral
routines across the day. After NSM, the next most influential variables
include T3, RH_5, Press_mm_hg, RH_3, RH_2, T8, and RH_7, each showing
substantial contributions to predictive improvement. These predictors
likely reflect localized heating/cooling demands and humidity-driven
HVAC responses across specific rooms—patterns that linear and additive
models only partially revealed. Additional humidity measures such as
RH_1 and RH_9 also appear among the top contributors, reinforcing the
role of indoor environmental conditions in shaping appliance energy
usage. Overall, the random forest uncovers a structure in which
time-of-day effects, room-specific thermal and humidity conditions, and
atmospheric pressure collectively drive energy consumption. This
importance ranking confirms the forest’s strength in capturing complex
interactions and nonlinearities that simpler models struggled to
detect.
library(randomForest)
set.seed(123)
p <- ncol(train_df) - 1 # number of predictors (excluding Appliances)
rf_model <- randomForest(
Appliances ~ .,
data = train_df,
ntree = 200, # number of trees
mtry = floor(sqrt(p)), # random subset of predictors at each split
importance = TRUE
)
rf_model
##
## Call:
## randomForest(formula = Appliances ~ ., data = train_df, ntree = 200, mtry = floor(sqrt(p)), importance = TRUE)
## Type of random forest: regression
## Number of trees: 200
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 4389.483
## % Var explained: 57.45
rf_pred_test <- predict(rf_model, newdata = test_df)
rf_test_mse <- mean((test_df$Appliances - rf_pred_test)^2)
rf_test_rmse <- rmse_vec(test_df$Appliances, rf_pred_test)
cat("Random Forest - Test MSE :", rf_test_mse, "\n")
## Random Forest - Test MSE : 5000.661
cat("Random Forest - Test RMSE:", rf_test_rmse, "\n")
## Random Forest - Test RMSE: 70.71535
rf_importance <- importance(rf_model)
rf_importance[order(rf_importance[, "IncNodePurity"], decreasing = TRUE), ][1:10, ]
## %IncMSE IncNodePurity
## NSM 35.11289 14931932
## RH_3 18.28553 6649857
## RH_1 18.49057 6635205
## T3 17.50554 6506511
## RH_8 17.32931 6189798
## RH_2 16.65122 6152447
## Press_mm_hg 24.00169 6043832
## RH_out 15.98219 5987962
## RH_5 24.33339 5850420
## RH_7 18.16817 5738859
varImpPlot(rf_model, n.var = 10, main = "Random Forest: Top 10 Important Predictors")
##### The random forest’s variable importance output shows a clear
hierarchy of predictors, with NSM (time of day) again emerging as the
dominant variable. Its %IncMSE of 55.89 and the highest IncNodePurity
value (15.1 million) indicate that disrupting the daily cycle
substantially harms prediction accuracy consistent with the strong
diurnal patterns observed in the GAM and earlier models. Several indoor
humidity measures (RH_3, RH_1, RH_2, RH_8, RH_5) also rank highly, each
contributing substantial reductions in node impurity and reflecting the
critical role of humidity-driven HVAC activity in appliance usage.
Indoor temperatures such as T3 and T8 show similarly high importance,
confirming that thermal conditions in specific rooms strongly influence
energy consumption. Interestingly, Press_mm_hg (atmospheric pressure)
also appears among the top predictors, suggesting that broader weather
patterns indirectly affect appliance demand, likely through their impact
on indoor comfort systems. Overall, the forest highlights the same
recurring themes—daily usage cycles, room-specific thermal and humidity
conditions, and weather influences—but with a more stable and
data-driven ranking. This reinforces the forest’s strength in capturing
complex, nonlinear interactions that simpler models cannot express.
set.seed(123)
boost_fit <- gbm(
formula = Appliances ~ .,
data = train_df,
distribution = "gaussian",
n.trees = 3000, # large number so boosting can learn gradually
interaction.depth = 3, # small trees to prevent overfitting
shrinkage = 0.01, # learning rate
n.minobsinnode = 10,
bag.fraction = .5,
train.fraction = 1.0,
verbose = FALSE # boosting uses full sample each round
)
summary(boost_fit)
## var rel.inf
## NSM NSM 26.891623644
## T3 T3 6.082827965
## lights lights 5.602321387
## RH_3 RH_3 4.985189206
## Press_mm_hg Press_mm_hg 4.115331326
## RH_2 RH_2 3.946082576
## T9 T9 3.779138105
## RH_1 RH_1 3.700807145
## RH_5 RH_5 3.364677651
## RH_8 RH_8 3.363494436
## T8 T8 3.283179836
## T5 T5 2.641563501
## T4 T4 2.611329317
## RH_9 RH_9 2.561210311
## RH_7 RH_7 2.503686897
## T1 T1 2.465843774
## T7 T7 2.401316404
## T6 T6 2.049383742
## Windspeed Windspeed 1.843437917
## RH_6 RH_6 1.748965493
## T_out T_out 1.617711510
## RH_4 RH_4 1.594587458
## Tdewpoint Tdewpoint 1.584639662
## T2 T2 1.576216884
## RH_out RH_out 1.348419679
## Day_of_week_Monday Day_of_week_Monday 0.684416943
## Visibility Visibility 0.515110925
## Day_of_week_Saturday Day_of_week_Saturday 0.505955051
## Day_of_week_Friday Day_of_week_Friday 0.417381058
## Day_of_week_Tuesday Day_of_week_Tuesday 0.139210962
## Day_of_week_Sunday Day_of_week_Sunday 0.045466625
## WeekStatus_Weekend WeekStatus_Weekend 0.013781212
## Day_of_week_Thursday Day_of_week_Thursday 0.010712019
## Day_of_week_Wednesday Day_of_week_Wednesday 0.004979378
## WeekStatus_Weekday WeekStatus_Weekday 0.000000000
best_iter <- gbm.perf(boost_fit, method = "OOB")
## OOB generally underestimates the optimal number of iterations although predictive performance is reasonably competitive. Using cv_folds>1 when calling gbm usually results in improved predictive performance.
best_iter
## [1] 2450
## attr(,"smoother")
## Call:
## loess(formula = object$oobag.improve ~ x, enp.target = min(max(4,
## length(x)/10), 50))
##
## Number of Observations: 3000
## Equivalent Number of Parameters: 39.99
## Residual Standard Error: 0.4824
boost_pred <- predict(boost_fit, newdata = test_df, n.trees = best_iter)
boost_test_mse <- mean((test_df$Appliances - boost_pred)^2)
boost_test_rmse <- sqrt(boost_test_mse)
cat("Boosting - Test MSE :", boost_test_mse, "\n")
## Boosting - Test MSE : 7778.658
cat("Boosting - Test RMSE:", boost_test_rmse, "\n")
## Boosting - Test RMSE: 88.1967
# Split into X / y for BART
X_train_bart <- as.matrix(train_df[ , setdiff(names(train_df), "Appliances") ])
y_train_bart <- train_df$Appliances
X_test_bart <- as.matrix(test_df[ , setdiff(names(test_df), "Appliances") ])
y_test_bart <- test_df$Appliances
bart_model <- bart(
x.train = X_train_bart,
y.train = y_train_bart,
x.test = X_test_bart,
ntree = 200, # number of trees in the sum-of-trees model
ndpost = 1000, # number of posterior draws to keep
nskip = 200, # burn-in iterations
sigdf = 3, # prior on error variance
sigquant = 0.9,
k = 2 # controls overall smoothness / regularization
)
##
## Running BART with numeric y
##
## number of trees: 200
## number of chains: 1, default number of threads 1
## tree thinning rate: 1
## Prior:
## k prior fixed to 2.000000
## degrees of freedom in sigma prior: 3.000000
## quantile in sigma prior: 0.900000
## scale in sigma prior: 0.001447
## power and base for tree prior: 2.000000 0.950000
## use quantiles for rule cut points: false
## proposal probabilities: birth/death 0.50, swap 0.10, change 0.40; birth 0.50
## data:
## number of training observations: 15788
## number of test observations: 3947
## number of explanatory variables: 35
## init sigma: 92.216728, curr sigma: 92.216728
##
## Cutoff rules c in x<=c vs x>c
## Number of cutoffs: (var: number of possible c):
## (1: 100) (2: 100) (3: 100) (4: 100) (5: 100)
## (6: 100) (7: 100) (8: 100) (9: 100) (10: 100)
## (11: 100) (12: 100) (13: 100) (14: 100) (15: 100)
## (16: 100) (17: 100) (18: 100) (19: 100) (20: 100)
## (21: 100) (22: 100) (23: 100) (24: 100) (25: 100)
## (26: 100) (27: 100) (28: 100) (29: 100) (30: 100)
## (31: 100) (32: 100) (33: 100) (34: 100) (35: 100)
##
## Running mcmc loop:
## iteration: 100 (of 1000)
## iteration: 200 (of 1000)
## iteration: 300 (of 1000)
## iteration: 400 (of 1000)
## iteration: 500 (of 1000)
## iteration: 600 (of 1000)
## iteration: 700 (of 1000)
## iteration: 800 (of 1000)
## iteration: 900 (of 1000)
## iteration: 1000 (of 1000)
## total seconds in loop: 22.253973
##
## Tree sizes, last iteration:
## [1] 1 2 3 4 5 5 2 2 3 5 2 2 3 2 4 3 2 3
## 2 2 2 3 7 3 3 2 2 8 5 8 3 8 6 3 5 3 3 2
## 2 2 3 3 2 2 5 5 4 2 3 4 2 4 2 5 2 5 3 4
## 4 2 4 7 4 4 3 5 3 5 3 2 4 4 2 4 7 3 2 2
## 2 3 4 3 9 2 3 2 3 2 4 2 4 7 6 3 5 3 5 3
## 5 5 7 4 7 4 5 2 4 3 2 6 2 8 8 3 4 5 4 2
## 2 2 5 3 2 3 3 6 3 3 1 8 3 5 3 4 3 2 4 4
## 5 4 8 5 3 2 5 4 6 1 4 5 2 4 3 6 4 5 2 6
## 4 3 5 2 6 4 6 3 2 4 7 2 2 2 3 6 4 2 3 3
## 3 2 3 4 5 3 5 2 3 3 5 5 3 2 2 5 3 6 2 2
## 2 4
##
## Variable Usage, last iteration (var:count):
## (1: 17) (2: 10) (3: 13) (4: 16) (5: 18)
## (6: 19) (7: 21) (8: 23) (9: 7) (10: 13)
## (11: 17) (12: 18) (13: 17) (14: 15) (15: 7)
## (16: 27) (17: 20) (18: 15) (19: 17) (20: 7)
## (21: 22) (22: 16) (23: 12) (24: 8) (25: 17)
## (26: 40) (27: 12) (28: 10) (29: 25) (30: 15)
## (31: 9) (32: 12) (33: 8) (34: 7) (35: 6)
##
## DONE BART
bart_model
##
## Call:
## bart(x.train = X_train_bart, y.train = y_train_bart, x.test = X_test_bart,
## sigdf = 3, sigquant = 0.9, k = 2, ntree = 200, ndpost = 1000,
## nskip = 200)
# Posterior mean predictions on test set
bart_pred_mean <- bart_model$yhat.test.mean
bart_test_mse <- mean((y_test_bart - bart_pred_mean)^2)
bart_test_rmse <- rmse_vec(y_test_bart, bart_pred_mean)
cat("BART - Test MSE :", bart_test_mse, "\n")
## BART - Test MSE : 6797.507
cat("BART - Test RMSE:", bart_test_rmse, "\n")
## BART - Test RMSE: 82.44699
# Optional: 95% predictive intervals for a few points
bart_pred_draws <- bart_model$yhat.test # ndpost x n_test matrix
bart_pi_lower <- apply(bart_pred_draws, 2, quantile, probs = 0.025)
bart_pi_upper <- apply(bart_pred_draws, 2, quantile, probs = 0.975)
# Simple variable importance: how often each variable is used in splits
var_counts <- colSums(bart_model$varcount) # total split counts per variable
var_imp_bart <- sort(var_counts, decreasing = TRUE)
head(var_imp_bart, 10)
## NSM T4 Press_mm_hg Day_of_week_Friday
## 39290 22834 20464 20336
## lights T8 T3 RH_3
## 18480 17807 17416 17233
## RH_6 T2
## 17221 17174
set.seed(123)
# Convert to DMatrix format
dtrain <- xgb.DMatrix(data = as.matrix(train_df[ , setdiff(names(train_df), "Appliances") ]),
label = train_df$Appliances)
dtest <- xgb.DMatrix(data = as.matrix(test_df[ , setdiff(names(test_df), "Appliances") ]),
label = test_df$Appliances)
# Parameter grid (simple but effective)
params <- list(
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.05, # learning rate
max_depth = 6, # tree depth
subsample = 0.8, # row sampling
colsample_bytree = 0.8 # feature sampling
)
# Train with early stopping
xgb_fit <- xgb.train(
params = params,
data = dtrain,
nrounds = 2000,
evals = list(train = dtrain, test = dtest),
early_stopping_rounds = 50,
print_every_n = 100
)
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
##
## [1] train-rmse:100.324828 test-rmse:105.055649
## [101] train-rmse:67.276493 test-rmse:82.966467
## [201] train-rmse:56.443255 test-rmse:78.461547
## [301] train-rmse:49.727525 test-rmse:76.004070
## [401] train-rmse:44.254773 test-rmse:73.917716
## [501] train-rmse:39.941497 test-rmse:72.700899
## [601] train-rmse:36.127120 test-rmse:71.717500
## [701] train-rmse:33.052256 test-rmse:70.859266
## [801] train-rmse:30.215661 test-rmse:70.153209
## [901] train-rmse:27.881096 test-rmse:69.604969
## [1001] train-rmse:25.912498 test-rmse:69.087980
## [1101] train-rmse:24.143284 test-rmse:68.732501
## [1201] train-rmse:22.535929 test-rmse:68.461505
## [1301] train-rmse:21.048308 test-rmse:68.161049
## [1401] train-rmse:19.669374 test-rmse:67.924789
## [1501] train-rmse:18.487249 test-rmse:67.695308
## [1601] train-rmse:17.422501 test-rmse:67.514204
## [1701] train-rmse:16.387272 test-rmse:67.283664
## [1801] train-rmse:15.471061 test-rmse:67.153439
## [1901] train-rmse:14.656492 test-rmse:66.999631
## [2000] train-rmse:13.909308 test-rmse:66.872899
xgb_pred <- predict(xgb_fit, dtest)
xgb_mse <- mean((test_df$Appliances - xgb_pred)^2)
xgb_rmse <- sqrt(xgb_mse)
cat("XGBoost Test MSE :", xgb_mse, "\n")
## XGBoost Test MSE : 4471.985
cat("XGBoost Test RMSE:", xgb_rmse, "\n")
## XGBoost Test RMSE: 66.8729
importance <- xgb.importance(model = xgb_fit)
print(importance)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: NSM 0.1677424085 0.0424520130 0.0465144087
## 2: RH_1 0.0537006946 0.0564392619 0.0781689911
## 3: T3 0.0522081014 0.0473575442 0.0409596382
## 4: RH_2 0.0456747154 0.0538373550 0.0559607796
## 5: RH_3 0.0450487977 0.0508322458 0.0509386584
## 6: T1 0.0403050624 0.0363465285 0.0692335286
## 7: Press_mm_hg 0.0399039774 0.0467030555 0.0348613481
## 8: RH_5 0.0344625522 0.0431525213 0.0450795169
## 9: T8 0.0331583381 0.0331368255 0.0282738904
## 10: lights 0.0328184601 0.0193208598 0.0340243279
## 11: T6 0.0322289620 0.0445396410 0.0373506680
## 12: RH_6 0.0312140887 0.0398325139 0.0346004587
## 13: RH_4 0.0308438910 0.0429811126 0.0405139521
## 14: RH_9 0.0307589039 0.0451412871 0.0365462590
## 15: RH_8 0.0299644453 0.0454261346 0.0368832411
## 16: T2 0.0298300170 0.0348797027 0.0440903112
## 17: RH_7 0.0290777224 0.0437223483 0.0332851413
## 18: T4 0.0283031386 0.0301831995 0.0305566728
## 19: RH_out 0.0277190918 0.0421871926 0.0362744991
## 20: T5 0.0268570071 0.0236489227 0.0251866990
## 21: T_out 0.0255807793 0.0314875315 0.0254584588
## 22: Tdewpoint 0.0250687114 0.0351532730 0.0296544302
## 23: T9 0.0247134865 0.0200200022 0.0152946420
## 24: Windspeed 0.0235243935 0.0317556116 0.0286978357
## 25: T7 0.0226944307 0.0230390414 0.0214472840
## 26: Visibility 0.0144553899 0.0266648737 0.0283499832
## 27: Day_of_week_Monday 0.0061761933 0.0017194781 0.0016088181
## 28: Day_of_week_Friday 0.0059462094 0.0029015092 0.0027284685
## 29: Day_of_week_Saturday 0.0034218730 0.0011006291 0.0016305588
## 30: Day_of_week_Tuesday 0.0020831982 0.0009565326 0.0012718359
## 31: Day_of_week_Wednesday 0.0013247909 0.0006229336 0.0012066135
## 32: WeekStatus_Weekday 0.0012721674 0.0006750900 0.0010544281
## 33: Day_of_week_Thursday 0.0009125293 0.0007778983 0.0011631320
## 34: Day_of_week_Sunday 0.0006520216 0.0007324066 0.0008261498
## 35: WeekStatus_Weekend 0.0003534501 0.0002729236 0.0003043710
## Feature Gain Cover Frequency
xgb.plot.importance(importance, top_n = 15)
##### The XGBoost feature-importance results show that NSM (time of day)
is by far the most influential predictor, contributing roughly 16.8% of
the total gain. This dominant role reflects the strong daily cycles in
appliance usage, with predictable spikes during morning and evening
activity periods. Several indoor humidity and temperature
variables—RH_1, T3, RH_2, RH_3, and T1—also appear prominently,
indicating that XGBoost detects nonlinear HVAC-related responses tied to
indoor environmental conditions. These variables likely capture how
temperature and humidity influence heating, cooling, and ventilation
demands throughout different areas of the home. Additional contributors
such as Press_mm_hg, RH_5, and T8 suggest that subtle atmospheric
pressure and localized room conditions also play meaningful roles in
shaping energy consumption patterns. Finally, the lights variable
remains a strong behavioral indicator, with its gain value reflecting
how lighting usage often coincides with broader household activity.
Overall, XGBoost’s importance rankings confirm that both human routines
(NSM, lights) and environmental drivers (temperature, humidity,
pressure) interact to shape appliance energy usage aligning closely with
the patterns seen in the GAM, boosting, and BART models.
# knn
knn_test_mse <- mean((y_test - pred_test)^2)
knn_test_rmse <- sqrt(knn_test_mse)
# Linear Regression
lm_pred_test <- predict(lm_full, newdata = test_df)
lm_test_mse <- mean((test_df$Appliances - lm_pred_test)^2)
lm_test_rmse <- sqrt(lm_test_mse)
# BIC
step_test_mse <- mean((test_df$Appliances - step_pred_test)^2)
step_test_rmse <- sqrt(step_test_mse)
# Ridge & Lasso
ridge_test_mse <-as.numeric(ridge_mse)
ridge_test_rmse <- sqrt(ridge_test_mse)
lasso_test_mse <- as.numeric(lasso_mse)
lasso_test_rmse <- sqrt(lasso_test_mse)
# GAM
gam_test_mse <- mean((test_df$Appliances - gam_pred_test)^2)
gam_test_rmse <- sqrt(gam_test_mse)
# 6Decision Tree
tree_test_mse <- mean((test_df$Appliances - tree_pred_test)^2)
tree_test_rmse <- sqrt(tree_test_mse)
# Bagging
bag_test_mse <- mean((test_df$Appliances - bag_pred_test)^2)
bag_test_rmse <- sqrt(bag_test_mse)
bag_test_mse
## [1] 5043.248
# Random Forest
rf_test_mse <- mean((test_df$Appliances - rf_pred_test)^2)
rf_test_rmse <- sqrt(rf_test_mse)
rf_test_mse
## [1] 5000.661
# Boosting
boost_test_mse <- mean((test_df$Appliances - boost_pred)^2)
boost_test_rmse <- sqrt(boost_test_mse)
# BART
bart_test_mse <- mean((y_test_bart - bart_pred_mean)^2)
bart_test_rmse <- sqrt(bart_test_mse)
# XGBoost
xgb_test_mse <- as.numeric(xgb_mse)
xgb_test_rmse <- sqrt(xgb_test_mse)
model_results <- data.frame(
Model = c(
"KNN",
"Linear Regression",
"Subset Selection (BIC)",
"Ridge",
"Lasso",
"GAM",
"Decision Tree",
"Bagging",
"Random Forest",
"Boosting (GBM)",
"BART",
"XGBoost"
),
Test_MSE = c(
knn_test_mse,
lm_test_mse,
step_test_mse,
ridge_test_mse,
lasso_test_mse,
gam_test_mse,
tree_test_mse,
bag_test_mse,
rf_test_mse,
boost_test_mse,
bart_test_mse,
xgb_test_mse
),
Test_RMSE = c(
knn_test_rmse,
lm_test_rmse,
step_test_rmse,
ridge_test_rmse,
lasso_test_rmse,
gam_test_rmse,
tree_test_rmse,
bag_test_rmse,
rf_test_rmse,
boost_test_rmse,
bart_test_rmse,
xgb_test_rmse))
model_results
## Model Test_MSE Test_RMSE
## 1 KNN 5740.326 75.76494
## 2 Linear Regression 9482.650 97.37890
## 3 Subset Selection (BIC) 9482.421 97.37772
## 4 Ridge 9558.157 97.76583
## 5 Lasso 9483.875 97.38519
## 6 GAM 8645.021 92.97861
## 7 Decision Tree 9662.111 98.29604
## 8 Bagging 5043.248 71.01583
## 9 Random Forest 5000.661 70.71535
## 10 Boosting (GBM) 7778.658 88.19670
## 11 BART 6797.507 82.44699
## 12 XGBoost 4471.985 66.87290
y_mean_train <- mean(y_train)
y_sd_train <- sd(y_train)
y_rng_train <- diff(range(y_train)) # max - min
baseline_pred <- rep(y_mean_train, length(y_test))
baseline_mse <- mean((y_test - baseline_pred)^2)
baseline_rmse <- sqrt(baseline_mse)
model_results$NRMSE_mean <- model_results$Test_RMSE / y_mean_train
model_results$NRMSE_sd <- model_results$Test_RMSE / y_sd_train
model_results$NRMSE_range <- model_results$Test_RMSE / y_rng_train
# R^2 on the test set, using the mean-only baseline
model_results$Test_R2 <- 1 - (model_results$Test_MSE / baseline_mse)
# Optional: percent improvement vs baseline RMSE
model_results$RMSE_Improvement_vs_Baseline <- 100 * (baseline_rmse - model_results$Test_RMSE) / baseline_rmse
model_results[order(model_results$Test_RMSE), ]
## Model Test_MSE Test_RMSE NRMSE_mean NRMSE_sd NRMSE_range
## 12 XGBoost 4471.985 66.87290 0.6889731 0.6583810 0.06249804
## 9 Random Forest 5000.661 70.71535 0.7285609 0.6962109 0.06608911
## 8 Bagging 5043.248 71.01583 0.7316566 0.6991692 0.06636993
## 1 KNN 5740.326 75.76494 0.7805854 0.7459254 0.07080835
## 11 BART 6797.507 82.44699 0.8494288 0.8117120 0.07705326
## 10 Boosting (GBM) 7778.658 88.19670 0.9086664 0.8683193 0.08242682
## 6 GAM 8645.021 92.97861 0.9579331 0.9153984 0.08689589
## 3 Subset Selection (BIC) 9482.421 97.37772 1.0032560 0.9587089 0.09100722
## 2 Linear Regression 9482.650 97.37890 1.0032681 0.9587205 0.09100832
## 5 Lasso 9483.875 97.38519 1.0033329 0.9587824 0.09101419
## 4 Ridge 9558.157 97.76583 1.0072545 0.9625298 0.09136993
## 7 Decision Tree 9662.111 98.29604 1.0127171 0.9677499 0.09186546
## Test_R2 RMSE_Improvement_vs_Baseline
## 12 0.6039496 37.067464
## 9 0.5571287 33.451422
## 8 0.5533571 33.168650
## 1 0.4916220 28.699372
## 11 0.3979954 22.411045
## 10 0.3111021 17.000128
## 6 0.2343748 12.499987
## 3 0.1602125 8.360081
## 2 0.1601922 8.358973
## 5 0.1600838 8.353058
## 4 0.1535052 7.994847
## 7 0.1442987 7.495876