library(knitr)
data_dictionary <- data.frame(
Variable = c("survival", "pclass", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked"),
Definition = c("Survival", "Ticket class", "Sex", "Age in years", "# of siblings/spouses aboard the Titanic",
"# of parents/children aboard the Titanic","Ticket number", "Passenger fare", "Cabin number", "Port of Embarkation"),
Key = c("0 = No, 1 = Yes",
"1 = 1st, 2 = 2nd, 3 = 3rd", "", "", "", "", "", "", "", "C = Cherbourg, Q = Queenstown, S = Southampton"))
kable(data_dictionary, align = "r", caption = "Titanic Variable Definitions")
| Variable | Definition | Key |
|---|---|---|
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| age | Age in years | |
| sibsp | # of siblings/spouses aboard the Titanic | |
| parch | # of parents/children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |
library(ggplot2)
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(knitr)
library(dplyr)
library(stringr)
library(class)
library(tidyr)
library(rpart)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-10
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(dbarts)
##
## Attaching package: 'dbarts'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(xgboost)
test_file <- "/Users/miada/Downloads/STT 481/DataSets/titanic_test.csv"
train_file <- "/Users/miada/Downloads/STT 481/DataSets/titanic_train.csv"
test <- read.csv(test_file)
train <- read.csv(train_file)
# glimpse(test)
# glimpse(train)
# dim(train); dim(test)
train <- train %>%
mutate( across(where(is.character), ~ na_if(str_trim(.), "")),
across(where(is.factor), ~
factor(na_if(str_trim(as.character(.)), ""))))
miss_tbl <- train %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
tidyr::pivot_longer(everything(),
names_to = "Feature",
values_to = "Missing_Count") %>%
mutate(Missing_Percent = round(Missing_Count / nrow(train) * 100, 2)) %>%
arrange(desc(Missing_Percent))
miss_tbl
## # A tibble: 12 × 3
## Feature Missing_Count Missing_Percent
## <chr> <int> <dbl>
## 1 Cabin 687 77.1
## 2 Age 177 19.9
## 3 Embarked 2 0.22
## 4 PassengerId 0 0
## 5 Survived 0 0
## 6 Pclass 0 0
## 7 Name 0 0
## 8 Sex 0 0
## 9 SibSp 0 0
## 10 Parch 0 0
## 11 Ticket 0 0
## 12 Fare 0 0
train %>%
mutate(Cabin_Recorded = !is.na(Cabin)) %>%
group_by(Pclass) %>%
summarise(
Total_Passengers = n(),
Cabin_Recorded_Count = sum(Cabin_Recorded),
Cabin_Recorded_Percent = round(mean(Cabin_Recorded) * 100, 2)
)
## # A tibble: 3 × 4
## Pclass Total_Passengers Cabin_Recorded_Count Cabin_Recorded_Percent
## <int> <int> <int> <dbl>
## 1 1 216 176 81.5
## 2 2 184 16 8.7
## 3 3 491 12 2.44
train <- train %>%
select(-Cabin)
test <- test %>%
select(-Cabin)
train <- train %>%
group_by(Sex, Pclass) %>%
mutate(Age = ifelse(is.na(Age), median(Age, na.rm = TRUE), Age)) %>%
ungroup()
test <- test %>%
group_by(Sex, Pclass) %>%
mutate(Age = ifelse(is.na(Age), median(Age, na.rm = TRUE), Age)) %>%
ungroup()
train_y <- train$Survived
train_x <- train %>% select(-Survived)
combined <- bind_rows(train_x, test)
combined_dummies <- model.matrix(~ . - 1, data = combined)
n_train <- nrow(train_x)
x_train <- combined_dummies[1:n_train, , drop = FALSE]
x_test <- combined_dummies[(n_train + 1):nrow(combined_dummies), , drop = FALSE]
set.seed(10)
n <- nrow(x_train)
val_idx <- sample(seq_len(n), size = floor(0.2 * n))
train_idx <- setdiff(seq_len(n), val_idx)
X_train_raw <- x_train[train_idx, , drop = FALSE]
X_val_raw <- x_train[val_idx, , drop = FALSE]
y_train <- factor(train_y[train_idx], levels = c(0, 1))
y_val <- factor(train_y[val_idx], levels = c(0, 1))
keep_cols <- apply(X_train_raw, 2, sd) > 0
X_train_raw <- X_train_raw[, keep_cols, drop = FALSE]
X_val_raw <- X_val_raw[, keep_cols, drop = FALSE]
X_test_raw <- x_test[, keep_cols, drop = FALSE]
center <- colMeans(X_train_raw)
scalev <- apply(X_train_raw, 2, sd)
scalev[scalev == 0] <- 1
scale_mat <- function(M, center, scalev) {
scale(M, center = center, scale = scalev)
}
X_train <- scale_mat(X_train_raw, center, scalev)
X_val <- scale_mat(X_val_raw, center, scalev)
X_test <- scale_mat(X_test_raw, center, scalev)
k_grid <- seq(1, 25, by = 2)
cv_err <- numeric(length(k_grid))
# 10-fold CV on the 80% training split
set.seed(10)
K <- 10
folds <- sample(rep(1:K, length.out = nrow(X_train)))
for (i in seq_along(k_grid)) {
k_val <- k_grid[i]
fold_err <- numeric(K)
for (f in 1:K) {
tr <- folds != f
te <- folds == f
pred <- knn(
train = X_train[tr, , drop = FALSE],
test = X_train[te, , drop = FALSE],
cl = y_train[tr],
k = k_val
)
fold_err[f] <- mean(pred != y_train[te])
}
cv_err[i] <- mean(fold_err)
}
cv_results <- data.frame(
k = k_grid,
CV_Error = cv_err,
CV_Accuracy = 1 - cv_err
)
cv_results
## k CV_Error CV_Accuracy
## 1 1 0.5160603 0.4839397
## 2 3 0.3955790 0.6044210
## 3 5 0.3843897 0.6156103
## 4 7 0.3829812 0.6170188
## 5 9 0.3829812 0.6170188
## 6 11 0.3843897 0.6156103
## 7 13 0.3829812 0.6170188
## 8 15 0.3843897 0.6156103
## 9 17 0.3843897 0.6156103
## 10 19 0.3843897 0.6156103
## 11 21 0.3829812 0.6170188
## 12 23 0.3843897 0.6156103
## 13 25 0.3843897 0.6156103
best_k <- cv_results$k[which.min(cv_results$CV_Error)]
cat("Chosen k (CV):", best_k, "\n")
## Chosen k (CV): 7
pred_val <- knn(
train = X_train,
test = X_val,
cl = y_train,
k = best_k
)
val_err <- mean(pred_val != y_val)
val_acc <- 1 - val_err
cat(sprintf("Validation error = %.4f and Validation accuracy = %.4f",
val_err, val_acc))
## Validation error = 0.3876 and Validation accuracy = 0.6124
test_ids <- test$PassengerId
drop_cols <- intersect(c("PassengerId", "Name", "Ticket"), names(train))
train_lr <- train %>% select(-all_of(drop_cols))
test_lr <- test %>% select(-all_of(drop_cols))
mode_fun <- function(x) {
ux <- na.omit(x)
ux[which.max(tabulate(match(ux, ux)))]
}
emb_mode <- mode_fun(train_lr$Embarked)
train_lr <- train_lr %>%
mutate(Embarked = ifelse(is.na(Embarked) | Embarked == "", emb_mode, Embarked))
test_lr <- test_lr %>%
mutate(Embarked = ifelse(is.na(Embarked) | Embarked == "", emb_mode, Embarked))
# --- Fare: Kaggle test often has 1 NA Fare; fill with TRAIN mean ---
fare_mean <- mean(train_lr$Fare, na.rm = TRUE)
train_lr <- train_lr %>%
mutate(Fare = ifelse(is.na(Fare), fare_mean, Fare))
test_lr <- test_lr %>%
mutate(Fare = ifelse(is.na(Fare), fare_mean, Fare))
train_lr <- train_lr %>%
mutate(
Sex = factor(Sex),
Embarked = factor(Embarked),
Pclass = factor(Pclass) # treat class as categorical
)
test_lr <- test_lr %>%
mutate(
Sex = factor(Sex, levels = levels(train_lr$Sex)),
Embarked = factor(Embarked, levels = levels(train_lr$Embarked)),
Pclass = factor(Pclass, levels = levels(train_lr$Pclass))
)
stratified_split <- function(df, ycol, p = 0.2, seed = 19) {
set.seed(seed)
y <- df[[ycol]]
idx_by_class <- split(seq_len(nrow(df)), y)
val_idx <- unlist(lapply(idx_by_class, function(idx) {
n_val <- max(1, floor(length(idx) * p))
sample(idx, n_val)
}))
list(
train = df[-val_idx, , drop = FALSE],
valid = df[ val_idx, , drop = FALSE],
val_idx = val_idx
)
}
splits <- stratified_split(train_lr, "Survived", p = 0.20, seed = 42)
tr_df <- splits$train
vl_df <- splits$valid
form_logit <- Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
logit_fit <- glm(form_logit, data = tr_df, family = binomial())
summary(logit_fit)
##
## Call:
## glm(formula = form_logit, family = binomial(), data = tr_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.2464394 0.5429476 7.821 5.24e-15 ***
## Pclass2 -1.0654184 0.3344394 -3.186 0.00144 **
## Pclass3 -2.3379105 0.3426427 -6.823 8.91e-12 ***
## Sexmale -2.6046467 0.2207095 -11.801 < 2e-16 ***
## Age -0.0431169 0.0089903 -4.796 1.62e-06 ***
## SibSp -0.3228002 0.1243911 -2.595 0.00946 **
## Parch -0.0794657 0.1433318 -0.554 0.57929
## Fare 0.0009757 0.0027712 0.352 0.72478
## EmbarkedQ -0.0899537 0.4278010 -0.210 0.83346
## EmbarkedS -0.4148501 0.2632794 -1.576 0.11509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 638.70 on 704 degrees of freedom
## AIC: 658.7
##
## Number of Fisher Scoring iterations: 5
coef_table <- data.frame(
Term = names(coef(logit_fit)),
Estimate = coef(logit_fit),
OddsRatio = exp(coef(logit_fit))
)
coef_table
## Term Estimate OddsRatio
## (Intercept) (Intercept) 4.2464394426 69.85624186
## Pclass2 Pclass2 -1.0654183826 0.34458366
## Pclass3 Pclass3 -2.3379104619 0.09652913
## Sexmale Sexmale -2.6046466610 0.07392925
## Age Age -0.0431168870 0.95779943
## SibSp SibSp -0.3228002379 0.72411849
## Parch Parch -0.0794656593 0.92360974
## Fare Fare 0.0009756742 1.00097615
## EmbarkedQ EmbarkedQ -0.0899537498 0.91397346
## EmbarkedS EmbarkedS -0.4148501037 0.66043927
make_strat_folds <- function(y, K = 10, seed = 19) {
set.seed(seed)
y <- factor(y)
by_class <- lapply(split(seq_along(y), y), sample)
# avoid empty folds if a class is small
K <- min(K, min(lengths(by_class)))
pieces <- lapply(by_class, function(ix) split(ix, cut(seq_along(ix), K, labels = FALSE)))
lapply(seq_len(K), function(k) {
unname(unlist(lapply(pieces, `[[`, k), use.names = FALSE))
})
}
K <- 10
folds <- make_strat_folds(tr_df$Survived, K = K, seed = 19)
oof_prob <- rep(NA_real_, nrow(tr_df))
fold_err <- numeric(length(folds))
for (f in seq_along(folds)) {
te_idx <- folds[[f]]
tr_idx <- setdiff(seq_len(nrow(tr_df)), te_idx)
fit_f <- glm(form_logit, data = tr_df[tr_idx, , drop = FALSE], family = binomial())
prob_f <- predict(fit_f, newdata = tr_df[te_idx, , drop = FALSE], type = "response")
oof_prob[te_idx] <- prob_f
pred_f <- ifelse(prob_f >= 0.5, 1, 0)
fold_err[f] <- mean(pred_f != tr_df$Survived[te_idx])
}
cv_err <- mean(fold_err)
cv_acc <- 1 - cv_err
cv_se <- sd(fold_err) / sqrt(length(folds))
cv_auc <- as.numeric(pROC::auc(pROC::roc(tr_df$Survived, oof_prob, quiet = TRUE)))
cv_brier <- mean((tr_df$Survived - oof_prob)^2)
cat(sprintf("Mean CV misclass error: %.4f (SE = %.4f)\n", cv_err, cv_se))
## Mean CV misclass error: 0.1988 (SE = 0.0182)
cat(sprintf("Mean CV accuracy: %.4f\n", cv_acc))
## Mean CV accuracy: 0.8012
cat(sprintf("CV AUC: %.4f\n", cv_auc))
## CV AUC: 0.8438
cat(sprintf("CV Brier score: %.4f\n", cv_brier))
## CV Brier score: 0.1449
logit_full <- glm(form_logit, data = train_lr, family = binomial())
test_prob <- predict(logit_full, newdata = test_lr, type = "response")
test_pred <- ifelse(test_prob >= 0.5, 1, 0)
submission_logit <- data.frame(
PassengerId = test_ids,
Survived = test_pred
)
write.csv(submission_logit, "submission_logistic.csv", row.names = FALSE)
head(submission_logit)
## PassengerId Survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 1
## 6 897 0
vars <- c("Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked")
tr2 <- tr_df[, vars, drop = FALSE]
vl2 <- vl_df[, vars, drop = FALSE]
tr2 <- tr2 %>%
mutate(
Sex = factor(Sex),
Embarked = factor(Embarked),
Pclass = factor(Pclass)
) %>%
droplevels() %>%
filter(complete.cases(.))
null_mod <- glm(Survived ~ 1, data = tr2, family = binomial())
full_form <- Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
k_bic <- log(nrow(tr2))
forward_bic <- MASS::stepAIC(
object = null_mod,
scope = list(lower = ~1, upper = full_form),
direction = "forward",
k = k_bic,
trace = FALSE
)
summary(forward_bic)
##
## Call:
## glm(formula = Survived ~ Sex + Pclass + Age + SibSp, family = binomial(),
## data = tr2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.121938 0.452664 9.106 < 2e-16 ***
## Sexmale -2.625893 0.214152 -12.262 < 2e-16 ***
## Pclass2 -1.246980 0.291867 -4.272 1.93e-05 ***
## Pclass3 -2.463331 0.279876 -8.802 < 2e-16 ***
## Age -0.044688 0.008876 -5.035 4.78e-07 ***
## SibSp -0.364762 0.116439 -3.133 0.00173 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 642.29 on 708 degrees of freedom
## AIC: 654.29
##
## Number of Fisher Scoring iterations: 5
formula(forward_bic)
## Survived ~ Sex + Pclass + Age + SibSp
vl2 <- vl2 %>%
mutate(
Sex = factor(Sex, levels = levels(tr2$Sex)),
Embarked = factor(Embarked, levels = levels(tr2$Embarked)),
Pclass = factor(Pclass, levels = levels(tr2$Pclass))
) %>%
filter(complete.cases(.))
vl_prob_bic <- predict(forward_bic, newdata = vl2, type = "response")
vl_pred_bic <- ifelse(vl_prob_bic >= 0.5, 1, 0)
acc_bic <- mean(vl_pred_bic == vl2$Survived)
auc_bic <- as.numeric(pROC::auc(pROC::roc(vl2$Survived, vl_prob_bic, quiet = TRUE)))
cat(sprintf("Forward-BIC validation accuracy: %.3f\n", acc_bic))
## Forward-BIC validation accuracy: 0.825
cat(sprintf("Forward-BIC validation AUC: %.3f\n", auc_bic))
## Forward-BIC validation AUC: 0.880
cm_bic <- table(Predicted = vl_pred_bic, True = vl2$Survived)
cm_bic
## True
## Predicted 0 1
## 0 100 22
## 1 9 46
acc_bic
## [1] 0.8248588
set.seed(19)
K <- 10
y_all <- factor(tr2$Survived)
by_class <- lapply(split(seq_along(y_all), y_all), sample)
K <- min(K, min(lengths(by_class)))
pieces <- lapply(by_class, function(ix) split(ix, cut(seq_along(ix), K, labels = FALSE)))
folds <- lapply(seq_len(K), function(k) unname(unlist(lapply(pieces, `[[`, k), use.names = FALSE)))
oof_prob <- rep(NA_real_, nrow(tr2))
fold_err <- numeric(K)
g_sex <- levels(tr2$Sex)
g_emb <- levels(tr2$Embarked)
g_cls <- levels(tr2$Pclass)
for (f in seq_len(K)) {
te_idx <- folds[[f]]
tr_idx <- setdiff(seq_len(nrow(tr2)), te_idx)
tr_f <- tr2[tr_idx, , drop = FALSE]
te_f <- tr2[te_idx, , drop = FALSE]
# keep factor levels consistent fold-to-fold
tr_f$Sex <- factor(tr_f$Sex, levels = g_sex)
te_f$Sex <- factor(te_f$Sex, levels = g_sex)
tr_f$Embarked <- factor(tr_f$Embarked, levels = g_emb)
te_f$Embarked <- factor(te_f$Embarked, levels = g_emb)
tr_f$Pclass <- factor(tr_f$Pclass, levels = g_cls)
te_f$Pclass <- factor(te_f$Pclass, levels = g_cls)
tr_f <- tr_f[complete.cases(tr_f), , drop = FALSE]
te_f <- te_f[complete.cases(te_f), , drop = FALSE]
k_bic_f <- log(nrow(tr_f))
fit_f <- MASS::stepAIC(
object = glm(Survived ~ 1, data = tr_f, family = binomial()),
scope = list(lower = ~1, upper = full_form),
direction = "forward",
k = k_bic_f,
trace = FALSE
)
pr <- as.numeric(predict(fit_f, newdata = te_f, type = "response"))
oof_prob[te_idx] <- pr
fold_err[f] <- mean((pr >= 0.5) != te_f$Survived)
}
cv_err <- mean(fold_err)
cv_acc <- 1 - cv_err
cv_se <- sd(fold_err) / sqrt(K)
cv_auc <- as.numeric(pROC::auc(pROC::roc(tr2$Survived, oof_prob, quiet = TRUE)))
cat(sprintf("Forward-BIC 10-fold CV misclass error = %.4f (SE = %.4f)\n", cv_err, cv_se))
## Forward-BIC 10-fold CV misclass error = 0.2002 (SE = 0.0165)
cat(sprintf("Forward-BIC 10-fold CV accuracy = %.4f\n", cv_acc))
## Forward-BIC 10-fold CV accuracy = 0.7998
cat(sprintf("Forward-BIC 10-fold CV AUC = %.4f\n", cv_auc))
## Forward-BIC 10-fold CV AUC = 0.8455
# Build a clean modeling frame from the full training data (same variables)
train2 <- train[, vars, drop = FALSE] %>%
mutate(
Sex = factor(Sex),
Embarked = factor(Embarked),
Pclass = factor(Pclass)
) %>%
filter(complete.cases(.))
final_fit <- glm(formula(forward_bic), data = train2, family = binomial())
test2 <- test[, vars[-1], drop = FALSE]
test2$Sex <- factor(test2$Sex, levels = levels(train2$Sex))
test2$Embarked <- factor(test2$Embarked, levels = levels(train2$Embarked))
test2$Pclass <- factor(test2$Pclass, levels = levels(train2$Pclass))
test_prob <- predict(final_fit, newdata = test2, type = "response")
test_pred <- ifelse(test_prob >= 0.5, 1, 0)
submission_forward_bic <- data.frame(
PassengerId = test_ids,
Survived = test_pred
)
write.csv(submission_forward_bic, "submission_forward_bic.csv", row.names = FALSE)
head(submission_forward_bic)
## PassengerId Survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 1
## 6 897 0
x_all <- model.matrix(Survived ~ . - 1, data = train2) # -1 removes intercept column
y_all <- train2$Survived
tr2 <- tr_df[, vars, drop = FALSE]
vl2 <- vl_df[, vars, drop = FALSE]
x_tr <- model.matrix(Survived ~ . - 1, data = tr2)
y_tr <- tr2$Survived
x_vl <- model.matrix(Survived ~ . - 1, data = vl2)
y_vl <- vl2$Survived
set.seed(19)
cv_lasso <- cv.glmnet(
x = x_tr,
y = y_tr,
family = "binomial",
alpha = 1, # LASSO
nfolds = 10
)
cv_lasso$lambda.min
## [1] 0.006229709
cv_lasso$lambda.1se
## [1] 0.03029259
test_ids <- test$PassengerId # or test$PassengerId, whichever is your raw test
test2 <- test[, vars[-1], drop = FALSE] # do NOT use complete.cases here
vl_prob_lasso <- as.numeric(predict(cv_lasso, newx = x_vl, s = cv_lasso$lambda.min, type = "response"))
vl_pred_lasso <- ifelse(vl_prob_lasso >= 0.5, 1, 0)
acc_lasso <- mean(vl_pred_lasso == y_vl)
auc_lasso <- as.numeric(pROC::auc(pROC::roc(y_vl, vl_prob_lasso, quiet = TRUE)))
cat(sprintf("LASSO: validation accuracy = %.3f\n", acc_lasso))
## LASSO: validation accuracy = 0.836
cat(sprintf("LASSO: validation AUC = %.3f\n", auc_lasso))
## LASSO: validation AUC = 0.880
cm_lasso <- table(Pred = vl_pred_lasso, True = y_vl)
cm_lasso
## True
## Pred 0 1
## 0 99 19
## 1 10 49
coef_lasso <- coef(cv_lasso, s = cv_lasso$lambda.min)
nonzero_terms <- coef_lasso[coef_lasso[, 1] != 0, , drop = FALSE]
nonzero_terms
## 9 x 1 sparse Matrix of class "dgCMatrix"
## s=0.006229709
## (Intercept) 2.7263369651
## Pclass1 0.9685717371
## Pclass3 -1.1730202227
## Sexmale -2.4386981122
## Age -0.0349435118
## SibSp -0.2479982933
## Parch -0.0098521680
## Fare 0.0001968139
## EmbarkedS -0.3441260487
# Fix the 1 missing Fare in the Kaggle test set using TRAIN mean (no row dropping)
fare_mean <- mean(train2$Fare, na.rm = TRUE)
test2$Fare[is.na(test2$Fare)] <- fare_mean
stopifnot(sum(is.na(test2$Fare)) == 0)
# was running a Error in predict.glmnet(object$glmnet.fit, newx, s = lambda, ...) : The number of variables in newx must be 10 this is what an AI (copilot) told me to do
# MUST be 418
stopifnot(nrow(test2) == length(test_ids))
x_test_raw <- model.matrix(~ . - 1, data = test2)
missing_cols <- setdiff(colnames(x_all), colnames(x_test_raw))
if (length(missing_cols) > 0) {
x_test_raw <- cbind(
x_test_raw,
matrix(0, nrow = nrow(x_test_raw), ncol = length(missing_cols),
dimnames = list(NULL, missing_cols))
)
}
x_test <- x_test_raw[, colnames(x_all), drop = FALSE]
set.seed(19)
cv_lasso_full <- cv.glmnet(
x = x_all,
y = y_all,
family = "binomial",
alpha = 1,
nfolds = 10
)
test_prob_lasso <- as.numeric(predict(cv_lasso_full, newx=x_test, s=cv_lasso_full$lambda.min, type="response"))
test_pred_lasso <- ifelse(test_prob_lasso >= 0.5, 1, 0)
submission_lasso <- data.frame(PassengerId = test_ids, Survived = test_pred_lasso)
stopifnot(nrow(submission_lasso) == length(test_ids))
write.csv(submission_lasso, "submission_lasso.csv", row.names = FALSE)
set.seed(19)
cv_ridge <- cv.glmnet(
x = x_tr,
y = y_tr,
family = "binomial",
alpha = 0, # Ridge
nfolds = 10
)
cv_ridge$lambda.min
## [1] 0.02574123
cv_ridge$lambda.1se
## [1] 0.09468603
vl_prob_ridge <- as.numeric(
predict(cv_ridge, newx = x_vl, s = cv_ridge$lambda.min, type = "response")
)
vl_pred_ridge <- ifelse(vl_prob_ridge >= 0.5, 1, 0)
acc_ridge <- mean(vl_pred_ridge == y_vl)
auc_ridge <- as.numeric(pROC::auc(pROC::roc(y_vl, vl_prob_ridge, quiet = TRUE)))
cat(sprintf("Ridge: validation accuracy = %.3f\n", acc_ridge))
## Ridge: validation accuracy = 0.836
cat(sprintf("Ridge: validation AUC = %.3f\n", auc_ridge))
## Ridge: validation AUC = 0.882
cm_ridge <- table(Pred = vl_pred_ridge, True = y_vl)
cm_ridge
## True
## Pred 0 1
## 0 100 20
## 1 9 48
coef_ridge <- coef(cv_ridge, s = cv_ridge$lambda.min)
coef_ridge
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s=0.02574123
## (Intercept) 2.168030056
## Pclass1 0.944913185
## Pclass2 0.212301569
## Pclass3 -0.852172267
## Sexmale -2.139716608
## Age -0.029914799
## SibSp -0.215746833
## Parch -0.043148731
## Fare 0.002183531
## EmbarkedQ -0.020114527
## EmbarkedS -0.385969930
tr2$Sex <- factor(tr2$Sex)
tr2$Embarked <- factor(tr2$Embarked)
vl2$Sex <- factor(vl2$Sex, levels = levels(tr2$Sex))
vl2$Embarked <- factor(vl2$Embarked, levels = levels(tr2$Embarked))
gam_fit <- gam(
Survived ~
Sex +
Pclass +
s(Age, k = 10) +
s(SibSp, k = 6) +
s(Fare, k = 10) +
Embarked,
data = tr2,
family = binomial(),
method = "REML"
)
summary(gam_fit)
##
## Family: binomial
## Link function: logit
##
## Formula:
## Survived ~ Sex + Pclass + s(Age, k = 10) + s(SibSp, k = 6) +
## s(Fare, k = 10) + Embarked
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.65441 0.35615 7.453 9.11e-14 ***
## Sexmale -2.56727 0.21953 -11.695 < 2e-16 ***
## Pclass2 -1.06559 0.33815 -3.151 0.00163 **
## Pclass3 -2.16743 0.34616 -6.261 3.82e-10 ***
## EmbarkedQ 0.06252 0.42502 0.147 0.88306
## EmbarkedS -0.40144 0.26767 -1.500 0.13367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Age) 3.576 4.458 27.70 2.54e-05 ***
## s(SibSp) 2.105 2.569 10.75 0.00771 **
## s(Fare) 1.001 1.001 0.13 0.71927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.414 Deviance explained = 34.4%
## -REML = 324.13 Scale est. = 1 n = 714
vl_prob_gam <- as.numeric(predict(gam_fit, newdata = vl2, type = "response"))
vl_pred_gam <- ifelse(vl_prob_gam >= 0.5, 1, 0)
acc_gam <- mean(vl_pred_gam == vl2$Survived)
auc_gam <- as.numeric(pROC::auc(pROC::roc(vl2$Survived, vl_prob_gam, quiet = TRUE)))
cat(sprintf("GAM: validation accuracy = %.3f\n", acc_gam))
## GAM: validation accuracy = 0.836
cat(sprintf("GAM: validation AUC = %.3f\n", auc_gam))
## GAM: validation AUC = 0.884
cm_gam <- table(Pred = vl_pred_gam, True = vl2$Survived)
cm_gam
## True
## Pred 0 1
## 0 98 18
## 1 11 50
# GAM is already fit as gam_fit (from your earlier chunk)
# Smooth terms were: s(Age), s(SibSp), s(Fare) in that order
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))
plot(gam_fit, select = 1, shade = TRUE, main = "s(Age)")
plot(gam_fit, select = 2, shade = TRUE, main = "s(SibSp)")
plot(gam_fit, select = 3, shade = TRUE, main = "s(Fare)")
par(mfrow = c(1, 1))
train_gam <- train[, vars, drop = FALSE]
test_gam <- test[, vars[-1], drop = FALSE]
# Align factor levels to training (important for predict())
train_gam$Sex <- factor(train_gam$Sex)
train_gam$Embarked <- factor(train_gam$Embarked)
train_gam$Pclass <- factor(train_gam$Pclass)
test_gam$Sex <- factor(test_gam$Sex, levels = levels(train_gam$Sex))
test_gam$Embarked <- factor(test_gam$Embarked, levels = levels(train_gam$Embarked))
test_gam$Pclass <- factor(test_gam$Pclass, levels = levels(train_gam$Pclass))
gam_full <- gam(
Survived ~ Sex + Pclass + Embarked + s(Age, k = 10) + s(SibSp, k = 6) + s(Fare, k = 10),
data = train_gam,
family = binomial(),
method = "REML"
)
# Predict Kaggle test probabilities and classes
test_prob_gam <- as.numeric(predict(gam_full, newdata = test_gam, type = "response"))
test_pred_gam <- ifelse(test_prob_gam >= 0.5, 1, 0)
# Create and write submission
submission_gam <- data.frame(
PassengerId = test_ids,
Survived = test_pred_gam
)
stopifnot(nrow(submission_gam) == length(test_ids))
write.csv(submission_gam, "submission_gam.csv", row.names = FALSE)
head(submission_gam)
## PassengerId Survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 1
## 6 897 0
tree_full <- rpart(
Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
data = tr2,
method = "class",
control = rpart.control(cp = 0.0005) # allow growth; we'll prune using CV table
)
printcp(tree_full)
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch +
## Fare + Embarked, data = tr2, method = "class", control = rpart.control(cp = 5e-04))
##
## Variables actually used in tree construction:
## [1] Age Fare Pclass Sex SibSp
##
## Root node error: 274/714 = 0.38375
##
## n= 714
##
## CP nsplit rel error xerror xstd
## 1 0.4270073 0 1.00000 1.00000 0.047424
## 2 0.0310219 1 0.57299 0.57299 0.040390
## 3 0.0291971 3 0.51095 0.57664 0.040482
## 4 0.0255474 5 0.45255 0.57299 0.040390
## 5 0.0109489 6 0.42701 0.52920 0.039232
## 6 0.0072993 8 0.40511 0.52555 0.039131
## 7 0.0036496 11 0.38321 0.51825 0.038926
## 8 0.0018248 12 0.37956 0.50730 0.038614
## 9 0.0005000 16 0.37226 0.52190 0.039029
plotcp(tree_full)
cptab <- tree_full$cptable
minrow <- which.min(cptab[, "xerror"])
xerr_min <- cptab[minrow, "xerror"]
xerr_se <- cptab[minrow, "xstd"]
cp_1se <- cptab[which(cptab[, "xerror"] <= xerr_min + xerr_se)[1], "CP"]
tree_pruned <- prune(tree_full, cp = cp_1se)
cp_1se
## [1] 0.01094891
library(rpart.plot)
rpart.plot(
tree_pruned,
type = 2,
extra = 104, # shows class probs and predicted class
fallen.leaves = TRUE,
tweak = 1.1,
main = "Titanic Decision Tree (Pruned)"
)
##### After fitting and pruning the classification tree, the final model
uses only five variables: Sex, Pclass, Age, Fare, and SibSp. Even though
additional predictors (Parch and Embarked) were available, the pruning
process removed them because they did not meaningfully reduce
classification error once the core variables were included. This
confirms what we consistently observed in the logistic regression,
lasso, ridge, and GAM models: survival on the Titanic is primarily
driven by gender, passenger class, age, family structure, and
socioeconomic proxies such as fare.
# Predict class probabilities for Survived = 1
vl_prob_tree <- predict(tree_pruned, newdata = vl2, type = "prob")[, "1"]
vl_pred_tree <- ifelse(vl_prob_tree >= 0.5, 1, 0)
acc_tree <- mean(vl_pred_tree == vl2$Survived)
auc_tree <- as.numeric(pROC::auc(pROC::roc(vl2$Survived, vl_prob_tree, quiet = TRUE)))
cat(sprintf("Decision Tree: validation accuracy = %.3f\n", acc_tree))
## Decision Tree: validation accuracy = 0.847
cat(sprintf("Decision Tree: validation AUC = %.3f\n", auc_tree))
## Decision Tree: validation AUC = 0.839
cm_tree <- table(Pred = vl_pred_tree, True = vl2$Survived)
cm_tree
## True
## Pred 0 1
## 0 102 20
## 1 7 48
vars <- c("Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked")
tr2 <- tr_df[, vars, drop = FALSE]
vl2 <- vl_df[, vars, drop = FALSE]
# Ensure factors are consistent
tr2$Sex <- factor(tr2$Sex)
tr2$Embarked <- factor(tr2$Embarked)
tr2$Pclass <- factor(tr2$Pclass)
vl2$Sex <- factor(vl2$Sex, levels = levels(tr2$Sex))
vl2$Embarked <- factor(vl2$Embarked, levels = levels(tr2$Embarked))
vl2$Pclass <- factor(vl2$Pclass, levels = levels(tr2$Pclass))
## error warning: Warning: The response has five or fewer unique values. Are you sure you want to do regression?
tr2$Survived <- factor(tr2$Survived, levels = c(0, 1))
vl2$Survived <- factor(vl2$Survived, levels = c(0, 1))
## was told to add this because R thought my response was numeric
p <- ncol(tr2) - 1
bag_model <- randomForest(
Survived ~ .,
data = tr2,
ntree = 200,
mtry = p,
importance = TRUE
)
bag_model
##
## Call:
## randomForest(formula = Survived ~ ., data = tr2, ntree = 200, mtry = p, importance = TRUE)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 22.27%
## Confusion matrix:
## 0 1 class.error
## 0 363 77 0.1750000
## 1 82 192 0.2992701
# Predict probabilities for Survived = 1
vl_prob_bag <- predict(bag_model, newdata = vl2, type = "prob")[, "1"]
vl_pred_bag <- ifelse(vl_prob_bag >= 0.5, 1, 0)
acc_bag <- mean(vl_pred_bag == vl2$Survived)
auc_bag <- as.numeric(pROC::auc(pROC::roc(vl2$Survived, vl_prob_bag, quiet = TRUE)))
cat(sprintf("Bagging: validation accuracy = %.3f\n", acc_bag))
## Bagging: validation accuracy = 0.864
cat(sprintf("Bagging: validation AUC = %.3f\n", auc_bag))
## Bagging: validation AUC = 0.900
cm_bag <- table(Pred = vl_pred_bag, True = vl2$Survived)
cm_bag
## True
## Pred 0 1
## 0 99 14
## 1 10 54
tr2 <- tr_df[, vars, drop = FALSE]
vl2 <- vl_df[, vars, drop = FALSE]
# Make sure factors are treated correctly (no new imputation)
tr2$Sex <- factor(tr2$Sex)
tr2$Embarked <- factor(tr2$Embarked)
tr2$Pclass <- factor(tr2$Pclass)
vl2$Sex <- factor(vl2$Sex, levels = levels(tr2$Sex))
vl2$Embarked <- factor(vl2$Embarked, levels = levels(tr2$Embarked))
vl2$Pclass <- factor(vl2$Pclass, levels = levels(tr2$Pclass))
boost_fit <- gbm(
formula = Survived ~ .,
data = tr2,
distribution = "bernoulli", # logistic boosting for binary outcome
n.trees = 3000, # large cap; we will pick the best with CV/OOB
interaction.depth = 2, # small trees (stumps/2-way interactions)
shrinkage = 0.01, # learning rate
n.minobsinnode = 10,
bag.fraction = 0.6, # stochastic gradient boosting
train.fraction = 1.0,
cv.folds = 10, # internal CV to choose number of trees
verbose = FALSE
)
best_iter <- gbm.perf(boost_fit, method = "cv")
best_iter
## [1] 918
# Predicted probabilities on validation set
vl_prob_boost <- predict(boost_fit, newdata = vl2, n.trees = best_iter, type = "response")
vl_pred_boost <- ifelse(vl_prob_boost >= 0.5, 1, 0)
acc_boost <- mean(vl_pred_boost == vl2$Survived)
auc_boost <- as.numeric(pROC::auc(pROC::roc(vl2$Survived, vl_prob_boost, quiet = TRUE)))
cat(sprintf("Boosting: validation accuracy = %.3f\n", acc_boost))
## Boosting: validation accuracy = 0.870
cat(sprintf("Boosting: validation AUC = %.3f\n", auc_boost))
## Boosting: validation AUC = 0.922
cm_boost <- table(Pred = vl_pred_boost, True = vl2$Survived)
cm_boost
## True
## Pred 0 1
## 0 103 17
## 1 6 51
boost_imp <- summary(boost_fit, n.trees = best_iter, plotit = FALSE)
head(boost_imp, 10)
## var rel.inf
## Sex Sex 47.1276669
## Pclass Pclass 18.7897739
## Age Age 15.8339485
## Fare Fare 11.0183844
## SibSp SibSp 4.2305089
## Embarked Embarked 2.5880649
## Parch Parch 0.4116525
# Fit boosting on ALL training data and predict Kaggle test
train2 <- train[, vars, drop = FALSE]
test2 <- test[, vars[-1], drop = FALSE]
train2$Sex <- factor(train2$Sex)
train2$Embarked <- factor(train2$Embarked)
train2$Pclass <- factor(train2$Pclass)
test2$Sex <- factor(test2$Sex, levels = levels(train2$Sex))
test2$Embarked <- factor(test2$Embarked, levels = levels(train2$Embarked))
test2$Pclass <- factor(test2$Pclass, levels = levels(train2$Pclass))
set.seed(42)
boost_full <- gbm(
formula = Survived ~ .,
data = train2,
distribution = "bernoulli",
n.trees = 3000,
interaction.depth = 2,
shrinkage = 0.01,
n.minobsinnode = 10,
bag.fraction = 0.6,
train.fraction = 1.0,
cv.folds = 10,
verbose = FALSE
)
best_iter_full <- gbm.perf(boost_full, method = "cv")
test_prob_boost <- predict(boost_full, newdata = test2, n.trees = best_iter_full, type = "response")
test_pred_boost <- ifelse(test_prob_boost >= 0.5, 1, 0)
submission_boost <- data.frame(
PassengerId = test_ids,
Survived = test_pred_boost
)
stopifnot(nrow(submission_boost) == length(test_ids))
write.csv(submission_boost, "submission_boosting.csv", row.names = FALSE)
head(submission_boost)
## PassengerId Survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 0
## 6 897 0
X_train_bart <- model.matrix(Survived ~ . - 1, data = tr2)
y_train_bart <- tr2$Survived
X_val_bart <- model.matrix(Survived ~ . - 1, data = vl2)
y_val_bart <- vl2$Survived
bart_fit <- bart(
x.train = X_train_bart,
y.train = y_train_bart,
x.test = X_val_bart,
ntree = 200, # number of trees
ndpost = 1000, # posterior draws
nskip = 200,
k = 2, # regularization strength
sigdf = 3,
sigquant = 0.9
)
##
## Running BART with binary y
##
## number of trees: 200
## number of chains: 1, default number of threads 1
## tree thinning rate: 1
## Prior:
## k prior fixed to 2.000000
## power and base for tree prior: 2.000000 0.950000
## use quantiles for rule cut points: false
## proposal probabilities: birth/death 0.50, swap 0.10, change 0.40; birth 0.50
## data:
## number of training observations: 714
## number of test observations: 177
## number of explanatory variables: 10
##
## Cutoff rules c in x<=c vs x>c
## Number of cutoffs: (var: number of possible c):
## (1: 100) (2: 100) (3: 100) (4: 100) (5: 100)
## (6: 100) (7: 100) (8: 100) (9: 100) (10: 100)
##
## offsets:
## reg : 0.00 0.00 0.00 0.00 0.00
## test: 0.00 0.00 0.00 0.00 0.00
## Running mcmc loop:
## iteration: 100 (of 1000)
## iteration: 200 (of 1000)
## iteration: 300 (of 1000)
## iteration: 400 (of 1000)
## iteration: 500 (of 1000)
## iteration: 600 (of 1000)
## iteration: 700 (of 1000)
## iteration: 800 (of 1000)
## iteration: 900 (of 1000)
## iteration: 1000 (of 1000)
## total seconds in loop: 1.528901
##
## Tree sizes, last iteration:
## [1] 1 2 2 3 3 2 2 2 2 2 3 4 2 2 2 2 2 3
## 4 2 2 2 2 2 1 2 2 4 3 2 1 4 2 3 1 2 2 2
## 2 2 2 2 3 2 1 1 2 3 2 2 2 3 3 2 2 3 2 4
## 2 2 3 3 3 3 3 3 1 3 2 2 2 2 2 3 3 2 3 2
## 2 3 2 2 2 2 2 1 2 1 2 2 2 4 2 1 3 2 2 3
## 2 3 3 1 2 2 2 3 2 2 3 2 2 2 2 5 1 3 2 4
## 3 4 2 2 2 3 2 2 2 2 3 2 1 2 4 2 2 1 2 2
## 3 2 3 2 2 2 3 2 2 2 3 2 3 2 2 1 2 3 3 2
## 3 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 2 2 2 3
## 1 3 5 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 3 2
## 2 2
##
## Variable Usage, last iteration (var:count):
## (1: 21) (2: 28) (3: 22) (4: 32) (5: 33)
## (6: 24) (7: 23) (8: 22) (9: 29) (10: 21)
##
## DONE BART
# If yhat.test.mean exists, use it; otherwise compute the mean from yhat.test draws [source copilot]
if (!is.null(bart_fit$yhat.test.mean) && length(bart_fit$yhat.test.mean) > 0) {
vl_prob_bart <- as.numeric(bart_fit$yhat.test.mean)
} else if (!is.null(bart_fit$yhat.test) && length(bart_fit$yhat.test) > 0) {
# yhat.test is ndpost x n_valid
vl_prob_bart <- as.numeric(colMeans(bart_fit$yhat.test))
} else {
stop("No test predictions found in bart_fit. Check that x.test was passed correctly.")
}#### ^^ until here
# evaluation
y_val_num <- as.integer(as.character(vl2$Survived))
vl_pred_bart <- ifelse(vl_prob_bart >= 0.5, 1, 0)
acc_bart <- mean(vl_pred_bart == y_val_num)
roc_bart <- pROC::roc(response = y_val_num, predictor = vl_prob_bart, quiet = TRUE)
auc_bart <- as.numeric(pROC::auc(roc_bart))
cat(sprintf("BART: validation accuracy = %.3f\n", acc_bart))
## BART: validation accuracy = 0.819
cat(sprintf("BART: validation AUC = %.3f\n", auc_bart))
## BART: validation AUC = 0.908
table(Pred = vl_pred_bart, True = y_val_num)
## True
## Pred 0 1
## 0 108 31
## 1 1 37
vars <- c("Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked")
# already-preprocessed
train2 <- train[, vars, drop = FALSE]
test2 <- test[, vars[-1], drop = FALSE]
train2$Sex <- factor(train2$Sex)
train2$Pclass <- factor(train2$Pclass)
train2$Embarked <- factor(train2$Embarked)
test2$Sex <- factor(test2$Sex, levels = levels(train2$Sex))
test2$Pclass <- factor(test2$Pclass, levels = levels(train2$Pclass))
test2$Embarked <- factor(test2$Embarked, levels = levels(train2$Embarked))
fare_mean <- mean(train2$Fare, na.rm = TRUE)
test2$Fare[is.na(test2$Fare)] <- fare_mean
mode_fun <- function(x) {
ux <- na.omit(x)
ux[which.max(tabulate(match(ux, ux)))]
}
emb_mode <- mode_fun(train2$Embarked)
train2$Embarked[is.na(train2$Embarked) | train2$Embarked == ""] <- emb_mode
# Re-factor Embarked after imputation so levels are locked kept running into errors here
train2$Embarked <- factor(train2$Embarked)
test2$Embarked <- factor(test2$Embarked, levels = levels(train2$Embarked))
X_all <- model.matrix(~ . - 1, data = rbind(
train2[, vars[-1], drop = FALSE],
test2
))
n_tr <- nrow(train2)
X_train_full <- X_all[1:n_tr, , drop = FALSE]
X_test_full <- X_all[(n_tr + 1):nrow(X_all), , drop = FALSE]
y_train_full <- as.integer(as.character(train2$Survived))
bart_full <- bart(
x.train = X_train_full,
y.train = y_train_full,
x.test = X_test_full,
ntree = 200,
ndpost = 1000,
nskip = 200,
k = 2,
sigdf = 3,
sigquant = 0.9
)
##
## Running BART with binary y
##
## number of trees: 200
## number of chains: 1, default number of threads 1
## tree thinning rate: 1
## Prior:
## k prior fixed to 2.000000
## power and base for tree prior: 2.000000 0.950000
## use quantiles for rule cut points: false
## proposal probabilities: birth/death 0.50, swap 0.10, change 0.40; birth 0.50
## data:
## number of training observations: 891
## number of test observations: 418
## number of explanatory variables: 10
##
## Cutoff rules c in x<=c vs x>c
## Number of cutoffs: (var: number of possible c):
## (1: 100) (2: 100) (3: 100) (4: 100) (5: 100)
## (6: 100) (7: 100) (8: 100) (9: 100) (10: 100)
##
## offsets:
## reg : 0.00 0.00 0.00 0.00 0.00
## test: 0.00 0.00 0.00 0.00 0.00
## Running mcmc loop:
## iteration: 100 (of 1000)
## iteration: 200 (of 1000)
## iteration: 300 (of 1000)
## iteration: 400 (of 1000)
## iteration: 500 (of 1000)
## iteration: 600 (of 1000)
## iteration: 700 (of 1000)
## iteration: 800 (of 1000)
## iteration: 900 (of 1000)
## iteration: 1000 (of 1000)
## total seconds in loop: 1.844940
##
## Tree sizes, last iteration:
## [1] 3 3 2 2 3 2 2 4 3 2 2 3 1 2 2 5 2 2
## 2 2 2 2 3 4 2 2 2 2 1 2 1 2 3 2 2 1 4 2
## 2 2 2 2 2 2 3 4 2 2 2 2 3 2 2 2 2 2 2 2
## 2 2 1 1 3 3 3 4 2 2 3 3 2 2 2 3 2 1 2 1
## 4 3 2 2 4 2 2 3 2 2 5 2 1 1 2 2 3 2 2 2
## 2 2 2 3 2 2 2 2 2 2 2 3 2 2 2 2 3 1 2 2
## 2 3 2 2 1 3 2 2 2 3 2 2 4 4 2 3 2 2 2 2
## 2 2 3 3 3 2 2 2 2 4 3 2 3 2 2 4 2 2 2 2
## 3 4 3 1 3 2 2 3 3 1 3 3 3 2 2 2 2 2 1 2
## 2 3 2 3 3 2 4 2 3 2 2 2 2 2 2 4 2 2 2 2
## 2 3
##
## Variable Usage, last iteration (var:count):
## (1: 20) (2: 26) (3: 17) (4: 36) (5: 33)
## (6: 29) (7: 26) (8: 20) (9: 28) (10: 27)
##
## DONE BART
test_prob_bart <- if (!is.null(bart_full$yhat.test.mean) && length(bart_full$yhat.test.mean) > 0) {
as.numeric(bart_full$yhat.test.mean)
} else {
as.numeric(colMeans(bart_full$yhat.test))
}
test_pred_bart <- ifelse(test_prob_bart >= 0.5, 1, 0)
submission_bart <- data.frame(
PassengerId = test_ids,
Survived = test_pred_bart
)
write.csv(submission_bart, "submission_bart.csv", row.names = FALSE)
head(submission_bart)
## PassengerId Survived
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 0
## 6 897 0
# Create numeric model matrices (XGBoost requires numeric input)
X_train_xgb <- model.matrix(Survived ~ . - 1, data = tr2)
y_train_xgb <- tr2$Survived
X_val_xgb <- model.matrix(Survived ~ . - 1, data = vl2)
y_val_xgb <- vl2$Survived
dtrain <- xgb.DMatrix(data = X_train_xgb, label = y_train_xgb)
dval <- xgb.DMatrix(data = X_val_xgb, label = y_val_xgb)
# Parameter specification
params <- list(
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.05, # learning rate
max_depth = 4,
subsample = 0.8, # row sampling
colsample_bytree = 0.8
)
# Train with early stopping
xgb_fit <- xgb.train(
params = params,
data = dtrain,
nrounds = 2000,
evals = list(train = dtrain, val = dval),
early_stopping_rounds = 50,
print_every_n = 100
)
## Multiple eval metrics are present. Will use val_auc for early stopping.
## Will train until val_auc hasn't improved in 50 rounds.
##
## [1] train-auc:0.871906 val-auc:0.894091
## Stopping. Best iteration:
## [74] train-auc:0.922337 val-auc:0.909066
##
## [74] train-auc:0.922337 val-auc:0.909066
vl_prob_xgb <- predict(xgb_fit, dval)
vl_pred_xgb <- ifelse(vl_prob_xgb >= 0.5, 1, 0)
acc_xgb <- mean(vl_pred_xgb == y_val_xgb)
auc_xgb <- as.numeric(pROC::auc(pROC::roc(y_val_xgb, vl_prob_xgb, quiet = TRUE)))
cat(sprintf("XGBoost: validation accuracy = %.3f\n", acc_xgb))
## XGBoost: validation accuracy = 0.847
cat(sprintf("XGBoost: validation AUC = %.3f\n", auc_xgb))
## XGBoost: validation AUC = 0.913
cm_xgb <- table(Pred = vl_pred_xgb, True = y_val_xgb)
cm_xgb
## True
## Pred 0 1
## 0 106 24
## 1 3 44
xgb_imp <- xgb.importance(model = xgb_fit)
xgb_imp
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: Sexmale 0.43011444 0.191257849 0.069392813
## 2: Fare 0.16396851 0.277867608 0.343246592
## 3: Age 0.15768021 0.259047347 0.285006196
## 4: Pclass3 0.10995435 0.066039917 0.064436183
## 5: SibSp 0.04767595 0.049038631 0.086741016
## 6: Pclass1 0.03920916 0.060565955 0.038413879
## 7: EmbarkedS 0.02203021 0.040444041 0.043370508
## 8: Pclass2 0.01535963 0.029076299 0.029739777
## 9: Parch 0.01280067 0.025472371 0.034696406
## 10: EmbarkedQ 0.00120687 0.001189982 0.004956629
knitr::include_graphics("C:/Users/miada/OneDrive/Pictures/Screenshots/Screenshots 1/submission_xgboost.png")
My image