This is the R Notebook accompanying the portfolio methods note. It contains all parts of the work, including data analysis, model development, signal computation, cross-validation loops, and the final test-set output. Core functions are packaged in the R scripts referenced in the Data section.
# Setup: project helpers and the development split.
library(here)
# Project-local helpers: CV/calibration builders and the enriched splitter
# used below. NOTE(review): %>%/filter/ggplot are used before any library()
# call in this notebook, so dplyr/ggplot2 are presumably attached inside
# these sourced scripts -- confirm.
source(here("src","cv_assets.R"))
source(here("src","train_test_split.R"))
set.seed(42)
dev <- read.csv(here("dev.csv"))
# Drop the first column (presumably a row-index written by an earlier
# write.csv -- confirm).
dev <- dev[,-1]
dev$diagnosis <- as.factor(dev$diagnosis)
# How many dev cases fall in the worst_perimeter band of interest.
nrow(dev%>%filter(worst_perimeter >= 50 & worst_perimeter <= 105))
## [1] 219
# Baseline RF: 5-fold CV calibration stack (out-of-fold probabilities plus
# shrinkage-estimated references, per the lambda output below).
# NOTE(review): `folds` is not defined anywhere in this notebook; it is
# presumably created in one of the sourced scripts -- confirm.
rf_base <- rf_cv_build_calibration_stack(dev, folds=folds, mtry=8, min.node.size=5, num_trees = 500)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
# Case-weighted RF variant: observations inside the worst_perimeter [65, 110]
# zone get weight 2, malignant cases in the zone weight 5.
# Fix: removed a stray empty argument (`folds=folds, , mtry=...`) from the
# call; an empty argument slot is passed as a missing argument and can
# surface as an "argument is missing, with no default" error.
rf_case_weights <- rf_cv_build_calibration_stack_weighted(dev, folds=folds, mtry=8, min.node.size=5, num_trees = 500,
weight_col="worst_perimeter", lo=65, hi=110,
w_zone_all=2, w_zone_malignant=5)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
# XGBoost baseline: out-of-fold predictions on the same folds.
# NOTE(review): the warnings below show xgb_oof_predict still passes the
# deprecated `watchlist` argument; rename it to `evals` in the sourced
# script before xgboost promotes the warning to an error.
xgboost_base <- xgb_oof_predict(dev, folds=folds)
## XGB fold 1 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 2 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 3 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 4 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 5 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
# Compare false negatives (true malignants with pM < 0.4) across the three
# models. The repeated histogram spec is factored into one local helper so
# the three panels cannot drift apart.
fn_hist <- function(stack, title) {
  # Histogram of model probability among missed malignants.
  ggplot(stack%>%filter(pM < 0.4, diagnosis=="M")) +
    geom_histogram(aes(x=pM), fill="skyblue") +
    ylab("True Malignants") +
    labs(title=title) +
    theme_minimal()
}
patchwork::wrap_plots(
  fn_hist(rf_base, "False Negatives:\nRandom Forest Base"),
  fn_hist(rf_case_weights, "\nRF w/ Case Weights in\nworst_perimeter [65,110]"),
  fn_hist(xgboost_base, "\nXGBoost Base"),
  nrow=2, ncol=2
)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Hyper-parameter grid for the RF tuning loop: 4 mtry values x 2 node sizes
# (mtry varies fastest, matching the "Testing:" order printed below).
mtry_grid <- c(4, 8, 16, 25)
node_grid <- c(5, 15)
param_grid <- expand.grid(mtry = mtry_grid, min.node.size = node_grid)
# Integrated CV tuning: for each grid point, run the 5-fold loop and collect
# the uncertainty diagnostics summarised in `tune_out$metrics` below
# (correlation, mid-band SE spread, mid-band count, AUC).
tune_out <- rf_cv_tune_uncertainty(
dev = dev,
folds = folds,
param_grid = param_grid,
seed = 42,
verbose = TRUE
)
## Starting integrated CV tuning loop...
## Testing: mtry=4 min.node.size=5
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=8 min.node.size=5
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=16 min.node.size=5
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=25 min.node.size=5
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=4 min.node.size=15
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=8 min.node.size=15
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=16 min.node.size=15
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
## Testing: mtry=25 min.node.size=15
## Fold 1 / 5
## Fold 2 / 5
## Fold 3 / 5
## Fold 4 / 5
## Fold 5 / 5
# Grid results, already ordered by the correlation criterion (best first).
metrics <- tune_out$metrics
metrics
## # A tibble: 8 × 6
## mtry min.node.size correlation mid_se_sd n_mid auc
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 8 5 0.0238 0.0465 10 0.987
## 2 4 5 0.0487 0.0447 15 0.986
## 3 4 15 0.0505 0.0398 15 0.988
## 4 25 5 0.0530 0.0622 9 0.986
## 5 25 15 0.0534 0.0795 12 0.986
## 6 16 15 0.0556 0.0656 10 0.986
## 7 16 5 0.0744 0.0715 11 0.986
## 8 8 15 0.0857 0.0519 13 0.987
master_results <- tune_out$master_results
top_combos <- head(metrics, 4)
# Per-observation (pM, pM_se) clouds for the four best combos.
# Use a filtering join (semi_join) instead of inner_join: only membership in
# top_combos matters here, and inner_join would also bolt the metric columns
# onto master_results (with .x/.y suffixes on any name clash).
master_results %>%
semi_join(top_combos, by = c("mtry", "min.node.size")) %>%
ggplot(aes(x = pM, y = pM_se)) +
geom_point(alpha = 0.3) +
facet_wrap(~paste0("mtry=", mtry, " | node=", min.node.size)) +
theme_minimal() +
ggtitle("Uncertainty Triangles (Top Combos)")
# Same view for the four worst combos by the correlation criterion.
# semi_join (filtering join) instead of inner_join, for the same reason as
# the top-combos plot: we only need membership, not the metric columns.
bottom_combos <- tail(metrics, 4)
master_results %>%
semi_join(bottom_combos, by = c("mtry", "min.node.size")) %>%
ggplot(aes(x = pM, y = pM_se)) +
geom_point(alpha = 0.3) +
facet_wrap(~paste0("mtry=", mtry, " | node=", min.node.size)) +
theme_minimal() +
ggtitle("Uncertainty Triangles (Bottom Combos)")
# Final calibration stack at the chosen hyper-parameters (mtry=8, node=5),
# this time with cosine features enabled (add_cosine=TRUE) so the stack
# carries the d_dist / proj_marg signals plotted below.
calibration_z_stack <- rf_cv_build_calibration_stack(
dev = dev,
folds = folds,
mtry = 8,
min.node.size = 5,
seed = 42,
add_cosine=TRUE,
verbose = TRUE
)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
# d_dist and proj_marg against logit(pM), stacked vertically. The d_dist
# panel drops extreme distances (d_dist >= 80) for readability.
plot_ddist <- ggplot(data=calibration_z_stack%>%filter(d_dist < 80)) +
  geom_point(aes(x=qlogis(pM), y=d_dist, color=diagnosis), alpha=0.8) +
  xlab("") +
  ylab("d_dist") +
  labs(title="d_dist and proj_marg vs logit(pM)") +
  theme_minimal()
plot_proj <- ggplot(data=calibration_z_stack) +
  geom_point(aes(x=qlogis(pM), y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("proj_marg") +
  theme_minimal()
patchwork::wrap_plots(plot_ddist, plot_proj, nrow=2)
# Distribution of out-of-fold RF malignancy probability, by true class.
calibration_z_stack %>%
  ggplot() +
  geom_histogram(aes(x=pM, fill=diagnosis), alpha=0.8) +
  labs(title="pM (RF probability of Malignancy)") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Six pairwise diagnostics on the calibration stack. They all share one
# scatter spec, so it is factored into a local helper; `xvar`/`yvar` are
# column names resolved via the .data pronoun.
# NOTE(review): the "pM < 0.45" wording in the confident-region titles
# corresponds to the qlogis(pM) < -0.2 filter (plogis(-0.2) ~ 0.450) --
# confirm which cutoff is intended.
z_scatter <- function(dat, xvar, yvar, xlabel, ylabel, title) {
  # Class-coloured scatter of one stack signal against another.
  ggplot(data=dat) +
    geom_point(aes(x=.data[[xvar]], y=.data[[yvar]], color=diagnosis), alpha=0.8) +
    xlab(xlabel) +
    ylab(ylabel) +
    labs(title=title) +
    theme_minimal()
}
# Precompute logit(pM) once so it can be addressed by name in the helper.
z_stack_aug <- calibration_z_stack%>%mutate(logit_pM = qlogis(pM))
z_scatter(z_stack_aug, "logit_pM", "proj_marg",
          "logit(pM)", "Marg. Z Proj.", "logit(pM) vs Marg. Z Proj.")
z_scatter(z_stack_aug%>%filter(logit_pM < -0.2), "logit_pM", "proj_marg",
          "logit(pM)", "Marg. Z Proj.",
          "Confident Region (pM < 0.45): logit(pM) vs Marg. Z Proj.")
z_scatter(z_stack_aug%>%filter(d_dist < 80), "logit_pM", "d_dist",
          "logit(pM)", "d_dist", "d_dist vs logit(pM)")
z_scatter(z_stack_aug%>%filter(logit_pM < -0.2), "logit_pM", "d_dist",
          "logit(pM)", "d_dist",
          "Confident Region (pM < 0.45): d_dist vs logit(pM)")
z_scatter(z_stack_aug%>%filter(d_dist < 80), "d_dist", "proj_marg",
          "d_dist", "proj_marg", "proj_marg vs d_dist")
z_scatter(z_stack_aug%>%filter(logit_pM < -0.2), "d_dist", "proj_marg",
          "d_dist", "proj_marg",
          "Confident Region (pM < 0.45): proj_marg vs d_dist")
# Candidate false negatives: truly malignant cases the RF would call benign
# (pM < 0.45), ordered by d_dist. Nearly all sit at d_dist > -1.2 and
# proj_marg > 0 -- motivating the two-signal review rule used on the test
# set below.
calibration_z_stack%>%filter(pM < 0.45, diagnosis=="M")%>%arrange(d_dist)%>%select(pM, d_dist, proj_marg)
## pM d_dist proj_marg
## 333 0.00100000 -0.2569794 0.7297937
## 211 0.01046667 -0.0818665 1.3431847
## 37 0.34900000 0.1929986 5.4486465
## 21 0.31773333 0.8091095 4.1962350
## 72 0.44130000 0.8688281 4.9711317
## 286 0.23943333 1.3653573 4.0788680
## 278 0.42743333 1.5288178 4.9051828
## 48 0.34320000 1.7813111 3.3403494
## 4 0.37356667 2.0936859 1.5321755
## 14 0.36960000 2.3292545 4.9515052
# 60/40 enriched split for the bootstrap study; rare_filter (passed
# unevaluated via quote()) marks the worst_perimeter [50, 95] band so both
# sides of the split keep cases from it.
bound_bootstrap_data <- enriched_split(dev, dev_frac = 0.60, rare_filter = quote(worst_perimeter >= 50 & worst_perimeter <= 95), seed=7)
bound_bootstrap_train <- bound_bootstrap_data$dev
bound_bootstrap_test <- bound_bootstrap_data$test
# Sanity check: band counts on each side of the split.
nrow(bound_bootstrap_train%>%filter(worst_perimeter >= 50, worst_perimeter <= 95))
## [1] 100
nrow(bound_bootstrap_test%>%filter(worst_perimeter >= 50, worst_perimeter <= 95))
## [1] 69
# Visual check of the band region (perimeter vs area, by class).
ggplot(bound_bootstrap_train%>%filter(worst_perimeter >= 50 & worst_perimeter <= 110)) +
geom_point(aes(x=worst_perimeter, y=worst_area, color=diagnosis), alpha=0.8) +
theme_minimal()
ggplot(bound_bootstrap_test%>%filter(worst_perimeter >= 50 & worst_perimeter <= 110)) +
geom_point(aes(x=worst_perimeter, y=worst_area, color=diagnosis), alpha=0.8) +
theme_minimal()
# B=300 bootstrap replicates of pM, proj_marg and d_dist on the held-out side.
bound_bootstrap <- bootstrap_proj_and_ddist(bound_bootstrap_train, bound_bootstrap_test, B=300, seed=7)
# Per-case bootstrap summaries: mean RF probability plus 5th/95th percentile
# bands for the marginal projection and the distance signal (one column per
# test case in each *_boot matrix).
col_q <- function(mat, p) apply(mat, 2, quantile, probs = p)
bootstrap_summary_values <- list(
  pM_mean      = colMeans(bound_bootstrap$pM_boot),
  proj_marg_lb = col_q(bound_bootstrap$proj_boot, 0.05),
  proj_marg_ub = col_q(bound_bootstrap$proj_boot, 0.95),
  d_dist_lb    = col_q(bound_bootstrap$ddist_boot, 0.05),
  d_dist_ub    = col_q(bound_bootstrap$ddist_boot, 0.95)
)
bound_bootstrap_test_eval <- cbind(bound_bootstrap_test, bootstrap_summary_values)
# Malignant cases that the mean probability alone would miss (pM_mean < 0.45),
# with their bootstrap bands.
bootstrap_bound_results <- bound_bootstrap_test_eval %>%
  filter(diagnosis == "M", pM_mean < 0.45) %>%
  arrange(pM_mean) %>%
  select(id, diagnosis, pM_mean, proj_marg_lb, proj_marg_ub, d_dist_lb, d_dist_ub)
bootstrap_bound_results
## id diagnosis pM_mean proj_marg_lb proj_marg_ub d_dist_lb d_dist_ub
## 1 892189 M 0.08459444 1.1281514 2.682305 -1.0999856 1.0586723
## 2 859983 M 0.11484467 2.9426902 4.113895 -0.5259265 1.0286532
## 3 90291 M 0.15533089 3.5109370 4.617933 0.6207670 2.1799682
## 4 855563 M 0.22664622 0.3225484 1.388728 0.5585505 2.0485803
## 5 886452 M 0.23258867 4.3050179 5.513227 -0.5423223 0.8360126
## 6 91979701 M 0.35898789 2.7766813 4.096138 0.7367100 2.6106656
## 7 875263 M 0.42206189 4.0894631 5.438537 2.1184344 4.1272260
# Pull the bootstrap draws for the false-negative malignants so their
# per-case distributions can be eyeballed.
# Fixes: dropped the no-op eval(quote(...)) wrapper inside with(), removed
# the redundant `== TRUE` comparisons, used seq_len() instead of 1:ncol(),
# and added drop=FALSE so a single matching column does not collapse to a
# vector (which would make ncol() return NULL and break the loops).
bound_bootstrap_test_eval_FN_idx <- with(bound_bootstrap_test_eval, diagnosis == "M" & pM_mean < 0.45)
bound_bootstrap_test_eval_FN <- bound_bootstrap_test_eval[bound_bootstrap_test_eval_FN_idx, , drop=FALSE]
proj_marg_FN <- bound_bootstrap$proj_boot[, bound_bootstrap_test_eval_FN_idx, drop=FALSE]
ddist_FN <- bound_bootstrap$ddist_boot[, bound_bootstrap_test_eval_FN_idx, drop=FALSE]
# Quick-look histograms, one per false-negative case.
for (i in seq_len(ncol(ddist_FN))) {
hist(ddist_FN[, i])
}
for (i in seq_len(ncol(proj_marg_FN))) {
hist(proj_marg_FN[, i])
}
# write.csv(bootstrap_bound_results, file="bootstrap_lower_bound_results.csv")
# Safety check for the candidate review thresholds: with proj_marg lower
# bound <= 0 or d_dist lower bound <= -1.2, no malignant case on the
# held-out side would be cleared from review by either rule or by both.
nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", proj_marg_lb <= 0))
## [1] 0
nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", d_dist_lb <= -1.2))
## [1] 0
nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", proj_marg_lb <= 0, d_dist_lb <= -1.2))
## [1] 0
# Fraction of all held-out cases each rule (and their conjunction) would
# still flag -- i.e. the review workload implied by the thresholds.
nrow(bound_bootstrap_test_eval%>%filter(proj_marg_lb > 0)) / nrow(bound_bootstrap_test_eval)
## [1] 0.6040268
nrow(bound_bootstrap_test_eval%>%filter(d_dist_lb > -1.2)) / nrow(bound_bootstrap_test_eval)
## [1] 0.4496644
nrow(bound_bootstrap_test_eval%>%filter(proj_marg_lb > 0 & d_dist_lb > -1.2)) / nrow(bound_bootstrap_test_eval)
## [1] 0.442953
# Final probability forest fit on the full dev set (id column excluded),
# using the hyper-parameters selected by the CV tuning above.
# keep.inbag = TRUE retains per-tree inbag counts (presumably for downstream
# uncertainty estimates -- confirm against the prediction helpers).
rf_training_data <- dev %>% select(-id)
rf_final <- ranger::ranger(
  diagnosis ~ .,
  data = rf_training_data,
  num.trees = 500,
  mtry = 8,
  min.node.size = 5,
  probability = TRUE,
  keep.inbag = TRUE
)
# Final reference fits on all of dev: shrinkage-estimated references (the
# lambda output below) and the drift directions used for proj_marg.
# "BM" presumably refers to the Benign/Malignant classes -- confirm.
ref_final <- fit_references_BM(dev)
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.007
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0615
##
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0369
##
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0721
dirs_final <- fit_drift_directions(dev, ref_final)
# Thresholds chosen from the CV/bootstrap analysis above: a benign-called
# case is flagged for review when d_dist > -1.2 AND proj_marg > 0.
threshold_final <- list(
d_dist_primary = -1.2,
proj_marg_secondary = 0
)
# Uncomment to persist the deployment artifacts.
# saveRDS(rf_final, file="rf_model.rds")
# saveRDS(ref_final, file="gaussian_ref.rds")
# saveRDS(dirs_final, file="drift_dirs.rds")
# saveRDS(threshold_final, file="thresholds.rds")
# Held-out test set: same preprocessing as dev. Changed `test[-1]` to the
# explicit `test[, -1]` form used for dev (same effect on a data.frame, but
# now the two loads read identically).
test <- read.csv(here("test.csv"))
test <- test[, -1]
test$diagnosis <- as.factor(test$diagnosis)
# RF probabilities plus z/projection features, joined per case id.
test_rf_res <- RFPredict(test, rf_final)
test_aug <- augment_with_z_and_projections(test, ref_final, dirs_final)
test_out <- left_join(test_rf_res, test_aug, by="id")
# Partition by RF probability; the three filters cover every case exactly
# once (>= 0.65 / strictly between / <= 0.45).
malignant_region <- test_out%>%filter(pM >= 0.65)
ambiguous_region <- test_out%>%filter(pM > 0.45 & pM < 0.65)
benign_region <- test_out%>%filter(pM <= 0.45)
# pM distribution within the benign-called region, split by true class.
ggplot(benign_region) +
geom_histogram(aes(x=pM, fill=diagnosis), alpha=0.8) +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Confident-malignant region composition: 4 benign vs 69 malignant.
table(malignant_region$diagnosis)
##
## B M
## 4 69
# Two-signal triage inside the benign-called region:
#   confident : d_dist < -1.2 (well inside the benign reference) -> no review
#   risky     : d_dist >= -1.2, split further on proj_marg
#   review    : risky AND proj_marg > 0 -> send to human review
#   non-risky : risky but proj_marg <= 0 -> no review
benign_confident <- benign_region%>%filter(d_dist < -1.2)
benign_risky <- benign_region%>%filter(d_dist >= -1.2)
benign_review <- benign_risky%>%filter(proj_marg > 0)
benign_non_risky <- benign_risky%>%filter(proj_marg <= 0)
benign_non_review <- rbind(benign_confident, benign_non_risky)
# All 3 malignants in the benign region land in the review bucket; the
# non-review buckets contain zero malignants.
table(benign_region$diagnosis)
##
## B M
## 121 3
table(benign_review$diagnosis)
##
## B M
## 34 3
table(benign_risky$diagnosis)
##
## B M
## 60 3
table(benign_non_risky$diagnosis)
##
## B M
## 26 0
table(benign_non_review$diagnosis)
##
## B M
## 87 0
table(ambiguous_region$diagnosis)
##
## B M
## 1 3
# NOTE(review): setdiff(benign_risky, benign_non_risky) is benign_review by
# construction, but dplyr::setdiff also de-duplicates rows; prefer
# nrow(benign_review) unless duplicate rows are expected.
review_rate_benign_region <- nrow(setdiff(benign_risky, benign_non_risky)) / nrow(benign_region)
review_rate_benign_region
## [1] 0.2983871
# Overall workload: review bucket plus the whole ambiguous band, over the
# full test set.
review_rate_overall <- (nrow(setdiff(benign_risky, benign_non_risky)) + nrow(ambiguous_region)) / nrow(test)
review_rate_overall
## [1] 0.2039801