J. Prakash - Portfolio (2026)

Geometry-Based Risk Modeling for Safe ML Inference in Medical Diagnostic Triage

Jithakrishna Prakash | linkedin.com/in/jithakrishna-prakash/


This is the R Notebook accompanying the portfolio methods note. It contains all parts of the work: data analysis, model development, risk-signal computation, the cross-validation loops, and the final test-set output. Core functions are packaged in R scripts referenced in the Data section.

library(here)
library(dplyr)     # %>% pipelines, filter()/select()/joins used throughout
library(ggplot2)   # all plots below

source(here("src","cv_assets.R"))
source(here("src","train_test_split.R"))

set.seed(42)

1. Data

dev <- read.csv(here("dev.csv"))
dev <- dev[,-1]
dev$diagnosis <- as.factor(dev$diagnosis)
nrow(dev%>%filter(worst_perimeter >= 50 & worst_perimeter <= 105))
## [1] 219

Class Overlap and Hidden Failures

ggplot(dev%>%filter(worst_perimeter >= 50 & worst_perimeter <= 150)) +
  geom_point(aes(x=worst_perimeter, y=worst_area, color=diagnosis), alpha=0.8) +
  labs(title="Class Overlap in the BCW dataset") +
  theme_minimal()

folds <- make_strat_folds(dev$diagnosis, k = 5, seed = 42)
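
make_strat_folds() is packaged in src/cv_assets.R; as a reading aid, the sketch below shows the kind of stratified assignment it is assumed to perform: indices are shuffled within each diagnosis class and dealt round-robin into k folds, so the benign/malignant mix is roughly preserved in every fold. The function below is illustrative only, not the packaged implementation.

# Illustrative sketch of stratified fold assignment (the packaged
# make_strat_folds() in src/cv_assets.R is the source of truth).
strat_folds_sketch <- function(y, k = 5, seed = 42) {
  set.seed(seed)
  fold_id <- integer(length(y))
  for (lvl in levels(as.factor(y))) {
    idx <- which(y == lvl)
    fold_id[idx] <- sample(rep_len(seq_len(k), length(idx)))
  }
  fold_id
}
# Per-fold class counts should come out close to balanced:
table(strat_folds_sketch(dev$diagnosis), dev$diagnosis)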

Baseline Performance Comparison

rf_base <- rf_cv_build_calibration_stack(dev, folds=folds, mtry=8, min.node.size=5, num_trees = 500)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
rf_case_weights <- rf_cv_build_calibration_stack_weighted(dev, folds=folds, mtry=8, min.node.size=5, num_trees = 500,
                                                          weight_col="worst_perimeter", lo=65, hi=110,
                                                          w_zone_all=2, w_zone_malignant=5)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
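
The weighted builder shares the same CV scaffolding; under the arguments above it is assumed to hand ranger per-case weights that upweight every case whose worst_perimeter lands in [65, 110] by w_zone_all and malignant cases in that zone by w_zone_malignant. A minimal sketch of that weight construction (illustrative; the packaged function may differ in detail):

# Hypothetical construction of case weights for the overlap zone, mirroring
# the weight_col / lo / hi / w_zone_all / w_zone_malignant arguments above.
zone_weights_sketch <- function(df, weight_col = "worst_perimeter",
                                lo = 65, hi = 110,
                                w_zone_all = 2, w_zone_malignant = 5) {
  in_zone <- df[[weight_col]] >= lo & df[[weight_col]] <= hi
  w <- rep(1, nrow(df))
  w[in_zone] <- w_zone_all
  w[in_zone & df$diagnosis == "M"] <- w_zone_malignant
  w
}
table(zone_weights_sketch(dev))
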
xgboost_base <- xgb_oof_predict(dev, folds=folds)
## XGB fold 1 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 2 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 3 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 4 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
## XGB fold 5 / 5
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
patchwork::wrap_plots(

  ggplot(rf_base%>%filter(pM < 0.4, diagnosis=="M")) +
    geom_histogram(aes(x=pM), fill="skyblue") +
    ylab("True Malignants") +
    labs(title="False Negatives:\nRandom Forest Base") +
    theme_minimal(),
  
  ggplot(rf_case_weights%>%filter(pM < 0.4, diagnosis=="M")) +
    geom_histogram(aes(x=pM), fill="skyblue") +
    ylab("True Malignants") +
    labs(title="\nRF w/ Case Weights in\nworst_perimeter [65,110]") +
    theme_minimal(),
  
  ggplot(xgboost_base%>%filter(pM < 0.4, diagnosis=="M")) +
    geom_histogram(aes(x=pM), fill="skyblue") +
    ylab("True Malignants") +
    labs(title="\nXGBoost Base") +
    theme_minimal(),
  
  nrow=2, ncol=2
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Random Forest Hyperparameter Tuning

param_grid <- expand.grid(
  mtry = c(4, 8, 16, 25),
  min.node.size = c(5, 15)
)

tune_out <- rf_cv_tune_uncertainty(
  dev = dev,
  folds = folds,
  param_grid = param_grid,
  seed = 42,
  verbose = TRUE
)
## Starting integrated CV tuning loop...
## Testing: mtry=4 min.node.size=5
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=8 min.node.size=5
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=16 min.node.size=5
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=25 min.node.size=5
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=4 min.node.size=15
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=8 min.node.size=15
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=16 min.node.size=15
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
## Testing: mtry=25 min.node.size=15
##   Fold 1 / 5
##   Fold 2 / 5
##   Fold 3 / 5
##   Fold 4 / 5
##   Fold 5 / 5
metrics <- tune_out$metrics
metrics
## # A tibble: 8 × 6
##    mtry min.node.size correlation mid_se_sd n_mid   auc
##   <dbl>         <dbl>       <dbl>     <dbl> <int> <dbl>
## 1     8             5      0.0238    0.0465    10 0.987
## 2     4             5      0.0487    0.0447    15 0.986
## 3     4            15      0.0505    0.0398    15 0.988
## 4    25             5      0.0530    0.0622     9 0.986
## 5    25            15      0.0534    0.0795    12 0.986
## 6    16            15      0.0556    0.0656    10 0.986
## 7    16             5      0.0744    0.0715    11 0.986
## 8     8            15      0.0857    0.0519    13 0.987
master_results <- tune_out$master_results
top_combos <- head(metrics, 4)
master_results %>%
  inner_join(top_combos, by = c("mtry", "min.node.size")) %>%
  ggplot(aes(x = pM, y = pM_se)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~paste0("mtry=", mtry, " | node=", min.node.size)) +
  theme_minimal() +
  ggtitle("Uncertainty Triangles (Top Combos)")

bottom_combos <- tail(metrics, 4)
master_results %>%
  inner_join(bottom_combos, by = c("mtry", "min.node.size")) %>%
  ggplot(aes(x = pM, y = pM_se)) +
  geom_point(alpha = 0.3) +
  facet_wrap(~paste0("mtry=", mtry, " | node=", min.node.size)) +
  theme_minimal() +
  ggtitle("Uncertainty Triangles (Bottom Combos)")

Five-Fold Cross-Validation Loop for RF Output and Risk Signals

calibration_z_stack <- rf_cv_build_calibration_stack(
  dev = dev,
  folds = folds,
  mtry = 8,
  min.node.size = 5,
  seed = 42,
  add_cosine=TRUE,
  verbose = TRUE
)
## Fold 1 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.009 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.074 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0423 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0844
## Fold 2 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.008 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0751 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0491 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0906
## Fold 3 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0094 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0561 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0973
## Fold 4 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0086 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0545 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0554 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0883
## Fold 5 / 5
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0091 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0801 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0153 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0696
patchwork::wrap_plots(
  ggplot(data=calibration_z_stack%>%filter(d_dist < 80)) +
  geom_point(aes(x=qlogis(pM), y=d_dist, color=diagnosis), alpha=0.8) +
  xlab("") +
  ylab("d_dist") +
  labs(title="d_dist and proj_marg vs logit(pM)") +
  theme_minimal(),
  
  ggplot(data=calibration_z_stack) +
  geom_point(aes(x=qlogis(pM), y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("proj_marg") +
  theme_minimal(),
  
  nrow=2
)

ggplot(data=calibration_z_stack) +
  geom_histogram(aes(x=pM, fill=diagnosis), alpha=0.8) +
  labs(title="pM (RF probability of Malignancy)") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=calibration_z_stack) +
  geom_point(aes(x=qlogis(pM), y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("Marg. Z Proj.") +
  labs(title="logit(pM) vs Marg. Z Proj.") +
  theme_minimal()

ggplot(data=calibration_z_stack%>%filter(qlogis(pM) < -0.2)) +
  geom_point(aes(x=qlogis(pM), y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("Marg. Z Proj.") +
  labs(title="Confident Region (pM < 0.45): logit(pM) vs Marg. Z Proj.") +
  theme_minimal()

ggplot(data=calibration_z_stack%>%filter(d_dist < 80)) +
  geom_point(aes(x=qlogis(pM), y=d_dist, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("d_dist") +
  labs(title="d_dist vs logit(pM)") +
  theme_minimal()

ggplot(data=calibration_z_stack%>%filter(qlogis(pM) < -0.2)) +
  geom_point(aes(x=qlogis(pM), y=d_dist, color=diagnosis), alpha=0.8) +
  xlab("logit(pM)") +
  ylab("d_dist") +
  labs(title="Confident Region (pM < 0.45): d_dist vs logit(pM)") +
  theme_minimal()

ggplot(data=calibration_z_stack%>%filter(d_dist < 80)) +
  geom_point(aes(x=d_dist, y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("d_dist") +
  ylab("proj_marg") +
  labs(title="proj_marg vs d_dist") +
  theme_minimal()

ggplot(data=calibration_z_stack%>%filter(qlogis(pM) < -0.2)) +
  geom_point(aes(x=d_dist, y=proj_marg, color=diagnosis), alpha=0.8) +
  xlab("d_dist") +
  ylab("proj_marg") +
  labs(title="Confident Region (pM < 0.45): proj_marg vs d_dist") +
  theme_minimal()

calibration_z_stack%>%filter(pM < 0.45, diagnosis=="M")%>%arrange(d_dist)%>%select(pM, d_dist, proj_marg)
##             pM     d_dist proj_marg
## 333 0.00100000 -0.2569794 0.7297937
## 211 0.01046667 -0.0818665 1.3431847
## 37  0.34900000  0.1929986 5.4486465
## 21  0.31773333  0.8091095 4.1962350
## 72  0.44130000  0.8688281 4.9711317
## 286 0.23943333  1.3653573 4.0788680
## 278 0.42743333  1.5288178 4.9051828
## 48  0.34320000  1.7813111 3.3403494
## 4   0.37356667  2.0936859 1.5321755
## 14  0.36960000  2.3292545 4.9515052
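
Every one of these cross-validated false negatives sits above the signal cutoffs chosen later (d_dist >= -1.2 together with proj_marg > 0), which is exactly the behaviour the triage rule needs from the geometry. A quick check, using the -1.2 and 0 values that the Threshold Selection section settles on:

# Count the low-pM malignants that the two signals would flag for review.
calibration_z_stack %>%
  filter(pM < 0.45, diagnosis == "M") %>%
  summarise(n_false_negatives = n(),
            n_flagged = sum(d_dist >= -1.2 & proj_marg > 0))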

Signal Stability Analysis and Threshold Selection

bound_bootstrap_data <- enriched_split(dev, dev_frac = 0.60, rare_filter = quote(worst_perimeter >= 50 & worst_perimeter <= 95), seed=7)
bound_bootstrap_train <- bound_bootstrap_data$dev

bound_bootstrap_test <- bound_bootstrap_data$test
nrow(bound_bootstrap_train%>%filter(worst_perimeter >= 50, worst_perimeter <= 95))
## [1] 100
nrow(bound_bootstrap_test%>%filter(worst_perimeter >= 50, worst_perimeter <= 95))
## [1] 69
ggplot(bound_bootstrap_train%>%filter(worst_perimeter >= 50 & worst_perimeter <= 110)) +
  geom_point(aes(x=worst_perimeter, y=worst_area, color=diagnosis), alpha=0.8) +
  theme_minimal()

ggplot(bound_bootstrap_test%>%filter(worst_perimeter >= 50 & worst_perimeter <= 110)) +
  geom_point(aes(x=worst_perimeter, y=worst_area, color=diagnosis), alpha=0.8) +
  theme_minimal()

bound_bootstrap <- bootstrap_proj_and_ddist(bound_bootstrap_train, bound_bootstrap_test, B=300, seed=7)
bootstrap_summary_values <- list(
  pM_mean = colMeans(bound_bootstrap$pM_boot),
  proj_marg_lb = apply(bound_bootstrap$proj_boot, 2, quantile, 0.05),
  proj_marg_ub = apply(bound_bootstrap$proj_boot, 2, quantile, 0.95),
  d_dist_lb = apply(bound_bootstrap$ddist_boot, 2, quantile, 0.05),
  d_dist_ub = apply(bound_bootstrap$ddist_boot, 2, quantile, 0.95)
)
bound_bootstrap_test_eval <- cbind(bound_bootstrap_test, bootstrap_summary_values)
bootstrap_bound_results <- bound_bootstrap_test_eval%>%filter(diagnosis=="M", pM_mean < 0.45)%>%arrange(pM_mean)%>%select(id, diagnosis, pM_mean, proj_marg_lb, proj_marg_ub, d_dist_lb, d_dist_ub)
bootstrap_bound_results
##         id diagnosis    pM_mean proj_marg_lb proj_marg_ub  d_dist_lb d_dist_ub
## 1   892189         M 0.08459444    1.1281514     2.682305 -1.0999856 1.0586723
## 2   859983         M 0.11484467    2.9426902     4.113895 -0.5259265 1.0286532
## 3    90291         M 0.15533089    3.5109370     4.617933  0.6207670 2.1799682
## 4   855563         M 0.22664622    0.3225484     1.388728  0.5585505 2.0485803
## 5   886452         M 0.23258867    4.3050179     5.513227 -0.5423223 0.8360126
## 6 91979701         M 0.35898789    2.7766813     4.096138  0.7367100 2.6106656
## 7   875263         M 0.42206189    4.0894631     5.438537  2.1184344 4.1272260
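
bootstrap_proj_and_ddist() is packaged with the other helpers; it returns B draws of pM, proj_marg and d_dist for every case in the held-out split (one column per case), which the quantile summaries above turn into 90% bands. The loop below sketches the assumed resampling pattern for the two geometric signals only; the pM draws would follow the same pattern with an RF refit, and the packaged function remains the source of truth.

# Assumed shape of the bootstrap: refit the reference assets on a resample
# of the training split, score the fixed test split, repeat B times.
B <- 300
n_test <- nrow(bound_bootstrap_test)
proj_boot_sketch  <- matrix(NA_real_, nrow = B, ncol = n_test)
ddist_boot_sketch <- matrix(NA_real_, nrow = B, ncol = n_test)
for (b in seq_len(B)) {
  boot_idx   <- sample(nrow(bound_bootstrap_train), replace = TRUE)
  boot_train <- bound_bootstrap_train[boot_idx, , drop = FALSE]
  ref_b      <- fit_references_BM(boot_train)
  dirs_b     <- fit_drift_directions(boot_train, ref_b)
  aug_b      <- augment_with_z_and_projections(bound_bootstrap_test, ref_b, dirs_b)
  proj_boot_sketch[b, ]  <- aug_b$proj_marg   # assumes row order matches the test split
  ddist_boot_sketch[b, ] <- aug_b$d_dist
}
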
bound_bootstrap_test_eval_FN_idx <- with(bound_bootstrap_test_eval, diagnosis == "M" & pM_mean < 0.45)
bound_bootstrap_test_eval_FN <- bound_bootstrap_test_eval[bound_bootstrap_test_eval_FN_idx, , drop=FALSE]
proj_marg_FN <- bound_bootstrap$proj_boot[, bound_bootstrap_test_eval_FN_idx]
ddist_FN <- bound_bootstrap$ddist_boot[, bound_bootstrap_test_eval_FN_idx]

# Per-case bootstrap distributions for the flagged false negatives
for (i in seq_len(ncol(ddist_FN))) {
  hist(ddist_FN[, i])
}

for (i in seq_len(ncol(proj_marg_FN))) {
  hist(proj_marg_FN[, i])
}

# write.csv(bootstrap_bound_results, file="bootstrap_bound_results.csv")

Threshold Selection

nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", proj_marg_lb <= 0))
## [1] 0
nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", d_dist_lb <= -1.2))
## [1] 0
nrow(bound_bootstrap_test_eval%>%filter(diagnosis == "M", proj_marg_lb <= 0, d_dist_lb <= -1.2))
## [1] 0
nrow(bound_bootstrap_test_eval%>%filter(proj_marg_lb > 0)) / nrow(bound_bootstrap_test_eval)
## [1] 0.6040268
nrow(bound_bootstrap_test_eval%>%filter(d_dist_lb > -1.2)) / nrow(bound_bootstrap_test_eval)
## [1] 0.4496644
nrow(bound_bootstrap_test_eval%>%filter(proj_marg_lb > 0 & d_dist_lb > -1.2)) / nrow(bound_bootstrap_test_eval)
## [1] 0.442953
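
The three zero counts above say that, using the bootstrap lower bounds, no malignant in this held-out split clears either cutoff; the three fractions are the share of the split whose lower-bounded signals exceed each cutoff. A small sweep over candidate d_dist cutoffs makes the safety/coverage trade-off explicit (an illustrative check, not a packaged routine; -1.2 is the value carried into threshold_final below):

# Sweep candidate d_dist lower-bound cutoffs: malignants that would be
# cleared (want zero) versus the share of the split cleared. Illustrative.
cutoff_sweep <- sapply(c(-2, -1.6, -1.2, -0.8, -0.4), function(thr) {
  cleared <- bound_bootstrap_test_eval %>% filter(d_dist_lb <= thr)
  c(cutoff            = thr,
    malignant_cleared = sum(cleared$diagnosis == "M"),
    share_cleared     = nrow(cleared) / nrow(bound_bootstrap_test_eval))
})
t(cutoff_sweep)
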
rf_final <- ranger::ranger(
      diagnosis ~ .,
      data = dev%>%select(-id),
      mtry = 8,
      min.node.size = 5,
      keep.inbag = TRUE,
      probability = TRUE,
      num.trees = 500
)
ref_final <- fit_references_BM(dev)
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.007 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0615 
## 
## Estimating optimal shrinkage intensity lambda.var (variance vector): 0.0369 
## 
## Estimating optimal shrinkage intensity lambda (correlation matrix): 0.0721
dirs_final <- fit_drift_directions(dev, ref_final)
threshold_final <- list(
  d_dist_primary = -1.2,
  proj_marg_secondary = 0
)
# saveRDS(rf_final, file="rf_model.rds")
# saveRDS(ref_final, file="gaussian_ref.rds")
# saveRDS(dirs_final, file="drift_dirs.rds")
# saveRDS(threshold_final, file="thresholds.rds")
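
For deployment, the four saved assets combine into one triage decision per case. The helper below is an illustrative restatement of the rule used in the test-set section that follows (pM >= 0.65 malignant work-up, 0.45 < pM < 0.65 ambiguous review, otherwise benign with the two signals deciding between clearance and review); the function name is hypothetical, not part of the packaged scripts.

# Illustrative triage rule; regions and cutoffs mirror the test-set
# evaluation in the next section.
triage_decision <- function(pM, d_dist, proj_marg, thr = threshold_final) {
  dplyr::case_when(
    pM >= 0.65                          ~ "malignant_workup",
    pM > 0.45                           ~ "ambiguous_review",
    d_dist < thr$d_dist_primary         ~ "benign_cleared",
    proj_marg > thr$proj_marg_secondary ~ "benign_review",
    TRUE                                ~ "benign_cleared"
  )
}
# Applied to the test output built below, e.g.:
# table(triage_decision(test_out$pM, test_out$d_dist, test_out$proj_marg))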

Test Set Performance

test <- read.csv(here("test.csv"))
test <- test[,-1]
test$diagnosis <- as.factor(test$diagnosis)
test_rf_res <- RFPredict(test, rf_final)
test_aug <- augment_with_z_and_projections(test, ref_final, dirs_final)
test_out <- left_join(test_rf_res, test_aug, by="id")
malignant_region <- test_out%>%filter(pM >= 0.65)
ambiguous_region <- test_out%>%filter(pM > 0.45 & pM < 0.65)
benign_region <- test_out%>%filter(pM <= 0.45)
ggplot(benign_region) +
  geom_histogram(aes(x=pM, fill=diagnosis), alpha=0.8) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(malignant_region$diagnosis)
## 
##  B  M 
##  4 69
benign_confident <- benign_region%>%filter(d_dist < -1.2)
benign_risky <- benign_region%>%filter(d_dist >= -1.2)
benign_review <- benign_risky%>%filter(proj_marg > 0)

benign_non_risky <- benign_risky%>%filter(proj_marg <= 0)
benign_non_review <- rbind(benign_confident, benign_non_risky)
table(benign_region$diagnosis)
## 
##   B   M 
## 121   3
table(benign_review$diagnosis)
## 
##  B  M 
## 34  3
table(benign_risky$diagnosis)
## 
##  B  M 
## 60  3
table(benign_non_risky$diagnosis)
## 
##  B  M 
## 26  0
table(benign_non_review$diagnosis)
## 
##  B  M 
## 87  0
table(ambiguous_region$diagnosis)
## 
## B M 
## 1 3
review_rate_benign_region <- nrow(benign_review) / nrow(benign_region)
review_rate_benign_region
## [1] 0.2983871
review_rate_overall <- (nrow(benign_review) + nrow(ambiguous_region)) / nrow(test)
review_rate_overall
## [1] 0.2039801
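
The region tables above can also be rolled up into a single summary of counts and malignant hits per triage outcome (the display names here are just labels):

# One-line-per-region summary of the test-set triage outcome.
regions <- list(
  malignant_workup = malignant_region,
  ambiguous_review = ambiguous_region,
  benign_review    = benign_review,
  benign_cleared   = benign_non_review
)
region_summary <- dplyr::bind_rows(lapply(names(regions), function(nm) {
  data.frame(region      = nm,
             n           = nrow(regions[[nm]]),
             n_malignant = sum(regions[[nm]]$diagnosis == "M"))
}))
region_summary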