Evidence Space Transformation
risk_spec : Object to store function argument values
for evidence space generation
fit : Learn feature-wise marginal likelihoods, and
geometries and eigenmodes of class manifolds in evidence space
loglik_matrices : Compute marginal positive-class
and negative-class evidences, and marginal relative evidence for input
data using fitted evidence generator
score_risk : Compute total feature-wise evidence,
distance constrast, drift projection, and eigenmode energies
Generate Evidence Space
bcw_spec <- risk_spec(
y_col="diagnosis", positive="M",
features = setdiff(names(bcw_train), c("diagnosis", "id")),
alpha = alpha,
laplace = 1, ridge = 1e-6, winsor_p=0.01,
weights=FALSE,
weight_method = "mutual_info",
numeric_candidates = c("gaussian", "lognormal", "gamma"),
numeric_val_frac = 0.2,
numeric_min_n = 25,
llr_cap_quantile = 0.01,
mi_nbins = 10
)
bcw_obj <- fit(bcw_train%>%select(-id), bcw_spec, k_eigen=2, k_energy=2, energy_ref="both")
head(print_feature_likelihoods(bcw_obj), 5)
## feature class feature_type likelihood_family
## 1 mean_radius positive numeric gamma
## 2 mean_radius negative numeric gamma
## 3 mean_texture positive numeric lognormal
## 4 mean_texture negative numeric lognormal
## 5 mean_perimeter positive numeric lognormal
## parameters
## 1 shape=31.2318, rate=1.7828
## 2 shape=61.2763, rate=5.0253
## 3 meanlog=3.0542, sdlog=0.1419
## 4 meanlog=2.8619, sdlog=0.2097
## 5 meanlog=4.7341, sdlog=0.1847
bcw_Lval <- loglik_matrices(bcw_val%>%select(-id), bcw_obj$fit, alpha=bcw_spec$alpha)
# val/test scoring
bcw_scores_val <- bind_cols(id=bcw_val$id,
diagnosis=bcw_val$diagnosis,
score_risk(bcw_Lval$l_pos, bcw_Lval$l_neg, bcw_obj$fit$weights, bcw_obj$geom,
bcw_spec$alpha, bcw_spec$eps))
Weighted evidence matrix using optional weights. Default weights
(when selected) : KL separation between positive and negative classes
per evidence
bcw_Lval_w <- apply_llr_weights(bcw_Lval$L, bcw_obj$fit$weights)
Learn baseline Random Forest and predict
bcw_rf_train_1 <- ranger(formula = diagnosis ~ ., data=bcw_train%>%select(-id),
mtry=8, min.node.size=5, num.trees=500, probability = TRUE,
keep.inbag = TRUE)
bcw_scores_val <- bind_cols(bcw_scores_val,
pM = predict(bcw_rf_train_1, bcw_val%>%select(-id, -diagnosis))$predictions[,"M"])
Feature Importance using ratio of feature-wise evidences
feature_importance(df=bcw_train%>%select(-id), y_col="diagnosis", fit = bcw_obj$fit, method = "mutual_info", top_n = 15)
## feature weight abs_weight
## 1 worst_perimeter 0.06802118 0.06802118
## 2 worst_area 0.06720117 0.06720117
## 3 worst_radius 0.06703131 0.06703131
## 4 mean_concave_points 0.06541001 0.06541001
## 5 worst_concave_points 0.06488235 0.06488235
## 6 mean_area 0.06011327 0.06011327
## 7 mean_radius 0.05935002 0.05935002
## 8 mean_perimeter 0.05923074 0.05923074
## 9 mean_concavity 0.05640648 0.05640648
## 10 worst_concavity 0.05002912 0.05002912
## 11 se_area 0.04956444 0.04956444
## 12 se_radius 0.03496687 0.03496687
## 13 worst_compactness 0.03488927 0.03488927
## 14 mean_compactness 0.03328922 0.03328922
## 15 se_perimeter 0.03231747 0.03231747
Sample from evidence output
l_pos : Sum of marginal positive evidence
l_neg : Sum of marginal negative evidence
l : Sum of marginal evidence (difference of positive
and negative evidence)
proj : Projection of benign-relative z scores of
evidences on mean deviation class separation direction learned from
training
d_dist : Difference of Mahalanobis Distances of
evidence vector from negative-class evidence manifold and positive-class
evidence manifold
E_pos : Total energy (sum of squares of projections
over k principal components of positive-class) of evidence vector
pM : Random Forest probability score for
positive-class
head(bcw_scores_val, 5)
## id diagnosis l_pos l_neg l t proj
## 1 913512 B -7.181215 25.174531 -32.355746 17.993315 -0.014474594
## 2 863031 B -1.687254 19.441711 -21.128966 17.754457 0.002029465
## 3 855138 M 7.536457 -2.010215 9.546673 5.526242 0.241179508
## 4 883270 B -8.495305 2.749076 -11.244380 -5.746229 0.166289857
## 5 921362 B -62.346930 -31.269338 -31.077592 -93.616268 -0.068812180
## d_dist l_norm E_pos E_neg dE eig_1 eig_2
## 1 -4.640003 8.647529 0.3514493 0.9006716 -0.5492223 7.842260 0.5472624
## 2 -2.529992 8.003247 5.5361752 6.5353216 -0.9991463 6.762143 2.8967998
## 3 6.814686 10.884689 78.3435176 152.1444580 -73.8009404 -1.541281 5.3848165
## 4 -2.850673 5.250365 26.9890565 35.6939991 -8.7049426 1.924869 0.2263808
## 5 -8.426509 13.472078 24.8759126 17.9959150 6.8799976 9.493974 5.1267741
## pM
## 1 0.00740000
## 2 0.03666667
## 3 0.68913333
## 4 0.29660000
## 5 0.10830000
ggplot(bcw_scores_val) +
geom_histogram(aes(x=l, fill=diagnosis), alpha=0.6) +
xlab("Sum of relative marginal evidence from all features") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ggplot(bcw_scores_val) +
geom_point(aes(x=l, y=pM, color=diagnosis), alpha=0.7) +
stat_smooth(aes(x=l, y=as.numeric(diagnosis)-1), method = "glm", # Specify Generalized Linear Model
method.args = list(family = "binomial"), # Specify logistic regression
se = TRUE) +
ylab("Random forest malignancy probability (pM)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(bcw_scores_val) +
geom_point(aes(x=qlogis(pM), y=l, color=diagnosis), alpha=0.7) +
xlab("log-odds of random forest malignancy probability (pM)") +
theme_minimal()

ggplot(bcw_scores_val) +
geom_point(aes(x=qlogis(pM), y=d_dist, color=diagnosis), alpha=0.7) +
xlab("log-odds of random forest malignancy probability (pM)") +
theme_minimal()

ggplot(bcw_scores_val) +
geom_point(aes(x=qlogis(pM), y=proj, color=diagnosis), alpha=0.7) +
xlab("log-odds of random forest malignancy probability (pM)") +
theme_minimal()

ggplot(bcw_scores_val) +
geom_point(aes(x=d_dist, y=proj, color=diagnosis), alpha=0.7) +
theme_minimal()

ggplot(bcw_scores_val%>%filter(d_dist < 20)) +
geom_point(aes(x=d_dist, y=proj, color=diagnosis), alpha=0.7) +
theme_minimal()

patchwork::wrap_plots(
ggplot(bcw_scores_val) +
geom_point(aes(x=d_dist, y=E_pos, color=diagnosis), alpha=0.7) +
theme_minimal(),
ggplot(bcw_scores_val) +
geom_point(aes(x=proj, y=E_pos, color=diagnosis), alpha=0.7) +
theme_minimal()
) + patchwork::plot_layout(guides="collect")

patchwork::wrap_plots(
ggplot(bcw_scores_val%>%filter(d_dist < 10, E_pos < 400)) +
geom_point(aes(x=d_dist, y=E_pos, color=diagnosis), alpha=0.7) +
theme_minimal(),
ggplot(bcw_scores_val%>%filter(d_dist < 10, E_pos < 400)) +
geom_point(aes(x=proj, y=E_pos, color=diagnosis), alpha=0.7) +
theme_minimal()
) + patchwork::plot_layout(guides="collect")

bcw_val_L <- bind_cols(diagnosis=bcw_val$diagnosis,
bcw_Lval$L)
Case-wise Feature Importance on validation set using unweighted
relative evidence
Malignant cases
#apply(bcw_val_L, 1, function (x) {max(abs(x)) / sum(abs(x))})
bcw_val_M_dom_l <- apply(bcw_val_L%>%filter(diagnosis=="M")%>%select(-diagnosis), 1,
function (x) {list(dom_l = sum(head(sort(abs(x), decreasing=TRUE),3)) / sum(abs(x)),
dom_l_names = names(head(sort(abs(x), decreasing=TRUE),3)))})
bcw_val_M_dom_l <- bcw_val_M_dom_l[order(sapply(bcw_val_M_dom_l, function(x) x$dom_l), decreasing=TRUE)]
head(bcw_val_M_dom_l, 3)
## [[1]]
## [[1]]$dom_l
## [1] 0.4986962
##
## [[1]]$dom_l_names
## [1] "worst_radius" "worst_perimeter" "worst_area"
##
##
## [[2]]
## [[2]]$dom_l
## [1] 0.4476173
##
## [[2]]$dom_l_names
## [1] "worst_concavity" "mean_concavity"
## [3] "worst_fractal_dimension"
##
##
## [[3]]
## [[3]]$dom_l
## [1] 0.4400803
##
## [[3]]$dom_l_names
## [1] "worst_radius" "worst_perimeter" "worst_area"
Benign cases
bcw_val_B_dom_l <- sort(apply(bcw_val_L%>%filter(diagnosis=="B")%>%select(-diagnosis), 1,
function (x) {sum(head(sort(abs(x), decreasing=TRUE),3)) / sum(abs(x))}), decreasing=TRUE)
head(bcw_val_B_dom_l, 3)
## [1] 0.5925254 0.4787052 0.4625822
tail(bcw_val_B_dom_l, 3)
## [1] 0.2135967 0.2131890 0.2097654
Eigenmode Analysis (PCA) on validation set
Eigenmodes of malignant subset
bcw_Lval_w_M <- bcw_Lval_w[bcw_val$diagnosis == "M", , drop = FALSE]
bcw_Lval_w_Sigma_M <- cov(bcw_Lval_w_M)
bcw_Lval_w_Sigma_M <- bcw_Lval_w_Sigma_M + diag(1e-6, ncol(bcw_Lval_w_Sigma_M))
bcw_Lval_w_eig_M <- eigen(bcw_Lval_w_Sigma_M, symmetric = TRUE)
bcw_Lval_w_eigvals_M <- bcw_Lval_w_eig_M$values
bcw_Lval_w_eigvecs_M <- bcw_Lval_w_eig_M$vectors
bcw_Lval_w_coords_M <- bcw_Lval_w_M %*% bcw_Lval_w_eigvecs_M[, 1:2]
Variance Explained
bcw_Lval_w_eigvals_M[1:2] / sum(bcw_Lval_w_eigvals_M[1:2])
## [1] 0.8062571 0.1937429
Top 5 feature loadings
decompose_eigenmode(bcw_Lval_w_eigvecs_M, k=1, feature_names = bcw_obj$fit$features, top_n=5)
## # A tibble: 5 × 3
## feature loading abs_loading
## <chr> <dbl> <dbl>
## 1 mean_concave_points -0.482 0.482
## 2 worst_radius -0.454 0.454
## 3 mean_concavity -0.382 0.382
## 4 worst_perimeter -0.314 0.314
## 5 worst_area -0.227 0.227
decompose_eigenmode(bcw_Lval_w_eigvecs_M, k=2, feature_names = bcw_obj$fit$features, top_n=5)
## # A tibble: 5 × 3
## feature loading abs_loading
## <chr> <dbl> <dbl>
## 1 mean_concavity 0.408 0.408
## 2 worst_concavity 0.371 0.371
## 3 mean_compactness 0.362 0.362
## 4 worst_radius -0.356 0.356
## 5 worst_concave_points 0.230 0.230
Eigenmode Plot
plot(bcw_Lval_w_coords_M[,1], bcw_Lval_w_coords_M[,2])

Eigenmodes of Benign subset
bcw_Lval_w_B <- bcw_Lval_w[bcw_val$diagnosis == "B", , drop = FALSE]
bcw_Lval_w_Sigma_B <- cov(bcw_Lval_w_B)
bcw_Lval_w_Sigma_B <- bcw_Lval_w_Sigma_B + diag(1e-6, ncol(bcw_Lval_w_Sigma_B))
bcw_Lval_w_eig_B <- eigen(bcw_Lval_w_Sigma_B, symmetric = TRUE)
bcw_Lval_w_eigvals_B <- bcw_Lval_w_eig_B$values
bcw_Lval_w_eigvecs_B <- bcw_Lval_w_eig_B$vectors
bcw_Lval_w_coords_B <- bcw_Lval_w_B %*% bcw_Lval_w_eigvecs_B[, 1:2]
Variance Explained
bcw_Lval_w_eigvals_B[1:2] / sum(bcw_Lval_w_eigvals_B[1:2])
## [1] 0.7269915 0.2730085
Top 5 feature loadings
decompose_eigenmode(bcw_Lval_w_eigvecs_B, k=1, feature_names = bcw_obj$fit$features, top_n=5)
## # A tibble: 5 × 3
## feature loading abs_loading
## <chr> <dbl> <dbl>
## 1 mean_concavity 0.686 0.686
## 2 mean_compactness 0.355 0.355
## 3 worst_concavity 0.319 0.319
## 4 worst_concave_points 0.264 0.264
## 5 mean_concave_points 0.255 0.255
decompose_eigenmode(bcw_Lval_w_eigvecs_B, k=2, feature_names = bcw_obj$fit$features, top_n=5)
## # A tibble: 5 × 3
## feature loading abs_loading
## <chr> <dbl> <dbl>
## 1 worst_area 0.420 0.420
## 2 mean_perimeter 0.400 0.400
## 3 mean_area 0.391 0.391
## 4 worst_perimeter 0.327 0.327
## 5 mean_radius 0.315 0.315
Eigenmode Plot
plot(bcw_Lval_w_coords_B[,1], bcw_Lval_w_coords_B[,2])

Cosine Similarity of Eigenmode 1 Malignant and Eigenmode 1
Benign
acos((t(bcw_Lval_w_eigvecs_B[,1]) %*% bcw_Lval_w_eigvecs_M[,1]) / (sqrt(t(bcw_Lval_w_eigvecs_B[,1]) %*% bcw_Lval_w_eigvecs_B[,1]) * sqrt(t(bcw_Lval_w_eigvecs_M[,1]) %*% bcw_Lval_w_eigvecs_M[,1]))) * 180 / pi
## [,1]
## [1,] 131.5815
Fitting Evidence Model and Random Forest on train+val set
bcw_obj_2 <- fit(bind_rows(bcw_train%>%select(-id),
bcw_val%>%select(-id)), bcw_spec, k_eigen=2, k_energy=2, energy_ref="both")
bcw_Lval_test <- loglik_matrices(bcw_test%>%select(-id), bcw_obj_2$fit, alpha=bcw_spec$alpha)
# val/test scoring
bcw_scores_test <- bind_cols(id=bcw_test$id,
diagnosis=bcw_test$diagnosis,
score_risk(bcw_Lval_test$l_pos, bcw_Lval_test$l_neg, bcw_obj_2$fit$weights, bcw_obj_2$geom,
bcw_spec$alpha, bcw_spec$eps))
head(print_feature_likelihoods(bcw_obj_2), 5)
## feature class feature_type likelihood_family
## 1 mean_radius positive numeric gamma
## 2 mean_radius negative numeric gamma
## 3 mean_texture positive numeric lognormal
## 4 mean_texture negative numeric lognormal
## 5 mean_perimeter positive numeric gaussian
## parameters
## 1 shape=31.6265, rate=1.8164
## 2 shape=54.4581, rate=4.4767
## 3 meanlog=3.0583, sdlog=0.1598
## 4 meanlog=2.8716, sdlog=0.2073
## 5 mean=114.8236, sd=21.1460
feature_importance(df=bind_rows(bcw_train%>%select(-id),
bcw_val%>%select(-id)), y_col="diagnosis", fit = bcw_obj_2$fit, method = "mutual_info", top_n = 15)
## feature weight abs_weight
## 1 worst_radius 0.07318007 0.07318007
## 2 worst_area 0.07300859 0.07300859
## 3 worst_perimeter 0.07184495 0.07184495
## 4 worst_concave_points 0.06896854 0.06896854
## 5 mean_concave_points 0.06589216 0.06589216
## 6 mean_perimeter 0.06134988 0.06134988
## 7 mean_radius 0.05846728 0.05846728
## 8 mean_area 0.05846728 0.05846728
## 9 mean_concavity 0.05498769 0.05498769
## 10 se_area 0.05403280 0.05403280
## 11 worst_concavity 0.04891966 0.04891966
## 12 se_radius 0.03747952 0.03747952
## 13 worst_compactness 0.03286711 0.03286711
## 14 se_perimeter 0.03233489 0.03233489
## 15 mean_compactness 0.02980909 0.02980909
bcw_rf_train_2 <- ranger(formula = diagnosis ~ ., data=bind_rows(bcw_train%>%select(-id), bcw_val%>%select(-id)),
mtry=8, min.node.size=5, num.trees=500, probability = TRUE,
keep.inbag = TRUE)
Comment Block : Data prep from BCW raw data