PERSIMUNE
diff --git a/R/SHAPclust.R b/R/SHAPclust.R
Lines changed: 49 additions & 45 deletions
diff --git a/R/eCM_plot.R b/R/eCM_plot.R
Lines changed: 6 additions & 11 deletions
diff --git a/R/eSHAP_plot.R b/R/eSHAP_plot.R
Lines changed: 64 additions & 37 deletions
@@ -115,35 +115,34 @@ SHAPclust <- function(task,
                       algorithm="Hartigan-Wong",
                       iter.max = 1000
                       ){
-
-  prediction_correctness <- NULL
-  truth <- NULL
-  response <- NULL
+  cluster <- NULL
+  correct_prediction <- NULL
+  feature <- NULL
+  f_val <- NULL
   fval <- NULL
-  variable <- NULL
   mean_absolute_shap <- NULL
-  feature <- NULL
-  value <- NULL
+  mean_phi <- NULL
+  Phi <- NULL
+  pred_class <- NULL
+  pred_prob <- NULL
+  prediction_correctness <- NULL
+  response <- NULL
   sample_num <- NULL
-  cluster <- NULL
+  truth <- NULL
+  unscaled_f_val <- NULL
+  value <- NULL
+  variable <- NULL
+
   mydata <- task$data()
   # randomly subset the target variable and the corresponding rows
-  if (subset < 1) {
-    set.seed(seed) # set seed for reproducibility
-    n <- round(subset * length(splits$test))
-    target_index <- sample(splits$test, size = n, replace = FALSE)
-    mydata <- mydata[target_index, ]
-
-    # do the prediction for the test set
-    pred_results <- trained_model$predict(task,target_index)
 
-  } else {
-    mydata <- mydata[splits$test, ]
+  set.seed(seed) # set seed for reproducibility
+  n <- round(subset * length(splits$test))
+  target_index <- sample(splits$test, size = n, replace = FALSE)
+  mydata <- mydata[target_index, ]
 
-    # do the prediction for the test set
-    pred_results <- trained_model$predict(task,splits$test)
-
-  }
+  # do the prediction for the test set
+  pred_results <- trained_model$predict(task,target_index)
 
   # the test set based on the data split is used to calculate SHAP values
   test_set <- as.data.frame(mydata)
@@ -165,39 +164,44 @@ SHAPclust <- function(task,
   shap_Mean_wide_kmeans$row_ids <- shap_Mean_wide_kmeans$row_ids - shap_Mean_wide_kmeans$row_ids[1] + 1
   shap_Mean_wide_kmeans[, prediction_correctness := (truth == response)]
   shap_Mean_wide_kmeans_forCM <- shap_Mean_wide_kmeans
-  shap_Mean_wide_kmeans[,c(1,2,3,4,5)] <- NULL
+
+  shap_Mean_wide_kmeans[,c(1,2,5)] <- NULL # ,3,4
+  colnames(shap_Mean_wide_kmeans)[2] <- "prob_positive_class"
   variables_for_long_format <- colnames(shap_Mean_wide_kmeans)
 
-  variables_for_long_format <- variables_for_long_format[!variables_for_long_format %in% c(colnames(pred_results), "sample_num", "prediction_correctness", "cluster")]
+  variables_for_long_format <- variables_for_long_format[!variables_for_long_format %in% c("sample_num", "prediction_correctness", "cluster","response","prob_positive_class")]
 
   # Melt the data.table from wide to long format
-  dt_long <- data.table::melt(shap_Mean_wide_kmeans, id.vars = c("sample_num","prediction_correctness","cluster"),
+  dt_long <- data.table::melt(shap_Mean_wide_kmeans,
+                              id.vars = c("sample_num", "prediction_correctness", "cluster","response","prob_positive_class"),
                               measure.vars = variables_for_long_format,
                               variable.name = "variable",
                               value.name = "value")
 
-  dt_long$fval <- NA
-  dt_long_vars <- as.character(dt_long$variable)
-  for (i in 1:nrow(dt_long)){
-    dt_long$mean_absolute_shap[i] <- mean(abs(dt_long$value[dt_long$variable==dt_long$variable[i]]))
-    idx <- which(row(test_set)[,1]==dt_long$sample_num[i])
-    dt_long$fval[i] <- test_set[idx, which(colnames(test_set)==dt_long_vars[idx])]
-  }
-
-  dt_long$fval <- as.numeric(dt_long$fval)
-  dt_long[, fval := lapply(.SD, range01), by = variable, .SDcols = "fval"]
+  # Remove specified columns
+  dt_long[, c("response", "prob_positive_class", "prediction_correctness") := NULL]
+  # Rename columns
+  names(dt_long)[names(dt_long) == "variable"] <- "feature"
+  names(dt_long)[names(dt_long) == "value"] <- "Phi"
 
+  # Merge the two dataframes
+  dt_long <- merge(dt_long, shap_Mean_long, by = c("sample_num", "feature", "Phi"))
+  print(dt_long)
   ############## SHAP plots for clusters
   shap_plot1 <- dt_long %>%
-    mutate(feature = forcats::fct_reorder(variable, mean_absolute_shap)) %>%
-    ggplot(aes(x = feature, y = value, color = fval)) +
+    mutate(feature = forcats::fct_reorder(feature, mean_phi)) %>%
+    ggplot(aes(x = feature, y = Phi, color = f_val))+
     geom_violin(colour = "grey") +
-    geom_line(aes(group = sample_num), alpha = 0.1, size = 0.2) +
+    geom_line(aes(group = sample_num), alpha = 0.1,size=0.2) +
     coord_flip() +
-    geom_jitter(aes(shape = factor(prediction_correctness, levels = c(FALSE, TRUE), labels = c("Incorrect","Correct"))), alpha = 0.6, size = 1.5, position = position_jitter(width = 0.2, height = 0)) +
-    # geom_jitter(aes(shape = factor(prediction_correctness)), alpha = 0.6, size = 1, position = position_jitter(width = 0.2, height = 0)) +
+    geom_jitter(aes(shape=correct_prediction, text = paste("Feature: ", feature,
+                                                           "<br>Unscaled feature value: ", unscaled_f_val,
+                                                           "<br>SHAP value: ", Phi,
+                                                           "<br>Prediction correctness: ", correct_prediction,
+                                                           "<br>Predicted probability: ", pred_prob,
+                                                           "<br>Predicted class: ", pred_class)),
+                alpha = 0.6, size=1.5, position=position_jitter(width=0.2, height=0)) +
     scale_shape_manual(values = c(4, 19)) +  # 19 for correct predictions (circle), 4 for incorrect predictions (cross)
-    # labs(shape = "Prediction Correctness") +
     labs(shape = "model prediction") +
     scale_colour_gradient2(low = "blue", mid = "green", high = "red", midpoint = 0.5, breaks = c(0, 1), labels = c("Low", "High")) +
     geom_text(aes(x = feature, y = -Inf, label = ""), hjust = -0.2, alpha = 0.7, color = "black") +
@@ -207,19 +211,19 @@ SHAPclust <- function(task,
     theme(text = element_text(size = 8, family = "Helvetica"), panel.border = element_blank(),
           panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(),
           axis.line = element_line(colour = "grey"), legend.key.width = grid::unit(2, "mm")) +
-    ylim(min(dt_long$value) - 0.05, max(dt_long$value) + 0.05) +
+    ylim(min(dt_long$Phi) - 0.05, max(dt_long$Phi) + 0.05) +
     guides(
       shape = ggplot2::guide_legend(color = "black")
     )
 
   shap_plot_onerow <- shap_plot1 + facet_wrap(~ cluster, ncol = num_of_clusters)
 
-  shap_plot_onerow <- ggplotly(shap_plot_onerow)
+  shap_plot_onerow <- ggplotly(shap_plot_onerow, tooltip="text")
 
   CM_plt <- list()
   # Create a tibble for each cluster and calculate the confusion matrix for each cluster
   for (i in 1:num_of_clusters) {
-    d_binomial <- tibble("Truth" = shap_Mean_wide_kmeans_forCM$truth[which(shap_Mean_wide_kmeans_forCM$cluster==i)],
+    d_binomial <- tibble::tibble("Truth" = shap_Mean_wide_kmeans_forCM$truth[which(shap_Mean_wide_kmeans_forCM$cluster==i)],
                          "Prediction" = shap_Mean_wide_kmeans_forCM$response[which(shap_Mean_wide_kmeans_forCM$cluster==i)])
     cvms::confusion_matrix(targets = d_binomial$Truth, predictions = d_binomial$Prediction)
     # basic_table <- table(d_binomial)
@@ -228,7 +232,7 @@ SHAPclust <- function(task,
 
     cm_tbl <- data.frame(matrix(nrow = 4, ncol = 3))
     colnames(cm_tbl) <- c("Target", "Prediction", "N")
-    cm_tbl <- as_tibble(cm_tbl)
+    cm_tbl <- tibble::as_tibble(cm_tbl)
     cm_tbl[1:2,1] <- levels(d_binomial$Truth)[1]
     cm_tbl[3:4,1] <- levels(d_binomial$Truth)[2]
     cm_tbl[1,2] <- levels(d_binomial$Truth)[1]
 
@@ -71,7 +71,7 @@ eCM_plot <- function(task,
   featset_total_test <- as.data.frame(featset_total_test)
   pred_results <- trained_model$predict(task, splits$test)
   # plot confusion matrix
-  d_binomial <- tibble("Truth" = featset_total_test[, task$target_names],
+  d_binomial <- tibble::tibble("Truth" = featset_total_test[, task$target_names],
                        "Prediction" = pred_results$response)
   basic_table <- table(d_binomial)
   cfm <- tibble::as_tibble(basic_table)
@@ -84,7 +84,7 @@ eCM_plot <- function(task,
                                               palette = "Oranges",
                                               label = "Total",
                                               tc_tile_border_color = "black"
-                                             ))
+                                             )) + ggtitle("Confusion matrix for the test set")
   CM_plt_test[["labels"]][["x"]] <- 'Truth (observation)'
   CM_plt_test[["labels"]][["y"]] <- 'Prediction (model output)'
   CM_plt_test[["theme"]][["text"]][["size"]] <- 9
@@ -95,7 +95,7 @@ eCM_plot <- function(task,
   featset_total_train <- mydata[splits$train,]
   featset_total_train <- as.data.frame(featset_total_train)
   pred_results <- trained_model$predict(task, splits$train)
-  d_binomial <- tibble("Truth" = featset_total_train[, task$target_names],
+  d_binomial <- tibble::tibble("Truth" = featset_total_train[, task$target_names],
                        "Prediction" = pred_results$response)
   basic_table <- table(d_binomial)
   # cfm <- broom::tidy(basic_table)
@@ -109,17 +109,12 @@ eCM_plot <- function(task,
                                                 palette = "Oranges",
                                                 label = "Total",
                                                 tc_tile_border_color = "black"
-                                              ))
+                                              )) + ggtitle("Confusion matrix for the train set")
   CM_plt_train[["labels"]][["x"]] <- 'Truth (observation)'
   CM_plt_train[["labels"]][["y"]] <- 'Prediction (model output)'
   CM_plt_train[["theme"]][["text"]][["size"]] <- 9
   CM_plt_train[["theme"]][["axis.text"]][["size"]] <- 9
   # CM_plt_train[["theme"]][["text"]][["family"]] <- 'Helvetica'
-
-  CM_plt_both <- egg::ggarrange(CM_plt_train,
-                                CM_plt_test,
-                                labels = c("train set", "test set"),
-                                nrow = 1,
-                                ncol = 2)
-  return(CM_plt_both)
+  # Return a list containing both plots
+  return(list(train_set = CM_plt_train, test_set = CM_plt_test))
 }
@@ -73,56 +73,60 @@ eSHAP_plot <- function(task,
                        sample.size = 30,
                        seed = 246,
                        subset = 1) {
-
-  # utils::globalVariables(c("feature", "sample_num", "correct_prediction"))
+  cluster <- NULL
+  correct_prediction <- NULL
   feature <- NULL
+  f_val <- NULL
+  fval <- NULL
+  mean_absolute_shap <- NULL
+  mean_phi <- NULL
+  Phi <- NULL
+  pred_class <- NULL
+  pred_prob <- NULL
+  prediction_correctness <- NULL
+  response <- NULL
   sample_num <- NULL
-  correct_prediction <- NULL
-  # library(ggplot2)
+  truth <- NULL
+  unscaled_f_val <- NULL
+
+  set.seed(seed) # set seed for reproducibility
   mydata <- task$data()
   mydata <- as.data.frame(mydata)
   X <- mydata[which(names(mydata[splits$train,]) != task$target_names)]
   model <- iml::Predictor$new(trained_model, data = X, y = mydata[, task$target_names])
-  # randomly subset the target variable and the corresponding rows
-  if (subset < 1) {
-    set.seed(seed) # set seed for reproducibility
-    n <- round(subset * length(splits$test))
-    target_index <- sample(splits$test, size = n, replace = FALSE)
-    mydata <- mydata[target_index, ]
-
-    # do the prediction for the test set
-    pred_results <- trained_model$predict(task,target_index)
-
-  } else {
-    mydata <- mydata[splits$test, ]
 
-    # do the prediction for the test set
-    pred_results <- trained_model$predict(task,splits$test)
-
-  }
+  # randomly subset the target variable and the corresponding rows
+  n <- round(subset * length(splits$test))
+  target_index <- sample(splits$test, size = n, replace = FALSE)
+  mydata <- mydata[target_index, ]
+  # do the prediction for the test set
+  pred_results <- trained_model$predict(task,target_index)
 
   # the test set based on the data split is used to calculate SHAP values
   test_set <- as.data.frame(mydata)
   feature_names <- colnames(X)
   nfeats <- length(feature_names)
 
+  # print(pred_results)
+  # print(pred_results$prob)
+  # save the predicted probability for the positive class
+  pred_prob <- pred_results$prob[,1]
 
-
-  # save the predicted probability for the positive class (assuming with have a binary classification task)
-  pred_prob <- pred_results$prob[,2]
   # mark which samples were correctly predicted and which samples were not
   predicted_correct <- mydata$Class==pred_results$response
 
-  test_set.nolab <- mydata
+  # test_set.nolab <- mydata
   # initialize the results list.
   shap_values <- vector("list", nrow(test_set))
   for (i in seq_along(shap_values)) {
-    set.seed(seed)
-    shap_values[[i]] <- iml::Shapley$new(model, x.interest = test_set[i,feature_names],
+    # set.seed(seed)
+    shap_values[[i]] <- iml::Shapley$new(model,
+                                         x.interest = test_set[i,feature_names],
                                          sample.size = sample.size)$results
     shap_values[[i]]$sample_num <- i  # identifier to track our instances.
     shap_values[[i]]$predcorrectness <- predicted_correct[i]
     shap_values[[i]]$pred_prob <- pred_prob[i]
+    shap_values[[i]]$pred_class <- pred_results$response[i]
   }
   data_shap_values <- dplyr::bind_rows(shap_values)  # collapse the list.
 
@@ -134,6 +138,7 @@ eSHAP_plot <- function(task,
   f_val_lst <- rep(0,nfeats)
   indiv_correctness <- rep(0,nfeats)
   pred_prob_rep <- rep(0,nfeats)
+  pred_class_rep <- rep(0,nfeats)
 
   feature_values <- gsub(".*=",'',shap$feature.value)
   shap$feature.value <- as.numeric(feature_values)
@@ -143,49 +148,63 @@ eSHAP_plot <- function(task,
     f_val_lst[i] = list(feature_values[seq(i,nrow(shap),nfeats)])
     indiv_correctness[i] = list(shap$predcorrectness[seq(i,nrow(shap),nfeats)])
     pred_prob_rep[i] = list(shap$pred_prob[seq(i,nrow(shap),nfeats)])
+    pred_class_rep[i] = list(shap$pred_class[seq(i,nrow(shap),nfeats)])
   }
 
-
   # test_set.nolab[,task$target_names:=NULL]
-  test_set.nolab[,task$target_names] <- NULL
+  mydata[,task$target_names] <- NULL
   # get the column names of the data frame
-  cols <- colnames(test_set.nolab)
+  cols <- colnames(mydata)
 
   # loop through each column
   for (col in cols) {
     # check if the column is numeric
-    if (!is.numeric(test_set.nolab[[col]])) {
+    if (!is.numeric(mydata[[col]])) {
       # convert non-numeric columns to numeric
-      test_set.nolab[[col]] <- as.numeric(test_set.nolab[[col]])
+      mydata[[col]] <- as.numeric(mydata[[col]])
     }
   }
+  # store feature values
+  unscaled_f_val_lst <- f_val_lst
 
   # apply transformation for visualization
   for (i in 1:length(f_val_lst)){
-    f_val_lst[[i]] <- range01(test_set.nolab[,i])
+    unscaled_f_val_lst[[i]] <- mydata[,i] # not scaled
+    f_val_lst[[i]] <- range01(mydata[,i]) # normalization
   }
 
+  (unscaled_f_val = as.numeric(unlist(unscaled_f_val_lst)))
   (f_val = as.numeric(unlist(f_val_lst)))
   (Phi = unlist(indiv_phi))
 
   shap_Mean <- data.table::data.table(feature=rep(feature_names,each=total_reps),
                                       mean_phi = rep(mean_phi,each=total_reps),
                                       Phi = Phi,
                                       f_val = f_val,
+                                      unscaled_f_val = unscaled_f_val,
                                       sample_num = rep(1:nrow(test_set),length(feature_names)),
                                       correct_prediction = unlist(indiv_correctness),
-                                      pred_prob = unlist(pred_prob_rep))
+                                      pred_prob = unlist(pred_prob_rep),
+                                      pred_class = unlist(pred_class_rep))
 
   shap_Mean_wide <- data.table::dcast(shap_Mean, sample_num ~ feature, value.var="Phi")
 
   shap_Mean$correct_prediction <- factor(shap_Mean$correct_prediction, levels = c(FALSE, TRUE), labels = c("Incorrect","Correct"))
+
+
   shap_plot <- shap_Mean %>%
     mutate(feature = forcats::fct_reorder(feature, mean_phi)) %>%
     ggplot(aes(x = feature, y = Phi, color = f_val))+
     geom_violin(colour = "grey") +
-    geom_line(aes(group = sample_num), alpha = 0.1,size=0.2) +
+    geom_line(aes(group = sample_num), alpha = 0.1, size=0.2) +
     coord_flip() +
-    geom_jitter(alpha = 0.6,size=1.5, position=position_jitter(width=0.2, height=0),aes(shape=correct_prediction)) +
+    geom_jitter(aes(shape=correct_prediction, text = paste("Feature: ", feature,
+                                                           "<br>Unscaled feature value: ", unscaled_f_val,
+                                                           "<br>SHAP value: ", Phi,
+                                                           "<br>Prediction correctness: ", correct_prediction,
+                                                           "<br>Predicted probability: ", pred_prob,
+                                                           "<br>Predicted class: ", pred_class)),
+                alpha = 0.6, size=1.5, position=position_jitter(width=0.2, height=0)) +
     scale_shape_manual(values=c(4, 19), guide = FALSE)+
     # scale_color_manual(values=c("black","grey")) +
     labs(shape = "model prediction") +
@@ -209,8 +228,16 @@ eSHAP_plot <- function(task,
           legend.key.width = grid::unit(2,"mm")) +
     ylim(min(shap_Mean$Phi)-0.05, max(shap_Mean$Phi)+0.05)
 
+  # Convert ggplot to Plotly
+  shap_plot <- ggplotly(shap_plot, tooltip="text")
 
-  shap_plot <- ggplotly(shap_plot)
+  # Additional plot to show SHAP values vs. predicted probabilities
+  shap_pred_plot <- shap_Mean %>%
+    ggplot(aes(x = Phi, y = pred_prob, shape=pred_class)) +
+    geom_point() +
+    geom_smooth(method = "loess", se = FALSE) +
+    labs(x = "SHAP value", y = "Predicted probability") +
+    theme_minimal()
 
-  return(list(shap_plot, shap_Mean_wide, shap_Mean, shap))
+  return(list(shap_plot, shap_Mean_wide, shap_Mean, shap, shap_pred_plot))
 }