|
| 1 | +#!/usr/bin/env Rscript |
| 2 | + |
| 3 | +# Goal: determine the correlation between |
| 4 | +# course satisfaction and confidence outcomes |
| 5 | +# |
| 6 | +# |
| 7 | +# | confidence |
| 8 | +# | |
| 9 | +# | |
| 10 | +# | |
| 11 | +# +----------------- |
| 12 | +# satisfaction |
| 13 | +# |
| 14 | + |
| 15 | + |
#' Create a table of simulated survey values
#'
#' The table holds a confidence column, a satisfaction column that is drawn
#' independently of it, and a satisfaction column that is exactly twice
#' the confidence.
#'
#' @param n number of rows to simulate
#' @return a tibble with the columns `average_confidence`,
#'   `random_satisfactions` and `correlated_satisfactions`
create_test_values <- function(n = 50) {
  confidences <- runif(n = n, min = 0.0, max = 5.0)
  satisfactions <- runif(n = n, min = 1.0, max = 10.0)
  tibble::tibble(
    average_confidence = confidences,
    random_satisfactions = satisfactions,
    correlated_satisfactions = confidences * 2
  )
}
# The simulated table must have more than one row
testthat::expect_gt(nrow(create_test_values()), 1)
| 26 | + |
#' Determine whether the two columns of a table are correlated
#'
#' @param t a table with exactly two columns
#' @param alpha_value significance level the p value is compared against
#' @return the result of comparing the p value reported by
#'   `correlation::correlation` with `alpha_value`: `TRUE` when the
#'   p value is the smaller one
are_correlated <- function(t, alpha_value = 0.05) {
  testthat::expect_equal(ncol(t), 2)
  p_value <- correlation::correlation(t)$p
  p_value < alpha_value
}
| 33 | + |
# The random columns are drawn independently of each other, yet at the
# default alpha of 0.05 about one run in twenty would still call them
# correlated by chance. A much stricter alpha keeps this check from
# failing spuriously.
testthat::expect_false(
  are_correlated(
    t = create_test_values() |>
      dplyr::select(average_confidence, random_satisfactions),
    alpha_value = 1e-6
  )
)
# The correlated column is an exact multiple of the confidence
testthat::expect_true(
  are_correlated(
    t = create_test_values() |>
      dplyr::select(average_confidence, correlated_satisfactions)
  )
)
| 44 | + |
| 45 | +# Find the tables |
| 46 | + |
#' Get the filenames of all CSV files
#'
#' @param path folder to search; its subfolders are searched as well
#' @return the paths, each starting with `path`, of all files
#'   whose name ends in '.csv'
get_all_csv_filenames <- function(path = "..") {
  list.files(
    path = path,
    # The pattern is a regular expression: a bare "csv" would also match
    # names such as 'csv_notes.txt', so anchor it to the file extension
    pattern = "\\.csv$",
    recursive = TRUE,
    full.names = TRUE
  )
}
# Every filename found must refer to an existing file
get_all_csv_filenames() |>
  file.exists() |>
  all() |>
  testthat::expect_true()
| 52 | + |
#' Get the filenames of all relevant CSV files
#'
#' Starts from all CSV files and leaves out the helper CSVs, as well as
#' the evaluations that have no confidences.
#'
#' @return the filenames that remain
get_csv_filenames <- function() {
  filenames <- get_all_csv_filenames()

  # Remove the helper CSVs
  helper_patterns <- c(
    "average_confidences",
    "survey_start",
    "initial_knowledge",
    "confidences_course"
  )
  for (helper_pattern in helper_patterns) {
    filenames <- stringr::str_subset(filenames, helper_pattern, negate = TRUE)
  }

  # Remove evaluations without confidences
  evaluations_without_confidences <- c(
    "20230523/evaluation_20230523.csv",
    "20231201/evaluation_20231201.csv"
  )
  for (evaluation in evaluations_without_confidences) {
    # Only verify the claim when 'has_confidence' is available
    if (exists("has_confidence")) {
      testthat::expect_false(has_confidence(paste0("../", evaluation)))
    }
    filenames <- stringr::str_subset(filenames, evaluation, negate = TRUE)
  }
  filenames
}
# Every relevant filename must refer to an existing file
get_csv_filenames() |>
  file.exists() |>
  all() |>
  testthat::expect_true()
| 75 | + |
#' Check that a file has a column with the rating of the training event
#'
#' @param csv_filename path to a CSV file
#' @return `TRUE` if at least one column name contains the text
#'   'how would you rate this training event', else `FALSE`
has_satisfaction <- function(csv_filename) {
  column_names <- names(readr::read_csv(csv_filename, show_col_types = FALSE))
  any(
    stringr::str_detect(column_names, "how would you rate this training event")
  )
}
# Every relevant CSV file must have a satisfaction column
invisible(
  lapply(
    get_csv_filenames(),
    function(csv_filename) {
      testthat::expect_true(has_satisfaction(csv_filename))
    }
  )
)
| 86 | + |
#' Check that a file has a column related to the confidences
#'
#' @param csv_filename path to a CSV file
#' @return `TRUE` if at least one column name contains the text 'I can',
#'   else `FALSE`
has_confidence <- function(csv_filename) {
  column_names <- names(readr::read_csv(csv_filename, show_col_types = FALSE))
  any(stringr::str_detect(column_names, "I can"))
}
# These two evaluations must have no confidence column ...
evaluations_without_confidences <- c(
  "../20230523/evaluation_20230523.csv",
  "../20231201/evaluation_20231201.csv"
)
for (csv_filename in evaluations_without_confidences) {
  testthat::expect_false(has_confidence(csv_filename))
}
# ... and every relevant CSV file must have one
invisible(
  lapply(
    get_csv_filenames(),
    function(csv_filename) {
      testthat::expect_true(has_confidence(csv_filename))
    }
  )
)
| 99 | + |
#' Get the satisfaction ratings in a CSV file
#'
#' @param csv_filename path to a CSV file that has exactly one column
#'   whose name contains 'how would you rate this training event'
#' @return the values of that column, one per row of the file
get_satisfactions <- function(csv_filename) {
  testthat::expect_true(has_satisfaction(csv_filename))
  responses <- readr::read_csv(csv_filename, show_col_types = FALSE)
  # Use the same text that 'has_satisfaction' checks for: searching for
  # only "rate" would also select unrelated columns, for example one
  # that contains the word 'integrate'
  col_name <- stringr::str_subset(
    names(responses),
    "how would you rate this training event"
  )
  testthat::expect_equal(length(col_name), 1)
  satisfactions <- responses[[col_name]]
  testthat::expect_true(length(satisfactions) > 0)
  satisfactions
}
# Show the satisfactions per file and check that all are in range.
# Read each file once: the function is called 'get_satisfactions',
# a call to 'get_satisfaction' fails as no such function exists.
for (csv_filename in get_csv_filenames()) {
  satisfactions <- get_satisfactions(csv_filename)
  message(csv_filename, ": ", paste(satisfactions, collapse = " "))
  testthat::expect_true(all(satisfactions >= 1.0))
  testthat::expect_true(all(satisfactions <= 10.0))
}
| 114 | + |
#' Get the average confidence of each row in a CSV file
#'
#' Every column whose name contains 'I can' is used. The answers in these
#' columns are converted to a score from 0 to 5, after which the scores
#' are averaged per row. Sessions that were not attended are left out
#' of the average.
#'
#' @param csv_filename path to a CSV file with at least one column
#'   whose name contains 'I can'
#' @return a numeric vector with one average per row of the file
get_average_confidence <- function(csv_filename) {
  testthat::expect_true(has_confidence(csv_filename))
  responses <- readr::read_csv(csv_filename, show_col_types = FALSE)
  col_names <- stringr::str_subset(names(responses), "I can")

  # The score for each of the possible answers
  scores <- c(
    "I can absolutely do this!" = 5,
    "I have good confidence I can do this" = 4,
    "I have some confidence I can do this" = 3,
    "I have low confidence I can do this" = 2,
    "I have no confidence I can do this" = 1,
    "I don't know even what this is about ...?" = 0,
    "I did not attend that session" = NA
  )
  # Convert one column of answers to scores. Values that are not a known
  # answer are passed to 'as.numeric' unchanged, so numbers stay numbers
  # and any other text becomes NA
  to_scores <- function(answers) {
    for (answer in names(scores)) {
      answers <- replace(answers, answers == answer, scores[[answer]])
    }
    as.numeric(answers)
  }

  confidences <- responses |>
    dplyr::select(dplyr::all_of(col_names)) |>
    dplyr::mutate(dplyr::across(dplyr::everything(), to_scores))

  # NA scores are skipped. A row without any score averages to NaN,
  # which is rejected by the check below
  average_confidence <- unname(
    rowMeans(as.matrix(confidences), na.rm = TRUE)
  )

  testthat::expect_equal(sum(is.na(average_confidence)), 0)
  average_confidence
}
# Show the average confidences per file and check that all are in range
for (csv_filename in get_csv_filenames()) {
  average_confidences <- get_average_confidence(csv_filename)
  message(csv_filename, ": ", paste(average_confidences, collapse = " "))
  # The answers are scored from 0 up to and including 5,
  # so the averages must be in that range too
  testthat::expect_true(all(average_confidences >= 0.0))
  testthat::expect_true(all(average_confidences <= 5.0))
}
| 149 | + |
# Per CSV file: put the satisfactions next to the average confidences,
# report if these are correlated, then combine the tables of all files
csv_filenames <- get_csv_filenames()
list_of_tables <- lapply(
  csv_filenames,
  function(csv_filename) {
    message(csv_filename)
    satisfactions <- get_satisfactions(csv_filename)
    average_confidences <- get_average_confidence(csv_filename)
    testthat::expect_equal(length(satisfactions), length(average_confidences))
    table_of_file <- tibble::tibble(
      satisfaction = satisfactions,
      average_confidence = average_confidences
    )
    message(are_correlated(table_of_file))
    table_of_file
  }
)
t <- dplyr::bind_rows(list_of_tables)
| 166 | + |
| 167 | + |
# The p value of the correlation over all files combined
results <- correlation::correlation(t)
p_value <- results$p

# The R squared of a linear fit of confidence on satisfaction
model <- lm(average_confidence ~ satisfaction, data = t)
r_squared <- summary(model)$r.squared

# Store the plot instead of leaving it as a bare top-level expression:
# a bare plot is printed to the default graphics device when the script
# runs, and saving would then depend on the implicit 'last plot'
correlation_plot <- ggplot2::ggplot(
  t,
  ggplot2::aes(x = satisfaction, y = average_confidence)
) +
  ggplot2::geom_jitter(width = 0.01, height = 0.01) +
  # An explicit formula avoids the message about the formula being used
  ggplot2::geom_smooth(method = "lm", formula = y ~ x) +
  ggplot2::labs(
    title = "Correlation between course satisfaction and average confidence",
    caption = paste0(
      "n: ", nrow(t), ", ",
      "p value: ", round(p_value, digits = 5), ", ",
      "R squared: ", round(100.0 * r_squared, digits = 1), "%"
    )
  )
ggplot2::ggsave(
  "correlation.png",
  plot = correlation_plot,
  width = 7,
  height = 7
)
0 commit comments