Skip to content

Commit 4763dc8

Browse files
Merge pull request cdcepi#2093 from reichlab/2025-05-03-UMass-trends_ensemble
2025 05 03 u mass trends ensemble
2 parents 5dfd123 + c7cdbf1 commit 4763dc8

File tree

6 files changed

+359012
-30881
lines changed

6 files changed

+359012
-30881
lines changed
Binary file not shown.

target-data/get_target_data_hubverse.R

Lines changed: 41 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
# Generate Hubverse-formatted target data for the FluSight Forecast hub.
44
#
55
# USAGE
6-
# Rscript "get_target_data_hubverse.R" as_of include_after
6+
# Rscript "get_target_data_hubverse.R" as_of oracle_include_after
77
#
88
# ARGUMENTS
99
# as_of: Optional "YYYY-MM-DD" string.
1010
# If provided, read archived target data instead of the latest version.
11-
# include_after: Optional "YYYY-MM-DD" string.
12-
# Exclude target data dated on or earlier than this date. Defaults to "2024-11-01".
11+
# oracle_include_after: Optional "YYYY-MM-DD" string.
12+
# Oracle output is generated only for target data dated after this date. Defaults to "2024-10-31" (i.e., data from 2024-11-01 onward).
1313
# Note: the modeling tasks defined in tasks.json were updated for the 2024-2025 flu
14-
# season, so don't run this script with an include_after date before 2024-11-01.
14+
# season, so don't run this script with an oracle_include_after date before 2024-11-01.
1515
#
1616
# EXAMPLE
1717
# Generate Hubverse target data based on the latest available FluSight target-hospital-admissions.csv
@@ -49,7 +49,7 @@ get_location_data <- function() {
4949
#' @param target_data_path Path to the target data directory.
5050
#' @param as_of Optional "YYYY-MM-DD" string. If provided, read archived target data instead of the latest version.
5151
#' @return A dataframe
52-
get_base_target_data <- function(include_after, target_data_path, as_of = NULL) {
52+
get_base_target_data <- function(target_data_path, as_of = NULL) {
5353
# if as_of is not provided, read the latest target data
5454
if (is.null(as_of) || is.na(as_of)) {
5555
get_latest <- TRUE
@@ -135,18 +135,30 @@ create_time_series_target_data <- function(weekly_data, location_data) {
135135
}
136136

137137
#' @description
138-
#' `create_oracle_output_target_data` creates Hubverse-formatted oracle output
139-
#' target data.
138+
#' `create_oracle_output_target_data` uses Hubverse-formatted time series target data to generate
139+
#' a corresponding set of oracle output data. If the time series target data doesn't contain
140+
#' any records with a target_end_date greater than the specified oracle_include_after date,
141+
#' return an empty oracle output dataframe.
140142
#'
141143
#' @param time_series_target Dataframe with the time series target data.
144+
#' @param oracle_include_after "YYYY-MM-DD" string. Start date for oracle output target data.
142145
#' @return A dataframe
143-
create_oracle_output_target_data <- function(time_series_target) {
146+
create_oracle_output_target_data <- function(time_series_target, oracle_include_after) {
147+
oracle_output_cols <- c(
148+
"target", "location", "horizon", "target_end_date", "output_type", "output_type_id", "oracle_value", "as_of")
149+
150+
# Filter time series rows to those that match the oracle_include_after criteria
151+
time_series_target <- time_series_target[time_series_target$target_end_date > oracle_include_after, ]
152+
if (nrow(time_series_target) == 0) {
153+
empty_oracle_output <- data.frame(matrix(ncol = length(oracle_output_cols), nrow = 0))
154+
colnames(empty_oracle_output) <- oracle_output_cols
155+
return(empty_oracle_output)
156+
}
157+
144158
oracle_output_wk_inc <- create_oracle_output_wk_inc(time_series_target)
145159
oracle_output_rate_change <- calc_oracle_output_rate_change(time_series_target)
146160

147161
oracle_output <- dplyr::bind_rows(oracle_output_wk_inc, oracle_output_rate_change)
148-
oracle_output_cols <- c(
149-
"target", "location", "horizon", "target_end_date", "output_type", "output_type_id", "oracle_value", "as_of")
150162
oracle_output <- oracle_output[oracle_output_cols]
151163

152164
oracle_output
@@ -487,14 +499,14 @@ run_target_data_tests <- function() {
487499
#' by the FluSight Forecast hub.
488500
#'
489501
#' @param as_of Optional "YYYY-MM-DD" string. If provided, read archived target data instead of the latest version.
490-
#' @param include_after "YYYY-MM_DD" string. Base target data dated on or earlier will be excluded.
502+
#' @param oracle_include_after "YYYY-MM-DD" string. Oracle output is generated only for target data dated after this date.
491503
#' @param target_data_path Path to the target data directory.
492504
#' @return NULL
493-
create_target_data <- function(as_of = NULL, include_after = "2024-11-01", target_data_path) {
505+
create_target_data <- function(as_of = NULL, oracle_include_after = "2024-11-01", target_data_path) {
494506
# Validate input params
495507
tryCatch(
496-
as.Date(include_after, format = "%Y-%m-%d"),
497-
error = function(e) stop(paste0("Invalid date format for include_after. Please use 'YYYY-MM-DD': ", include_after))
508+
as.Date(oracle_include_after, format = "%Y-%m-%d"),
509+
error = function(e) stop(paste0("Invalid date format for oracle_include_after. Please use 'YYYY-MM-DD': ", oracle_include_after))
498510
)
499511
if (!is.null(as_of)) {
500512
tryCatch(
@@ -504,15 +516,12 @@ create_target_data <- function(as_of = NULL, include_after = "2024-11-01", targe
504516
}
505517

506518
# Where we'll save things
507-
time_series_path <- file.path(target_data_path, "time-series")
508-
time_series_file <- file.path(time_series_path, "time-series.csv")
509-
oracle_output_path <- file.path(target_data_path, "oracle-output")
510-
oracle_output_file <- file.path(oracle_output_path, "oracle-output.csv")
519+
time_series_file <- file.path(target_data_path, "time-series.csv")
520+
oracle_output_file <- file.path(target_data_path, "oracle-output.csv")
511521

512-
# Get original target data from FluSight hub and filter using include_after
522+
# Get original target data from FluSight hub
513523
location_data <- get_location_data()
514524
weekly_data_all <- get_base_target_data(target_data_path = target_data_path, as_of = as_of)
515-
weekly_data_all <- weekly_data_all[weekly_data_all$date > include_after, ]
516525
as_of <- weekly_data_all$as_of[1]
517526

518527
# Specify sort order for target data files (not absolutely necessary, but helps human readability and diffs)
@@ -532,7 +541,7 @@ create_target_data <- function(as_of = NULL, include_after = "2024-11-01", targe
532541
updated_time_series <- arrange_cols(updated_time_series, time_series_col_order)
533542

534543
# Create oracle output data
535-
oracle_output_target <- create_oracle_output_target_data(time_series_target)
544+
oracle_output_target <- create_oracle_output_target_data(time_series_target, oracle_include_after)
536545
oracle_output_target <- arrange_cols(oracle_output_target, oracle_col_order)
537546

538547
# Re-order the target-data columns to reflect the files' sort order
@@ -542,11 +551,8 @@ create_target_data <- function(as_of = NULL, include_after = "2024-11-01", targe
542551
dplyr::select(all_of(oracle_col_order), everything())
543552

544553
# Write updated target data files
545-
if (!dir.exists(time_series_path)) {
546-
dir.create(time_series_path, recursive = TRUE)
547-
}
548-
if (!dir.exists(oracle_output_path)) {
549-
dir.create(oracle_output_path, recursive = TRUE)
554+
if (!dir.exists(target_data_path)) {
555+
dir.create(target_data_path, recursive = TRUE)
550556
}
551557

552558
tryCatch({
@@ -566,12 +572,14 @@ create_target_data <- function(as_of = NULL, include_after = "2024-11-01", targe
566572

567573
args <- commandArgs(trailingOnly = TRUE)
568574
as_of <- args[1]
569-
include_after <- args[2]
570-
571-
# if include_after date is not provided, default to the beginning
572-
# of the 2024-2025 flu season
573-
if (is.na(include_after) || is.null(include_after)) {
574-
include_after <- "2024-11-01"
575+
# next line is for testing
576+
oracle_include_after <- args[2]
577+
578+
# If oracle_include_after date is not provided, default to the beginning
579+
# of the 2024-2025 flu season (note: mandatory reporting was reinstated as
580+
# of 2024-11-01)
581+
if (is.na(oracle_include_after) || is.null(oracle_include_after)) {
582+
oracle_include_after <- "2024-10-31"
575583
}
576584

577585
# Run tests
@@ -586,4 +594,4 @@ if (isTRUE(all(ok))) {
586594

587595
# Create the target data
588596
target_data_path <- file.path(here::here(), "target-data")
589-
create_target_data(as_of = as_of, include_after = include_after, target_data_path = target_data_path)
597+
create_target_data(as_of = as_of, oracle_include_after = oracle_include_after, target_data_path = target_data_path)

0 commit comments

Comments
 (0)