-
Notifications
You must be signed in to change notification settings - Fork 116
/
Copy pathhaven-sas.R
206 lines (183 loc) · 7.36 KB
/
haven-sas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#' Read SAS files
#'
#' `read_sas()` supports both sas7bdat files and the accompanying sas7bcat files
#' that SAS uses to record value labels.
#'
#' @param data_file,catalog_file Path to data and catalog files. The files are
#' processed with [readr::datasource()].
#' @param encoding,catalog_encoding The character encoding used for the
#' `data_file` and `catalog_encoding` respectively. A value of `NULL` uses the
#' encoding specified in the file; use this argument to override it if it is
#' incorrect.
#' @inheritParams tibble::as_tibble
#' @param col_select One or more selection expressions, like in
#' [dplyr::select()]. Use `c()` or `list()` to use more than one expression.
#' See `?dplyr::select` for details on available selection options. Only the
#' specified columns will be read from `data_file`.
#' @param skip Number of lines to skip before reading data.
#' @param n_max Maximum number of lines to read.
#' @param cols_only `r lifecycle::badge("deprecated")` `cols_only` is no longer
#' supported; use `col_select` instead.
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable. It is
#' not printed on the console, but the RStudio viewer will show it.
#'
#' `write_sas()` returns the input `data` invisibly.
#' @export
#' @examples
#' path <- system.file("examples", "iris.sas7bdat", package = "haven")
#' read_sas(path)
read_sas <- function(data_file, catalog_file = NULL,
encoding = NULL, catalog_encoding = encoding,
col_select = NULL, skip = 0L, n_max = Inf, cols_only = deprecated(),
.name_repair = "unique") {
if (lifecycle::is_present(cols_only)) {
lifecycle::deprecate_warn("2.2.0", "read_sas(cols_only)", "read_sas(col_select)")
stopifnot(is.character(cols_only)) # used to only work with a char vector
# guarantee a quosure to keep NULL and tidyselect logic clean downstream
col_select <- quo(c(!!!cols_only))
} else {
col_select <- enquo(col_select)
}
if (is.null(encoding)) {
encoding <- ""
}
cols_skip <- skip_cols(read_sas, !!col_select, data_file, encoding = encoding)
n_max <- validate_n_max(n_max)
spec_data <- readr::datasource(data_file)
if (is.null(catalog_file)) {
spec_cat <- list()
} else {
spec_cat <- readr::datasource(catalog_file)
}
switch(class(spec_data)[1],
source_file = df_parse_sas_file(spec_data, spec_cat, encoding = encoding, catalog_encoding = catalog_encoding, cols_skip = cols_skip, n_max = n_max, rows_skip = skip, name_repair = .name_repair),
source_raw = df_parse_sas_raw(spec_data, spec_cat, encoding = encoding, catalog_encoding = catalog_encoding, cols_skip = cols_skip, n_max = n_max, rows_skip = skip, name_repair = .name_repair),
cli_abort("This kind of input is not handled.")
)
}
#' Write SAS files
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' `write_sas()` creates sas7bdat files. Unfortunately the SAS file format is
#' complex and undocumented, so `write_sas()` is unreliable and in most cases
#' SAS will not read files that it produces.
#'
#' [write_xpt()] writes files in the open SAS transport format, which has
#' limitations but will be reliably read by SAS.
#'
#' @param data Data frame to write.
#' @param path Path to file where the data will be written.
#' @keywords internal
#' @export
write_sas <- function(data, path) {
lifecycle::deprecate_warn("2.5.2", "write_sas()", "write_xpt()")
validate_sas(data)
data_out <- adjust_tz(data)
write_sas_(data_out, normalizePath(path, mustWork = FALSE))
invisible(data)
}
#' Read and write SAS transport files
#'
#' The SAS transport format is a open format, as is required for submission
#' of the data to the FDA.
#'
#' @inheritParams read_spss
#' @return A tibble, data frame variant with nice defaults.
#'
#' Variable labels are stored in the "label" attribute of each variable.
#' It is not printed on the console, but the RStudio viewer will show it.
#'
#' If a dataset label is defined, it will be stored in the "label" attribute
#' of the tibble.
#'
#' `write_xpt()` returns the input `data` invisibly.
#' @export
#' @examples
#' tmp <- tempfile(fileext = ".xpt")
#' write_xpt(mtcars, tmp)
#' read_xpt(tmp)
read_xpt <- function(file, col_select = NULL, skip = 0, n_max = Inf, .name_repair = "unique") {
cols_skip <- skip_cols(read_xpt, {{ col_select }}, file)
n_max <- validate_n_max(n_max)
spec <- readr::datasource(file)
switch(class(spec)[1],
source_file = df_parse_xpt_file(spec, cols_skip, n_max, skip, name_repair = .name_repair),
source_raw = df_parse_xpt_raw(spec, cols_skip, n_max, skip, name_repair = .name_repair),
cli_abort("This kind of input is not handled.")
)
}
#' @export
#' @rdname read_xpt
#' @param version Version of transport file specification to use: either 5 or 8.
#' @param name Member name to record in file. Defaults to file name sans
#' extension. Must be <= 8 characters for version 5, and <= 32 characters
#' for version 8.
#' @param label Dataset label to use, or `NULL`. Defaults to the value stored in
#' the "label" attribute of `data`.
#'
#' Note that although SAS itself supports dataset labels up to 256 characters
#' long, dataset labels in SAS transport files must be <= 40 characters.
#' @param adjust_tz Stata, SPSS and SAS do not have a concept of time zone,
#' and all [date-time] variables are treated as UTC. `adjust_tz` controls
#' how the timezone of date-time values is treated when writing.
#'
#' * If `TRUE` (the default) the timezone of date-time values is ignored, and
#' they will display the same in R and Stata/SPSS/SAS, e.g.
#' `"2010-01-01 09:00:00 NZDT"` will be written as `"2010-01-01 09:00:00"`.
#' Note that this changes the underlying numeric data, so use caution if
#' preserving between-time-point differences is critical.
#' * If `FALSE`, date-time values are written as the corresponding UTC value,
#' e.g. `"2010-01-01 09:00:00 NZDT"` will be written as
#' `"2009-12-31 20:00:00"`.
write_xpt <- function(data, path, version = 8, name = NULL, label = attr(data, "label"), adjust_tz = TRUE) {
if (!version %in% c(5, 8)) {
cli_abort("SAS transport file version {.val {version}} is not currently supported.")
}
if (is.null(name)) {
name <- tools::file_path_sans_ext(basename(path))
}
name <- validate_xpt_name(name, version)
label <- validate_xpt_label(label)
data_out <- validate_sas(data)
if (isTRUE(adjust_tz)) {
data_out <- adjust_tz(data_out)
}
write_xpt_(
data_out,
normalizePath(path, mustWork = FALSE),
version = version,
name = name,
label = label
)
invisible(data)
}
# Validation --------------------------------------------------------------
validate_sas <- function(data) {
stopifnot(is.data.frame(data))
invisible(data)
}
validate_xpt_name <- function(name, version, call = caller_env()) {
if (version == 5) {
if (nchar(name) > 8) {
cli_abort("{.arg name} must be 8 characters or fewer.", call = call)
}
} else {
if (nchar(name) > 32) {
cli_abort("{.arg name} must be 32 characters or fewer.", call = call)
}
}
name
}
validate_xpt_label <- function(label, call = caller_env()) {
if (!is.null(label)) {
stopifnot(is.character(label), length(label) == 1)
if (nchar(label) > 40) {
cli_abort("{.arg label} must be 40 characters or fewer.", call = call)
}
}
label
}