diff --git a/.Rbuildignore b/.Rbuildignore index b58c7010..83f299f3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,3 +18,5 @@ ^tests/\.lintr$ ^File_management$ ^simstudy\.code-workspace$ +^codemeta\.json$ +^paper$ diff --git a/DESCRIPTION b/DESCRIPTION index a6d5f824..f27ce078 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: simstudy Title: Simulation of Study Data -Version: 0.2.1.9000 -Date: 2020-10-07 +Version: 0.2.2 +Date: 2020-10-26 Authors@R: c(person(given = "Keith", family = "Goldfeld", diff --git a/NEWS.md b/NEWS.md index cdabcceb..00142e5e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,5 @@ -# simstudy (development version) +# simstudy 0.2.2 +* Improve documentation and vignettes. # simstudy 0.2.1 * Add 'backports' for compatibility with R < 4.0 diff --git a/R/add_correlated_data.R b/R/add_correlated_data.R index c2535ee9..e4960ce5 100644 --- a/R/add_correlated_data.R +++ b/R/add_correlated_data.R @@ -292,13 +292,15 @@ addCorFlex <- function(dt, defs, rho = 0, tau = NULL, corstr = "cs", #' @param method Two methods are available to generate correlated data. (1) "copula" uses #' the multivariate Gaussian copula method that is applied to all other distributions; this #' applies to all available distributions. (2) "ep" uses an algorithm developed by -#' Emrich and Piedmonte. +#' Emrich and Piedmonte (1991). #' @param formSpec The formula (as a string) that was used to generate the binary #' outcome in the `defDataAdd` statement. This is only necessary when method "ep" is #' requested. #' @param periodvar A string value that indicates the name of the field that indexes #' the repeated measurement for an individual unit. The value defaults to "period". #' @return Original data.table with added column(s) of correlated data +#' @references Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional +#' Multivariate Binary Variates. The American Statistician 1991;45:302-4. 
#' @examples #' # Wide example #' diff --git a/R/generate_correlated_data.R b/R/generate_correlated_data.R index 12f113d8..e906b016 100644 --- a/R/generate_correlated_data.R +++ b/R/generate_correlated_data.R @@ -250,10 +250,12 @@ genCorFlex <- function(n, defs, rho = 0, tau = NULL, corstr = "cs", corMatrix = #' @param method Two methods are available to generate correlated data. (1) "copula" uses #' the multivariate Gaussian copula method that is applied to all other distributions; this #' applies to all available distributions. (2) "ep" uses an algorithm developed by -#' Emrich and Piedmonte. +#' Emrich and Piedmonte (1991). #' @param idname Character value that specifies the name of the id variable. #' #' @return data.table with added column(s) of correlated data +#' @references Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional +#' Multivariate Binary Variates. The American Statistician 1991;45:302-4. #' @examples #' set.seed(23432) #' l <- c(8, 10, 12) diff --git a/README.Rmd b/README.Rmd index 870fc26f..9295bcf2 100644 --- a/README.Rmd +++ b/README.Rmd @@ -16,6 +16,7 @@ knitr::opts_chunk$set( [![R build status](https://github.com/kgoldfeld/simstudy/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/kgoldfeld/simstudy/actions){target="_blank"} [![CRAN status](https://www.r-pkg.org/badges/version/simstudy)](https://CRAN.R-project.org/package=simstudy){target="_blank"} +[![status](https://joss.theoj.org/papers/640fd4333948933b2817343e86df3424/status.svg)](https://joss.theoj.org/papers/640fd4333948933b2817343e86df3424){target="_blank"} [![CRAN downloads](https://cranlogs.r-pkg.org/badges/grand-total/simstudy)](https://CRAN.R-project.org/package=simstudy){target="_blank"} [![codecov](https://codecov.io/gh/kgoldfeld/simstudy/branch/main/graph/badge.svg)](https://codecov.io/gh/kgoldfeld/simstudy){target="_blank"} [![Lifecycle: 
stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://www.tidyverse.org/lifecycle/#stable){target="_blank"} @@ -25,7 +26,8 @@ The `simstudy` package is a collection of functions that allow users to generate Simulation using `simstudy` has two fundamental steps. The user (1) **defines** the data elements of a data set and (2) **generates** the data based on these definitions. Additional functionality exists to simulate observed or randomized **treatment assignment/exposures**, to create **longitudinal/panel** data, to create **multi-level/hierarchical** data, to create datasets with **correlated variables** based on a specified covariance structure, to **merge** datasets, to create data sets with **missing** data, and to create non-linear relationships with underlying **spline** curves. -The overarching philosophy of `simstudy` is to create data generating processes that mimic the typical models used to fit those types of data. So, the parameterization of some of the data generating processes may not follow the standard parameterizations for the specific distributions. For example, in `simstudy` *gamma*-distributed data are generated based on the specification of a mean μ (or log(μ)) and a dispersion $d$, rather than shape α and rate β parameters that more typically characterize the *gamma* distribution. When we estimate the parameters, we are modeling μ (or some function of μ), so we should explicitly recover the `simstudy` parameters used to generate the model, thus illuminating the relationship between the underlying data generating processes and the models. +The overarching philosophy of `simstudy` is to create data generating processes that mimic the typical models used to fit those types of data. So, the parameterization of some of the data generating processes may not follow the standard parameterizations for the specific distributions. 
For example, in `simstudy` *gamma*-distributed data are generated based on the specification of a mean μ (or log(μ)) and a dispersion $d$, rather than shape α and rate β parameters that more typically characterize the *gamma* distribution. When we estimate the parameters, we are modeling μ (or some function of μ), so we should explicitly recover the `simstudy` parameters used to generate the model, thus illuminating the relationship between the underlying data generating processes and the models. For more details on the +package, use cases, examples, and function reference see the [documentation page](https://kgoldfeld.github.io/simstudy/articles/simstudy.html). ## Installation diff --git a/README.md b/README.md index 612590e8..92eb5b4c 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ simstudy status](https://github.com/kgoldfeld/simstudy/workflows/R-CMD-check/badge.svg?branch=main)](https://github.com/kgoldfeld/simstudy/actions) [![CRAN status](https://www.r-pkg.org/badges/version/simstudy)](https://CRAN.R-project.org/package=simstudy) +[![status](https://joss.theoj.org/papers/640fd4333948933b2817343e86df3424/status.svg)](https://joss.theoj.org/papers/640fd4333948933b2817343e86df3424) [![CRAN downloads](https://cranlogs.r-pkg.org/badges/grand-total/simstudy)](https://CRAN.R-project.org/package=simstudy) [![codecov](https://codecov.io/gh/kgoldfeld/simstudy/branch/main/graph/badge.svg)](https://codecov.io/gh/kgoldfeld/simstudy) @@ -48,7 +49,9 @@ typically characterize the *gamma* distribution. When we estimate the parameters, we are modeling μ (or some function of μ), so we should explicitly recover the `simstudy` parameters used to generate the model, thus illuminating the relationship between the underlying data -generating processes and the models. +generating processes and the models. For more details on the package, +use cases, examples, and function reference see the [documentation +page](https://kgoldfeld.github.io/simstudy/articles/simstudy.html). 
## Installation @@ -83,16 +86,16 @@ dd <- trtAssign(dd, nTrt = 4, grpName = "grp", balanced = TRUE) dd #> id x y grp #> 1: 1 11.191960 8.949389 4 -#> 2: 2 10.418375 7.372060 2 -#> 3: 3 8.512109 6.925844 4 +#> 2: 2 10.418375 7.372060 4 +#> 3: 3 8.512109 6.925844 3 #> 4: 4 11.361632 9.850340 4 -#> 5: 5 9.928811 6.515463 2 +#> 5: 5 9.928811 6.515463 4 #> --- -#> 246: 246 8.220609 7.898416 4 -#> 247: 247 8.531483 8.681783 4 -#> 248: 248 10.507370 8.552350 4 +#> 246: 246 8.220609 7.898416 2 +#> 247: 247 8.531483 8.681783 2 +#> 248: 248 10.507370 8.552350 3 #> 249: 249 8.621339 6.652300 1 -#> 250: 250 9.508164 7.083845 4 +#> 250: 250 9.508164 7.083845 3 ``` ## Contributing & Support diff --git a/codemeta.json b/codemeta.json new file mode 100644 index 00000000..74a969f8 --- /dev/null +++ b/codemeta.json @@ -0,0 +1,372 @@ +{ + "@context": [ + "https://doi.org/10.5063/schema/codemeta-2.0", + "http://schema.org" + ], + "@type": "SoftwareSourceCode", + "identifier": "simstudy", + "description": "Simulates data sets in order to explore modeling\n techniques or better understand data generating processes. The user\n specifies a set of relationships between covariates, and generates\n data based on these specifications. The final data sets can represent\n data from randomized control trials, repeated measure (longitudinal)\n designs, and cluster randomized trials. 
Missingness can be generated\n using various mechanisms (MCAR, MAR, NMAR).", + "name": "simstudy: Simulation of Study Data", + "codeRepository": "https://github.com/kgoldfeld/simstudy", + "relatedLink": [ + "https://kgoldfeld.github.io/simstudy/", + "https://kgoldfeld.github.io/simstudy/dev/", + "https://CRAN.R-project.org/package=simstudy" + ], + "issueTracker": "https://github.com/kgoldfeld/simstudy/issues", + "license": "https://spdx.org/licenses/GPL-3.0", + "version": "0.2.2", + "programmingLanguage": { + "@type": "ComputerLanguage", + "name": "R", + "url": "https://r-project.org" + }, + "runtimePlatform": "R version 4.0.2 (2020-06-22)", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "author": [ + { + "@type": "Person", + "givenName": "Keith", + "familyName": "Goldfeld", + "email": "Keith.Goldfeld@nyumc.org", + "@id": "https://orcid.org/0000-0002-0292-8780" + }, + { + "@type": "Person", + "givenName": "Jacob", + "familyName": "Wujciak-Jens", + "email": "jacob@wujciak.de", + "@id": "https://orcid.org/0000-0002-7281-3989" + } + ], + "contributor": {}, + "copyrightHolder": {}, + "funder": {}, + "maintainer": [ + { + "@type": "Person", + "givenName": "Keith", + "familyName": "Goldfeld", + "email": "Keith.Goldfeld@nyumc.org", + "@id": "https://orcid.org/0000-0002-0292-8780" + } + ], + "softwareSuggestions": [ + { + "@type": "SoftwareApplication", + "identifier": "covr", + "name": "covr", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=covr" + }, + { + "@type": "SoftwareApplication", + "identifier": "dplyr", + "name": "dplyr", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + 
"url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=dplyr" + }, + { + "@type": "SoftwareApplication", + "identifier": "formatR", + "name": "formatR", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=formatR" + }, + { + "@type": "SoftwareApplication", + "identifier": "gee", + "name": "gee", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=gee" + }, + { + "@type": "SoftwareApplication", + "identifier": "ggplot2", + "name": "ggplot2", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=ggplot2" + }, + { + "@type": "SoftwareApplication", + "identifier": "grid", + "name": "grid" + }, + { + "@type": "SoftwareApplication", + "identifier": "gridExtra", + "name": "gridExtra", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=gridExtra" + }, + { + "@type": "SoftwareApplication", + "identifier": "hedgehog", + "name": "hedgehog", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=hedgehog" + }, + { + "@type": "SoftwareApplication", + "identifier": "knitr", + "name": "knitr", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R 
Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=knitr" + }, + { + "@type": "SoftwareApplication", + "identifier": "magrittr", + "name": "magrittr", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=magrittr" + }, + { + "@type": "SoftwareApplication", + "identifier": "Matrix", + "name": "Matrix", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=Matrix" + }, + { + "@type": "SoftwareApplication", + "identifier": "mgcv", + "name": "mgcv", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=mgcv" + }, + { + "@type": "SoftwareApplication", + "identifier": "ordinal", + "name": "ordinal", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=ordinal" + }, + { + "@type": "SoftwareApplication", + "identifier": "pracma", + "name": "pracma", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=pracma" + }, + { + "@type": "SoftwareApplication", + "identifier": "rmarkdown", + "name": "rmarkdown", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": 
"https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=rmarkdown" + }, + { + "@type": "SoftwareApplication", + "identifier": "scales", + "name": "scales", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=scales" + }, + { + "@type": "SoftwareApplication", + "identifier": "splines", + "name": "splines" + }, + { + "@type": "SoftwareApplication", + "identifier": "survival", + "name": "survival", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=survival" + }, + { + "@type": "SoftwareApplication", + "identifier": "testthat", + "name": "testthat", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=testthat" + } + ], + "softwareRequirements": [ + { + "@type": "SoftwareApplication", + "identifier": "R", + "name": "R", + "version": ">= 3.3.0" + }, + { + "@type": "SoftwareApplication", + "identifier": "data.table", + "name": "data.table", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=data.table" + }, + { + "@type": "SoftwareApplication", + "identifier": "glue", + "name": "glue", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=glue" + }, + { + "@type": "SoftwareApplication", + 
"identifier": "methods", + "name": "methods" + }, + { + "@type": "SoftwareApplication", + "identifier": "mvnfast", + "name": "mvnfast", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=mvnfast" + }, + { + "@type": "SoftwareApplication", + "identifier": "mvtnorm", + "name": "mvtnorm", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=mvtnorm" + }, + { + "@type": "SoftwareApplication", + "identifier": "Rcpp", + "name": "Rcpp", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=Rcpp" + }, + { + "@type": "SoftwareApplication", + "identifier": "backports", + "name": "backports", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=backports" + } + ], + "releaseNotes": "https://github.com/kgoldfeld/simstudy/blob/main/NEWS.md", + "readme": "https://github.com/kgoldfeld/simstudy/blob/main/README.md", + "fileSize": "8057.013KB", + "contIntegration": "https://codecov.io/gh/kgoldfeld/simstudy", + "developmentStatus": "https://www.tidyverse.org/lifecycle/#stable", + "keywords": [ + "r", + "data-simulation", + "data-generation", + "simulation", + "statistical-models" + ] +} diff --git a/man/addCorGen.Rd b/man/addCorGen.Rd index 2ab7babf..18b09285 100644 --- a/man/addCorGen.Rd +++ b/man/addCorGen.Rd @@ -57,7 +57,7 @@ represents the column.} \item{method}{Two methods are available to generate 
correlated data. (1) "copula" uses the multivariate Gaussian copula method that is applied to all other distributions; this applies to all available distributions. (2) "ep" uses an algorithm developed by -Emrich and Piedmonte.} +Emrich and Piedmonte (1991).} \item{formSpec}{The formula (as a string) that was used to generate the binary outcome in the `defDataAdd` statement. This is only necessary when method "ep" is @@ -190,4 +190,8 @@ dg <- addCorGen(dx, periodvar = "period" ) } +\references{ +Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional +Multivariate Binary Variates. The American Statistician 1991;45:302-4. +} \concept{correlated} diff --git a/man/genCorGen.Rd b/man/genCorGen.Rd index 70e909a0..a1db6775 100644 --- a/man/genCorGen.Rd +++ b/man/genCorGen.Rd @@ -57,7 +57,7 @@ represents the column.} \item{method}{Two methods are available to generate correlated data. (1) "copula" uses the multivariate Gaussian copula method that is applied to all other distributions; this applies to all available distributions. (2) "ep" uses an algorithm developed by -Emrich and Piedmonte.} +Emrich and Piedmonte (1991).} \item{idname}{Character value that specifies the name of the id variable.} } @@ -100,4 +100,8 @@ genCorGen(1000, corMatrix = genCorMat(3), method = "copula" ) } +\references{ +Emrich LJ, Piedmonte MR. A Method for Generating High-Dimensional +Multivariate Binary Variates. The American Statistician 1991;45:302-4. 
+} \concept{correlated} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 00000000..23e7e245 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,108 @@ +--- +title: 'simstudy: Illuminating research methods through data generation' +tags: + - R + - statistics + - data-simulation + - statistical-models + - data-generation +authors: + - name: Keith Goldfeld + orcid: 0000-0002-0292-8780 + affiliation: 1 + - name: Jacob Wujciak-Jens + orcid: 0000-0002-7281-3989 + affiliation: 2 +affiliations: + - name: NYU Grossman School of Medicine + index: 1 + - name: Independent Researcher + index: 2 +date: 18 October 2020 +bibliography: simstudy.bib +--- + +# Summary + +The `simstudy` package is a collection of functions for R [@rcoreteam2020] that +allow users to generate simulated data sets in order to explore modeling +techniques or better understand data generating processes. The user defines the +distributions of individual variables, specifies relationships between +covariates and outcomes, and generates data based on these specifications. The +final data sets can represent randomized control trials, repeated measure +designs, cluster-randomized trials, or naturally observed data processes. Many other +complexities can be added, including survival data, correlated data, factorial +study designs, stepped-wedge designs, and missing data processes. + +Simulation using `simstudy` has two fundamental steps. The user (1) **defines** +the data elements of a data set and (2) **generates** the data based on these +definitions. Additional functionality exists to simulate observed or randomized +**treatment assignment/exposures**, to create **longitudinal/panel** data, to +create **multi-level/hierarchical** data, to create datasets with **correlated +variables** based on a specified covariance structure, to **merge** datasets, to +create data sets with **missing** data, and to create non-linear relationships +with underlying **spline** curves.
+ +The overarching philosophy of `simstudy` is to create data generating processes +that mimic the typical models used to fit those types of data. So, the +parameterization of some of the data generating processes may not follow the +standard parameterizations for the specific distributions. For example, in +`simstudy` *gamma*-distributed data are generated based on the specification of +a mean $\mu$ (or $\log(\mu)$) and a dispersion $d$, rather than shape $\alpha$ +and rate $\beta$ parameters that more typically characterize the *gamma* +distribution. When we estimate the parameters, we are modeling $\mu$ (or some +function of $\mu$), so we should explicitly recover the `simstudy` parameters +used to generate the model - illuminating the relationship between the +underlying data generating processes and the models. For more details on the +package, use cases, examples, and function reference see the [documentation page](https://kgoldfeld.github.io/simstudy/articles/simstudy.html). + +`simstudy` is available on [CRAN](https://cran.r-project.org/package=simstudy) +and can be installed with: + +``` r +install.packages("simstudy") +``` + +Alternatively, the newest development version can be installed from [GitHub](https://github.com/) with: + +``` r +# install.packages("devtools") +devtools::install_github("kgoldfeld/simstudy") +``` + +# Statement of need + +Empiricism and statistical analysis are cornerstones of scientific research +but they can lead us astray if used incorrectly. Choosing the right methodology for the +hypothesis and expected data is crucial for useful, valid results. Data +simulated with `simstudy` under the assumptions derived from a hypothesis +enable researchers to test and refine their analysis methodologies without the +need for time-intensive, expensive pre-tests or collection of actual data. Additionally, data generated with `simstudy` can be used in generalized, theoretical simulation studies to further the field of methodology.
+ +There are several R packages that allow for data generation under different +assumptions. Most of these packages have a narrower scope that focuses on +a specific class of data, like `ICCbin` [@hossain2017], `BinNonNor` +[@inan2020], and `genSurv` [@meira-machado2014]. Some do not seem to be actively +maintained [@hofert2016;@chan2014;@alfons2010;@bien2016], which can cause +compatibility issues. Some target specific fields of study and their needs, like the +psychology-focused `psych` package [@revelle2020] or the `conjurer` package +[@macherla2020] that provides methods to generate synthetic customer data for +industry use. `simstudy` is unique in its philosophy of data generating +processes that mimic the models used in analysis and in allowing a wide range of complex data to be generated through these processes. The `SimDesign` package +[@chalmers2020] and the related `MonteCarlo` package [@leschinski2019] follow a +similar line of thought but focus on easy replication of the analyses and on providing summaries of simulated data. + +`simstudy` has been used in a variety of fields for theoretical exploration of +research methodology +[@anderson2019;@kirasich2018;@krzykalla2020;@liu2019;@nickodem2020;@thoya2018;@wang2020;@elalili2020], +power calculation for trials [@wei2019], and other simulation tasks supporting +researchers +[@forthun2020;@horry2020;@renson2017;@chukwu2019]. + +# Acknowledgements + +We acknowledge contributions from James Balamuta, Michael Bradley, and Gertjan +Verhoeven. For the generation of multivariate binary data, the algorithm by +@emrich1991 is used.
+ +# References \ No newline at end of file diff --git a/paper/simstudy.bib b/paper/simstudy.bib new file mode 100644 index 00000000..8d8e0160 --- /dev/null +++ b/paper/simstudy.bib @@ -0,0 +1,278 @@ + +@article{alfons2010, + title = {An Object-Oriented Framework for Statistical Simulation: {{The R}} Package {{simFrame}}}, + author = {Alfons, Andreas and Templ, Matthias and Filzmoser, Peter}, + date = {2010}, + journaltitle = {Journal of Statistical Software}, + volume = {37}, + pages = {1--36}, + doi = {10.18637/jss.v037.i03}, + url = {http://www.jstatsoft.org/v37/i03/}, + number = {3} +} + +@article{anderson2019, + title = {Enhancing Quantitative Theory-Testing Entrepreneurship Research}, + author = {Anderson, Brian S and Wennberg, Karl and McMullen, Jeffery S}, + date = {2019}, + journaltitle = {Journal of Business Venturing}, + volume = {34}, + pages = {105928}, + publisher = {{Elsevier}}, + doi = {10.1016/j.jbusvent.2019.02.001}, + number = {5} +} + +@article{bien2016, + title = {The Simulator: {{An}} Engine to Streamline Simulations}, + author = {Bien, Jacob}, + date = {2016}, + journaltitle = {Submitted}, + url = {http://faculty.bscb.cornell.edu/~bien/simulator.pdf}, + keywords = {⛔ No DOI found} +} + +@article{chalmers2020, + title = {Writing Effective and Reliable {{Monte Carlo}} Simulations with the {{SimDesign}} Package}, + author = {Chalmers, R. 
Philip and Adkins, Mark C.}, + date = {2020}, + journaltitle = {The Quantitative Methods for Psychology}, + volume = {16}, + pages = {248--280}, + doi = {10.20982/tqmp.16.4.p248}, + number = {4} +} + +@manual{chan2014, + title = {Ezsim: Provide an Easy to Use Framework to Conduct Simulation}, + author = {Chan, TszKin Julian}, + date = {2014}, + url = {https://CRAN.R-project.org/package=ezsim}, + type = {manual} +} + +@article{chukwu2019, + title = {Safety Constraint Optimization of Combination Drug Therapy in Hypertension Clinical Trials}, + author = {Chukwu, Victor}, + date = {2019}, + url = {https://digitalcommons.georgiasouthern.edu/etd/2007/}, + keywords = {⛔ No DOI found} +} + +@article{elalili2020, + title = {Taking the Analysis of Trial-Based Economic Evaluations to the next Level: {{The}} Importance of Accounting for Clustering}, + author = {El Alili, Mohamed and van Dongen, Johanna M and Goldfeld, Keith S and Heymans, Martijn W and van Tulder, Maurits W and Bosmans, Judith E}, + date = {2020}, + journaltitle = {PharmacoEconomics}, + pages = {1--15}, + publisher = {{Springer}}, + doi = {10.1007/s40273-020-00946-y}, + options = {useprefix=true} +} + +@article{emrich1991, + title = {A {{Method}} for {{Generating High}}-{{Dimensional Multivariate Binary Variates}}}, + author = {Emrich, Lawrence J. 
and Piedmonte, Marion R.}, + date = {1991-11}, + journaltitle = {The American Statistician}, + shortjournal = {The American Statistician}, + volume = {45}, + pages = {302--304}, + issn = {0003-1305, 1537-2731}, + doi = {10.1080/00031305.1991.10475828}, + langid = {english}, + number = {4} +} + +@article{forthun2020, + title = {Parental Education and the Risk of Cerebral Palsy for Children: An Evaluation of Causality}, + author = {Forthun, Ingeborg and Lie, Rolv Terje and Strandberg-Larsen, Katrine and Solheim, Magne Haugland and Moster, Dag and Wilcox, Allen J and Mortensen, Laust Hvas and Tollånes, Mette C}, + date = {2020}, + journaltitle = {Developmental Medicine \& Child Neurology}, + publisher = {{Wiley Online Library}}, + doi = {10.1111/dmcn.14552} +} + +@article{hofert2016, + title = {Parallel and Other Simulations in {{R}} Made Easy: {{An}} End-to-End Study}, + author = {Hofert, Marius and Mächler, Martin}, + date = {2016}, + journaltitle = {Journal of Statistical Software}, + volume = {69}, + pages = {1--44}, + doi = {10.18637/jss.v069.i04}, + number = {4} +} + +@article{horry2020, + title = {“{{Only}} Your First Yes Will Count”: {{The}} Impact of Pre-Lineup Instructions on Sequential Lineup Decisions}, + author = {Horry, Ruth and Fitzgerald, Ryan J and Mansour, Jamal K}, + date = {2020}, + journaltitle = {Journal of Experimental Psychology: Applied}, + publisher = {{American Psychological Association}}, + doi = {10.31234/osf.io/59uaq} +} + +@manual{hossain2017, + title = {{{ICCbin}}: {{Facilitates}} Clustered Binary Data Generation, and Estimation of Intracluster Correlation Coefficient ({{ICC}}) for Binary Data}, + author = {Hossain, Akhtar and Chakraborty, Hrishikesh}, + date = {2017}, + url = {https://CRAN.R-project.org/package=ICCbin}, + type = {manual} +} + +@manual{inan2020, + title = {{{BinNonNor}}: {{Data}} Generation with Binary and Continuous Non-Normal Components}, + author = {Inan, Gul and Demirtas, Hakan and Gao, Ran}, + date = {2020}, + url 
= {https://CRAN.R-project.org/package=BinNonNor}, + type = {manual} +} + +@article{kirasich2018, + title = {Random Forest vs Logistic Regression: {{Binary}} Classification for Heterogeneous Datasets}, + author = {Kirasich, Kaitlin and Smith, Trace and Sadler, Bivin}, + date = {2018}, + journaltitle = {SMU Data Science Review}, + volume = {1}, + pages = {9}, + url = {https://scholar.smu.edu/datasciencereview/vol1/iss3/9/}, + keywords = {⛔ No DOI found}, + number = {3} +} + +@article{krzykalla2020, + title = {Exploratory Identification of Predictive Biomarkers in Randomized Trials with Normal Endpoints}, + author = {Krzykalla, Julia and Benner, Axel and Kopp-Schneider, Annette}, + date = {2020}, + journaltitle = {Statistics in Medicine}, + volume = {39}, + pages = {923--939}, + publisher = {{Wiley Online Library}}, + doi = {10.1002/sim.8452}, + number = {7} +} + +@manual{leschinski2019, + title = {{{MonteCarlo}}: {{Automatic}} Parallelized Monte Carlo Simulations}, + author = {Leschinski, Christian Hendrik}, + date = {2019}, + url = {https://CRAN.R-project.org/package=MonteCarlo}, + type = {manual} +} + +@article{liu2019, + title = {Missing Data in Marginal Structural Models: {{A}} Plasmode Simulation Study Comparing Multiple Imputation and Inverse Probability Weighting}, + author = {Liu, Shao-Hsien and Chrysanthopoulou, Stavroula A and Chang, Qiuzhi and Hunnicutt, Jacob N and Lapane, Kate L}, + date = {2019}, + journaltitle = {Medical care}, + volume = {57}, + pages = {237}, + publisher = {{NIH Public Access}}, + doi = {10.1097/MLR.0000000000001063}, + number = {3} +} + +@manual{macherla2020, + title = {Conjurer: {{A}} Parametric Method for Generating Synthetic Data}, + author = {Macherla, Sidharth}, + date = {2020}, + url = {https://CRAN.R-project.org/package=conjurer}, + type = {manual} +} + +@article{meira-machado2014, + title = {A {{Simulation Study Comparing Modeling Approaches}} in an {{Illness}}-{{Death Multi}}-{{State Model}}}, + author = {Meira-Machado, 
Luís and Faria, Susana}, + date = {2014-01}, + journaltitle = {Communications in Statistics - Simulation and Computation}, + shortjournal = {Communications in Statistics - Simulation and Computation}, + volume = {43}, + pages = {929--946}, + issn = {0361-0918, 1532-4141}, + doi = {10.1080/03610918.2012.718841}, + url = {http://www.tandfonline.com/doi/abs/10.1080/03610918.2012.718841}, + urldate = {2020-10-18}, + langid = {english}, + number = {5} +} + +@article{nickodem2020, + title = {Use of Aggregated Covariates in Propensity Score Analysis of Clustered Data}, + author = {Nickodem, Kyle}, + date = {2020}, + url = {https://conservancy.umn.edu/handle/11299/216126}, + keywords = {⛔ No DOI found} +} + +@manual{rcoreteam2020, + title = {R: {{A}} Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + date = {2020}, + location = {{Vienna, Austria}}, + url = {https://www.R-project.org/}, + organization = {{R Foundation for Statistical Computing}}, + type = {manual} +} + +@article{renson2017, + title = {Lack of Insurance Is Associated with Lower Probability of Diagnostic Imaging Use among {{US}} Trauma Patients: {{An}} Instrumental Variable Analysis and Simulation}, + author = {Renson, Audrey and Schubert, Finn D and Bjurlin, Marc A}, + date = {2017}, + journaltitle = {bioRxiv}, + pages = {215889}, + publisher = {{Cold Spring Harbor Laboratory}}, + doi = {10.1101/215889} +} + +@manual{revelle2020, + title = {Psych: {{Procedures}} for Psychological, Psychometric, and Personality Research}, + author = {Revelle, William}, + date = {2020}, + location = {{Evanston, Illinois}}, + url = {https://CRAN.R-project.org/package=psych}, + organization = {{Northwestern University}}, + type = {manual} +} + +@manual{song2019, + title = {{{CorBin}}: {{Generate}} High-Dimensional Binary Data with Correlation Structures}, + author = {Song, Shuang and Jiang, Wei and Hou, Lin and Zhao, Hongyu}, + date = {2019}, + url = {https://CRAN.R-project.org/package=CorBin}, + 
type = {manual} +} + +@article{thoya2018, + title = {Evaluating Methods of Assessing Optimism in Regression Models}, + author = {Thoya, Daniel and Waititu, Antony and Magheto, Thomas and Ngunyi, Antony}, + date = {2018}, + journaltitle = {Am. J. Appl. Math. Stat}, + volume = {6}, + pages = {126--134}, + keywords = {⛔ No DOI found} +} + +@article{wang2020, + title = {Improved Empirical Likelihood Inference and Variable Selection for Generalized Linear Models with Longitudinal Nonignorable Dropouts}, + author = {Wang, Lei and Ma, Wei}, + date = {2020}, + journaltitle = {Annals of the Institute of Statistical Mathematics}, + pages = {1--25}, + publisher = {{Springer}}, + doi = {10.1007/s10463-020-00761-4} +} + +@article{wei2019, + title = {Protocol for a Randomised Controlled Trial to Evaluate the Effectiveness of Improving Tuberculosis Patients’ Treatment Adherence via Electronic Monitors and an App versus Usual Care in {{Tibet}}}, + author = {Wei, Xiaolin and Hicks, Joseph Paul and Pasang, Pande and Zhang, Zhitong and Haldane, Victoria and Liu, Xiaoqiu and Yin, Tingting and Wang, Lixia and Shi, Dachun and Ge, Shiliang and others}, + date = {2019}, + journaltitle = {Trials}, + volume = {20}, + pages = {273}, + publisher = {{Springer}}, + doi = {10.1186/s13063-019-3364-x}, + number = {1} +} + +