Skip to content

Commit 20762c5

Browse files
authored
Merge pull request #62 from ropensci/split_table
Split table
2 parents bad8a4c + 2ed454e commit 20762c5

32 files changed

+1215
-95
lines changed

.Rbuildignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@
1313
^codecov.yml$
1414
^LICENSE.md$
1515
^\.httr-oauth$
16+
^doc$
17+
^Meta$

.github/workflows/check_on_different_r_os.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ jobs:
1818
fail-fast: false
1919
matrix:
2020
config:
21-
- {os: macOS-latest, r: 'devel'}
2221
- {os: macOS-latest, r: 'release'}
2322
- {os: windows-latest, r: 'release'}
23+
- {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
2424
- {os: ubuntu-16.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
2525

2626
env:

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@
55
inst/doc
66
docs
77
.httr-oauth
8+
doc
9+
Meta

DESCRIPTION

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: git2rdata
22
Title: Store and Retrieve Data.frames in a Git Repository
3-
Version: 0.2.2
3+
Version: 0.3.0
44
Authors@R:
55
c(person(given = "Thierry",
66
family = "Onkelinx",
@@ -66,5 +66,6 @@ Collate:
6666
'recent_commit.R'
6767
'reexport.R'
6868
'relabel.R'
69+
'rename_variable.R'
6970
'upgrade_data.R'
7071
'utils.R'

NAMESPACE

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ S3method(recent_commit,git_repository)
3333
S3method(relabel,data.frame)
3434
S3method(relabel,default)
3535
S3method(relabel,list)
36+
S3method(rename_variable,character)
37+
S3method(rename_variable,default)
38+
S3method(rename_variable,git_repository)
3639
S3method(rm_data,character)
3740
S3method(rm_data,default)
3841
S3method(rm_data,git_repository)
@@ -53,6 +56,7 @@ export(push)
5356
export(read_vc)
5457
export(recent_commit)
5558
export(relabel)
59+
export(rename_variable)
5660
export(repository)
5761
export(rm_data)
5862
export(status)
@@ -67,7 +71,6 @@ importFrom(assertthat,noNA)
6771
importFrom(git2r,add)
6872
importFrom(git2r,commit)
6973
importFrom(git2r,hash)
70-
importFrom(git2r,hashfile)
7174
importFrom(git2r,last_commit)
7275
importFrom(git2r,odb_blobs)
7376
importFrom(git2r,pull)
@@ -77,6 +80,7 @@ importFrom(git2r,status)
7780
importFrom(git2r,workdir)
7881
importFrom(methods,setOldClass)
7982
importFrom(stats,setNames)
83+
importFrom(utils,file_test)
8084
importFrom(utils,packageVersion)
8185
importFrom(utils,read.table)
8286
importFrom(utils,write.table)

NEWS.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
# git2rdata 0.3.0
2+
3+
## New features
4+
5+
* `write_vc()` gains an optional `split_by` argument.
6+
See `vignette("split_by")` for more details.
7+
* `rename_variable()` efficiently renames variables in a stored `git2rdata`
8+
object.
9+
10+
## Bugfixes
11+
12+
* `read_vc()`, `is_git2rdata()` and `is_git2rmeta()` now yield a better message
13+
when both the data and metadata are missing.
14+
115
# git2rdata 0.2.2
216

317
* Use the [checklist](https://inbo.github.io/checklist) package for CI.

R/datahash.R

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,40 @@
88
#' @family internal
99
#' @importFrom assertthat assert_that
1010
#' @importFrom git2r hash
11+
#' @importFrom utils file_test
1112
datahash <- function(file) {
12-
chunk_size <- 1e4
13-
hashes <- character(chunk_size + 1)
14-
i <- 0
15-
rawdata <- scan(
16-
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
17-
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
18-
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
19-
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
20-
encoding = "UTF-8", skipNul = FALSE
21-
)
22-
while (length(rawdata)) {
23-
hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
24-
i <- i + 1
25-
if (i %% chunk_size == 0) {
26-
hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
27-
}
13+
if (file_test("-f", file)) {
14+
chunk_size <- 1e4
15+
hashes <- character(chunk_size + 1)
16+
i <- 0
2817
rawdata <- scan(
2918
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
3019
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
3120
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
3221
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
3322
encoding = "UTF-8", skipNul = FALSE
3423
)
24+
while (length(rawdata)) {
25+
hashes[1 + i %% chunk_size] <- hash(paste(hash(rawdata), collapse = "\n"))
26+
i <- i + 1
27+
if (i %% chunk_size == 0) {
28+
hashes[chunk_size + 1] <- hash(paste(hashes, collapse = "")) # nocov
29+
}
30+
rawdata <- scan(
31+
file = file, what = character(), nmax = -1, sep = "\n", quote = "",
32+
skip = i * chunk_size, nlines = chunk_size, na.strings = "",
33+
flush = FALSE, fill = FALSE, strip.white = FALSE, quiet = TRUE,
34+
blank.lines.skip = FALSE, comment.char = "", allowEscapes = FALSE,
35+
encoding = "UTF-8", skipNul = FALSE
36+
)
37+
}
38+
} else {
39+
hashes <- sapply(
40+
list.files(
41+
file, pattern = "(index|[[:xdigit:]]{20}\\.tsv$)", full.names = TRUE
42+
),
43+
datahash
44+
)
3545
}
3646
hash(paste(hashes, collapse = ""))
3747
}

R/is_git2rdata.R

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,51 @@ is_git2rdata.character <- function(file, root = ".",
4343

4444
# read the metadata
4545
meta_data <- read_yaml(file["meta_file"])
46-
47-
correct <- names(meta_data)
48-
correct <- paste(correct[correct != "..generic"], collapse = "\t")
49-
header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
50-
if (correct != header) {
51-
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
52-
switch(message, error = stop(msg, call. = FALSE),
53-
warning = warning(msg, call. = FALSE))
54-
return(FALSE)
46+
if (has_name(meta_data[["..generic"]], "split_by")) {
47+
header <- readLines(
48+
file.path(file["raw_file"], "index.tsv"), n = 1, encoding = "UTF-8"
49+
)
50+
correct <- paste(
51+
c(meta_data[["..generic"]][["split_by"]], "..hash"),
52+
collapse = "\t"
53+
)
54+
if (correct != header) {
55+
msg <- paste(
56+
"Corrupt data, incorrect header in index.tsv. Expecting:", correct
57+
)
58+
switch(message, error = stop(msg, call. = FALSE),
59+
warning = warning(msg, call. = FALSE))
60+
return(FALSE)
61+
}
62+
correct <- names(meta_data)
63+
keep <- !correct %in% c("..generic", meta_data[["..generic"]][["split_by"]])
64+
correct <- paste(correct[keep], collapse = "\t")
65+
header <- vapply(
66+
list.files(file["raw_file"], pattern = "[[:xdigit:]]{20}\\.tsv"),
67+
function(z) {
68+
readLines(
69+
file.path(file["raw_file"], z), n = 1, encoding = "UTF-8"
70+
)
71+
},
72+
character(1)
73+
)
74+
if (any(header != correct)) {
75+
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
76+
switch(message, error = stop(msg, call. = FALSE),
77+
warning = warning(msg, call. = FALSE))
78+
return(FALSE)
79+
}
80+
} else {
81+
correct <- names(meta_data)
82+
correct <- paste(correct[correct != "..generic"], collapse = "\t")
83+
header <- readLines(file["raw_file"], n = 1, encoding = "UTF-8")
84+
if (correct != header) {
85+
msg <- paste("Corrupt data, incorrect header. Expecting:", correct)
86+
switch(message, error = stop(msg, call. = FALSE),
87+
warning = warning(msg, call. = FALSE))
88+
return(FALSE)
89+
}
5590
}
56-
5791
return(TRUE)
5892
}
5993

R/is_git2rmeta.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,11 @@ is_git2rmeta.character <- function(file, root = ".",
3838
file <- clean_data_path(root = root, file = file)
3939

4040
if (!file.exists(file["meta_file"])) {
41-
msg <- "Metadata file missing."
41+
msg <- ifelse(
42+
file.exists(file["raw_file"]),
43+
"Metadata file missing.",
44+
"`git2rdata` object not found."
45+
)
4246
switch(message, error = stop(msg, call. = FALSE),
4347
warning = warning(msg, call. = FALSE))
4448
return(FALSE)

R/meta.R

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,12 +211,24 @@ meta.Date <- function(x, optimize = TRUE, ...) {
211211
#' @rdname meta
212212
#' @inheritParams write_vc
213213
meta.data.frame <- function(# nolint
214-
x, optimize = TRUE, na = "NA", sorting, strict = TRUE, ...
214+
x, optimize = TRUE, na = "NA", sorting, strict = TRUE,
215+
split_by = character(0), ...
215216
) {
216217
assert_that(
217218
!has_name(x, "..generic"),
218219
msg = "'..generic' is a reserved name and not allowed as column name")
220+
assert_that(
221+
!has_name(x, "..hash"),
222+
msg = "'..hash' is a reserved name and not allowed as column name")
219223
generic <- list(optimize = optimize, "NA string" = na)
224+
assert_that(is.character(split_by))
225+
assert_that(
226+
all(split_by %in% colnames(x)),
227+
msg = "All split_by variables must be available in the data.frame")
228+
assert_that(
229+
any(!colnames(x) %in% split_by),
230+
msg = "No remaining variables after splitting"
231+
)
220232

221233
dots <- list(...)
222234
if (has_name(dots, "old")) {
@@ -236,6 +248,7 @@ Sorting is strongly recommended in combination with version control.")
236248
assert_that(
237249
all(sorting %in% colnames(x)),
238250
msg = "All sorting variables must be available in the data.frame")
251+
sorting <- unique(c(split_by, sorting))
239252
if (nrow(x) > 1) {
240253
old_locale <- set_c_locale()
241254
x <- x[do.call(order, unname(x[sorting])), , drop = FALSE] # nolint
@@ -249,6 +262,9 @@ Add extra sorting variables to ensure small diffs.", sorted)
249262
}
250263
generic <- c(generic, sorting = list(sorting))
251264
}
265+
if (length(split_by) > 0) {
266+
generic <- c(generic, split_by = list(split_by))
267+
}
252268
# calculate meta for each column
253269
if (has_name(dots, "old")) {
254270
common <- names(old)[names(old) %in% colnames(x)]

0 commit comments

Comments
 (0)