Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@

9. Grouping operations on empty (0-row, 0-column) data.tables work as intended, [#7749](https://github.com/Rdatatable/data.table/issues/7749). Thanks @rickhelmus for the report and @MichaelChirico for the fix.

10. `setkey()`, `setindex()`, `CJ()`, and `setnames()` now prevent creating ambiguous keys from duplicated column names, and keyed joins now error on existing duplicated key columns rather than silently giving incorrect results, [#4888](https://github.com/Rdatatable/data.table/issues/4888) and [#4891](https://github.com/Rdatatable/data.table/issues/4891). Thanks @magerton and @MichaelChirico for the reports, and @ben-schwen for the fix.

### Notes

1. {data.table} now depends on R 3.5.0 (2018).
Expand Down
24 changes: 18 additions & 6 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -841,9 +841,10 @@ replace_dot_alias = function(e) {

if (is.data.frame(i)) {
if (missing(on)) {
if (!haskey(x)) {
stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
}
if (haskey(x)) check_duplicate_key(x)
else stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")

if (haskey(i)) check_duplicate_key(i)
} else if (identical(substitute(on), as.name(".NATURAL"))) {
naturaljoin = TRUE
}
Expand Down Expand Up @@ -1871,10 +1872,10 @@ replace_dot_alias = function(e) {
}
shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x))
if (is.null(irows) && !is.null(shared_keys)) {
setattr(jval, 'sorted', shared_keys)
if (!any(shared_keys %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', shared_keys)
# potentially inefficient backup -- check if jval is sorted by key(x)
} else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) {
setattr(jval, 'sorted', key(x))
if (!any(key(x) %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', key(x))
}
if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov
}
Expand Down Expand Up @@ -2188,7 +2189,8 @@ replace_dot_alias = function(e) {
setkeyv(ans,names(ans)[seq_along(byval)])
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
} else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) {
setattr(ans, "sorted", names(ans)[seq_along(grpcols)])
grpnames = names(ans)[seq_along(grpcols)]
if (!any(grpnames %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", grpnames)
}
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
}
Expand Down Expand Up @@ -2610,6 +2612,7 @@ subset.data.table = function(x, subset, select, ...)
} else {
setkey(ans,NULL)
}
if (haskey(ans) && any(key(ans) %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", NULL)
ans
}

Expand Down Expand Up @@ -2946,6 +2949,15 @@ setnames = function(x,old,new,skip_absent=FALSE) {
# update the key if the column name being change is in the key
m = chmatch(names(x)[i], key(x))
w = which(!is.na(m))
if (haskey(x)) {
check_duplicate_key(x)
k = key(x)
if (length(w)) k[m[w]] = new[w]
newnames = names(x)
newnames[i] = new
duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(newnames)]))
if (length(duplicate_key)) stopf("The new names would result in duplicated key columns: %s", brackify(duplicate_key))
}
if (length(w))
.Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w])

Expand Down
4 changes: 4 additions & 0 deletions R/setkey.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
cols = gsub("`", "", cols, fixed = TRUE)
miss = !(cols %chin% colnames(x))
if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]), class = "dt_missing_column_error")
if (anyDuplicated(cols)) stopf("cols contains duplicate column names: %s", brackify(duplicated_values(cols)))
if (any(cols %chin% (dups <- duplicated_values(names(x)))))
stopf("x has duplicated column names in the columns to key by: %s", brackify(cols[cols %chin% dups]))

if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387
## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
Expand Down Expand Up @@ -309,6 +312,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE)
l = list(...)
vnames = name_dots(...)$vnames
if (any(tt <- !nzchar(vnames))) vnames[tt] = paste0("V", which(tt))
if (sorted && anyDuplicated(vnames)) stopf("CJ() cannot create a keyed data.table with duplicated column names: %s", brackify(duplicated_values(vnames)))
dups = FALSE # fix for #1513
for (i in seq_along(l)) {
y = l[[i]]
Expand Down
11 changes: 11 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
table_name, brackify(duplicate_names), domain=NA)
}

check_duplicate_key = function(x) {
k = key(x)
duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(names(x))]))
if (length(duplicate_key))
stopf(ngettext(length(duplicate_key),
"%s has duplicated key column %s. Please remove or rename the duplicate and try again.",
"%s has duplicated key columns %s. Please remove or rename the duplicates and try again."),
deparse(substitute(x)), brackify(duplicate_key), domain=NA)
invisible()
}

duplicated_values = function(x) {
# fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
if (!anyDuplicated(x)) return(vector(typeof(x)))
Expand Down
35 changes: 28 additions & 7 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -2405,7 +2405,8 @@ b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID')
test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda"))

# setcolorder and merge check for dup column names, #2193(ii)
setnames(DT2,"b","a")
DT2 = data.table(a=letters[1:5], a=6L)
setattr(DT2, "sorted", "a")
test(828, setcolorder(DT2,c("a","b")), error="x has duplicated column name [a]. Please remove or rename")
test(829, merge(DT1,DT2), error="y has duplicated column name [a]. Please remove or rename")
test(830, merge(DT2,DT1), error="x has duplicated column name [a]. Please remove or rename")
Expand Down Expand Up @@ -12364,9 +12365,9 @@ children = data.table(parent=c("Sarah", "Max", "Max"),
sex=c("M", "M", "F"), age=c(5,8,7))
joined = merge(parents, children, by.x="name", by.y="parent")
test(1880.1, length(names(joined)), length(unique(names(joined))))
test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""), sort=FALSE)), 3L,
warning = "column names.*are duplicated in the result")
joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE))
joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE, sort=FALSE))
test(1880.3, anyDuplicated(names(joined)) > 0L, TRUE)

# out-of-sample quote rule bump, #2265
Expand All @@ -12384,7 +12385,7 @@ unlink(f)
test(1882.1, .Machine$integer.max, 2147483647L) # same on all platforms and very unlikely to change in R (which is good)
test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291)
v = seq_len(1291L)
test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647")
test(1882.3, CJ(v, v, v, sorted=FALSE), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647")

# no re-read for particular file, #2509
if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),],
Expand Down Expand Up @@ -13599,7 +13600,7 @@ test(1967.26, foverlaps(x, y, by.x = c('start', 'END')),
error = "Elements listed in 'by.x' must be valid names")
test(1967.27, foverlaps(x, y, by.x = c('start', 'start')),
error = 'Duplicate columns are not allowed')
setkey(y, start, start)
setattr(y, 'sorted', c('start', 'start'))
test(1967.28, foverlaps(x, y, by.y = c('start', 'start')),
error = 'Duplicate columns are not allowed')
setkey(y, start, end)
Expand Down Expand Up @@ -13693,7 +13694,7 @@ test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOu
# test 1967.69 subsumes 1967.69 and 1967.70
test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L,
output = "forder.c received 5 rows and 1 column")
setkey(x)
setattr(x, 'sorted', c('a', 'a'))
test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L,
output = "on= matches existing key")

Expand Down Expand Up @@ -16102,8 +16103,9 @@ test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a",
set.seed(2)
ids = sample(letters, 10) # reduced from 20 to 10
dates = 1:10 # and 40 to 10 to save ram, #5517
dt = data.table(CJ(dates, ids, ids))
dt = data.table(CJ(dates, ids, ids, sorted=FALSE))
setnames(dt, c("date", "id1", "id2"))
setorder(dt, date, id1, id2)
dt[, value := rnorm(length(date))]
dt = dt[!(date == 1 & (id1 == "a" | id2 == "a"))]
dt = dt[!(date == 4 & (id1 == "e" | id2 == "e"))]
Expand Down Expand Up @@ -21620,3 +21622,22 @@ test(2373.1, empty_dt[, 1, by = numeric()], data.table(numeric=numeric(), V1=num
test(2373.2, empty_dt[, 1L, by = numeric()], data.table(numeric=numeric(), V1=integer()))
test(2373.3, empty_dt[, .(x=1), by = .(b=numeric())], data.table(b=numeric(), x=numeric()))
test(2373.4, data.table()[, x := 1, by=numeric()], data.table(x=numeric()))

# prohibit duplicated column names as key columns, #4888 and #4891
DT = data.table(a=1:3, a=4:6)
test(2374.01, setkey(DT, a, a), error="cols contains duplicate column names")
test(2374.02, setindex(DT, a, a), error="cols contains duplicate column names")
test(2374.03, setkey(DT, a), error="duplicated column names in the columns to key by")
DT = data.table(a=1:2, b=3:4, key=c("a", "b"))
test(2374.04, setnames(DT, "b", "a"), error="duplicated key columns")
test(2374.05, key(DT), c("a", "b"))
test(2374.06, CJ(a=1:3, a=4:5), error="duplicated column names")
DT1 = CJ(a=1:3, b=c('a','b'), c=6)
DT2 = data.table(a=rep(1:3, each=2L), a=rep(4:5, 3L), d=6)
setattr(DT2, "sorted", c("a", "a", "d"))
test(2374.07, DT1[DT2], error="i has duplicated key column")
DT = data.table(a=1:2, b=3:4, key="a")
test(2374.08, key(DT[, .(a, a)]), NULL)
test(2374.09, key(subset(DT, select=c(a, a))), NULL)
DT = data.table(a=1:2, a.1=3:4, val=10:11)
test(2374.10, key(DT[, .(a.1, sum(val)), keyby=.(a, a)]), NULL)
Loading