Rdatatable · ben-schwen · May 27, 2026 · May 26, 2026 · May 27, 2026 · May 27, 2026
@@ -52,6 +52,8 @@
 
 9. Grouping operations on empty (0-row, 0-column) data.tables work as intended, [#7749](https://github.com/Rdatatable/data.table/issues/7749). Thanks @rickhelmus for the report and @MichaelChirico for the fix.
 
+10. `setkey()`, `setindex()`, `CJ()`, and `setnames()` now prevent creating ambiguous keys from duplicated column names, and keyed joins now error on existing duplicated key columns rather than silently giving incorrect results, [#4888](https://github.com/Rdatatable/data.table/issues/4888) and [#4891](https://github.com/Rdatatable/data.table/issues/4891). Thanks @magerton and @MichaelChirico for the reports, and @ben-schwen for the fix.
+
 ### Notes
 
 1. {data.table} now depends on R 3.5.0 (2018).

@@ -841,9 +841,10 @@ replace_dot_alias = function(e) {
 
     if (is.data.frame(i)) {
       if (missing(on)) {
-        if (!haskey(x)) {
-          stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
-        }
+        if (haskey(x)) check_duplicate_key(x)
+        else stopf("When i is a data.table (or character vector), the columns to join by must be specified using the 'on=' argument (see ?data.table); by keying x (i.e., x is sorted and marked as such, see ?setkey); or by using 'on = .NATURAL' to indicate using the shared column names between x and i (i.e., a natural join). Keyed joins might have further speed benefits on very large data due to x being sorted in RAM.")
+
+        if (haskey(i)) check_duplicate_key(i)
       } else if (identical(substitute(on), as.name(".NATURAL"))) {
         naturaljoin = TRUE
       }
@@ -1871,10 +1872,10 @@ replace_dot_alias = function(e) {
       }
       shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x))
       if (is.null(irows) && !is.null(shared_keys)) {
-        setattr(jval, 'sorted', shared_keys)
+        if (!any(shared_keys %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', shared_keys)
         # potentially inefficient backup -- check if jval is sorted by key(x)
       } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) {
-        setattr(jval, 'sorted', key(x))
+        if (!any(key(x) %chin% duplicated_values(names(jval)))) setattr(jval, 'sorted', key(x))
       }
       if (any(vapply_1b(jval, is.null))) internal_error("j has created a data.table result containing a NULL column") # nocov
     }
@@ -2188,7 +2189,8 @@ replace_dot_alias = function(e) {
     setkeyv(ans,names(ans)[seq_along(byval)])
     if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
   } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) {
-    setattr(ans, "sorted", names(ans)[seq_along(grpcols)])
+    grpnames = names(ans)[seq_along(grpcols)]
+    if (!any(grpnames %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", grpnames)
   }
   setalloccol(ans)   # TODO: overallocate in dogroups in the first place and remove this line
 }
@@ -2610,6 +2612,7 @@ subset.data.table = function(x, subset, select, ...)
   } else {
     setkey(ans,NULL)
   }
+  if (haskey(ans) && any(key(ans) %chin% duplicated_values(names(ans)))) setattr(ans, "sorted", NULL)
   ans
 }
 
@@ -2946,6 +2949,15 @@ setnames = function(x,old,new,skip_absent=FALSE) {
   # update the key if the column name being change is in the key
   m = chmatch(names(x)[i], key(x))
   w = which(!is.na(m))
+  if (haskey(x)) {
+    check_duplicate_key(x)
+    k = key(x)
+    if (length(w)) k[m[w]] = new[w]
+    newnames = names(x)
+    newnames[i] = new
+    duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(newnames)]))
+    if (length(duplicate_key)) stopf("The new names would result in duplicated key columns: %s", brackify(duplicate_key))
+  }
   if (length(w))
     .Call(Csetcharvec, attr(x, "sorted", exact=TRUE), m[w], new[w])
 

@@ -44,6 +44,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   cols = gsub("`", "", cols, fixed = TRUE)
   miss = !(cols %chin% colnames(x))
   if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]), class = "dt_missing_column_error")
+  if (anyDuplicated(cols)) stopf("cols contains duplicate column names: %s", brackify(duplicated_values(cols)))
+  if (any(cols %chin% (dups <- duplicated_values(names(x)))))
+    stopf("x has duplicated column names in the columns to key by: %s", brackify(cols[cols %chin% dups]))
 
   if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387
     ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
@@ -309,6 +312,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE)
   l = list(...)
   vnames = name_dots(...)$vnames
   if (any(tt <- !nzchar(vnames))) vnames[tt] = paste0("V", which(tt))
+  if (sorted && anyDuplicated(vnames)) stopf("CJ() cannot create a keyed data.table with duplicated column names: %s", brackify(duplicated_values(vnames)))
   dups = FALSE # fix for #1513
   for (i in seq_along(l)) {
     y = l[[i]]

@@ -35,6 +35,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
         table_name, brackify(duplicate_names), domain=NA)
 }
 
+check_duplicate_key = function(x) {
+  k = key(x)
+  duplicate_key = unique(c(k[duplicated(k)], k[k %chin% duplicated_values(names(x))]))
+  if (length(duplicate_key))
+    stopf(ngettext(length(duplicate_key),
+                   "%s has duplicated key column %s. Please remove or rename the duplicate and try again.",
+                   "%s has duplicated key columns %s. Please remove or rename the duplicates and try again."),
+          deparse(substitute(x)), brackify(duplicate_key), domain=NA)
+  invisible()
+}
+
 duplicated_values = function(x) {
   # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
   if (!anyDuplicated(x)) return(vector(typeof(x)))

@@ -2405,7 +2405,8 @@ b=data.table('User ID'=c(1,2,3), 'Yadda Yadda'=c(1,2,3), key='User ID')
 test(827.1, names(a[b]), c("User ID","Blah Blah","Yadda Yadda"))
 
 # setcolorder and merge check for dup column names, #2193(ii)
-setnames(DT2,"b","a")
+DT2 = data.table(a=letters[1:5], a=6L)
+setattr(DT2, "sorted", "a")
 test(828, setcolorder(DT2,c("a","b")), error="x has duplicated column name [a]. Please remove or rename")
 test(829, merge(DT1,DT2), error="y has duplicated column name [a]. Please remove or rename")
 test(830, merge(DT2,DT1), error="x has duplicated column name [a]. Please remove or rename")
@@ -12364,9 +12365,9 @@ children = data.table(parent=c("Sarah", "Max", "Max"),
                        sex=c("M", "M", "F"), age=c(5,8,7))
 joined = merge(parents, children, by.x="name", by.y="parent")
 test(1880.1, length(names(joined)), length(unique(names(joined))))
-test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
+test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""), sort=FALSE)), 3L,
              warning = "column names.*are duplicated in the result")
-joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE))
+joined = suppressWarnings(merge(parents, children, by.x="name", by.y="parent", no.dups=FALSE, sort=FALSE))
 test(1880.3, anyDuplicated(names(joined)) > 0L, TRUE)
 
 # out-of-sample quote rule bump, #2265
@@ -12384,7 +12385,7 @@ unlink(f)
 test(1882.1, .Machine$integer.max, 2147483647L)  # same on all platforms and very unlikely to change in R (which is good)
 test(1882.2, ceiling(.Machine$integer.max^(1/3)), 1291)
 v = seq_len(1291L)
-test(1882.3, CJ(v, v, v), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647")
+test(1882.3, CJ(v, v, v, sorted=FALSE), error="Cross product of elements provided to CJ() would result in 2151685171 rows which exceeds .Machine$integer.max == 2147483647")
 
 # no re-read for particular file, #2509
 if (test_R.utils) test(1883, fread(testDir("SA2-by-DJZ.csv.gz"), verbose=TRUE, header=FALSE)[c(1,2,1381,.N),],
@@ -13599,7 +13600,7 @@ test(1967.26, foverlaps(x, y, by.x = c('start', 'END')),
      error = "Elements listed in 'by.x' must be valid names")
 test(1967.27, foverlaps(x, y, by.x = c('start', 'start')),
      error = 'Duplicate columns are not allowed')
-setkey(y, start, start)
+setattr(y, 'sorted', c('start', 'start'))
 test(1967.28, foverlaps(x, y, by.y = c('start', 'start')),
      error = 'Duplicate columns are not allowed')
 setkey(y, start, end)
@@ -13693,7 +13694,7 @@ test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOu
 # test 1967.69 subsumes 1967.69 and 1967.70
 test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L,
      output = "forder.c received 5 rows and 1 column")
-setkey(x)
+setattr(x, 'sorted', c('a', 'a'))
 test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L,
      output = "on= matches existing key")
 
@@ -16102,8 +16103,9 @@ test(2114.2, DT[,g(.GRP),by=A], data.table(A=INT(1,1,2,2,2), V1=as.factor(c("a",
 set.seed(2)
 ids   = sample(letters, 10)   # reduced from 20 to 10
 dates = 1:10                  #   and 40 to 10 to save ram, #5517
-dt = data.table(CJ(dates, ids, ids))
+dt = data.table(CJ(dates, ids, ids, sorted=FALSE))
 setnames(dt, c("date", "id1", "id2"))
+setorder(dt, date, id1, id2)
 dt[, value := rnorm(length(date))]
 dt = dt[!(date == 1 & (id1 == "a" | id2 == "a"))]
 dt = dt[!(date == 4 & (id1 == "e" | id2 == "e"))]
@@ -21620,3 +21622,22 @@ test(2373.1, empty_dt[, 1, by = numeric()], data.table(numeric=numeric(), V1=num
 test(2373.2, empty_dt[, 1L, by = numeric()], data.table(numeric=numeric(), V1=integer()))
 test(2373.3, empty_dt[, .(x=1), by = .(b=numeric())], data.table(b=numeric(), x=numeric()))
 test(2373.4, data.table()[, x := 1, by=numeric()], data.table(x=numeric()))
+
+# prohibit duplicated column names as key columns, #4888 and #4891
+DT = data.table(a=1:3, a=4:6)
+test(2374.01, setkey(DT, a, a), error="cols contains duplicate column names")
+test(2374.02, setindex(DT, a, a), error="cols contains duplicate column names")
+test(2374.03, setkey(DT, a), error="duplicated column names in the columns to key by")
+DT = data.table(a=1:2, b=3:4, key=c("a", "b"))
+test(2374.04, setnames(DT, "b", "a"), error="duplicated key columns")
+test(2374.05, key(DT), c("a", "b"))
+test(2374.06, CJ(a=1:3, a=4:5), error="duplicated column names")
+DT1 = CJ(a=1:3, b=c('a','b'), c=6)
+DT2 = data.table(a=rep(1:3, each=2L), a=rep(4:5, 3L), d=6)
+setattr(DT2, "sorted", c("a", "a", "d"))
+test(2374.07, DT1[DT2], error="i has duplicated key column")
+DT = data.table(a=1:2, b=3:4, key="a")
+test(2374.08, key(DT[, .(a, a)]), NULL)
+test(2374.09, key(subset(DT, select=c(a, a))), NULL)
+DT = data.table(a=1:2, a.1=3:4, val=10:11)
+test(2374.10, key(DT[, .(a.1, sum(val)), keyby=.(a, a)]), NULL)