Skip to content

Commit 98adfbd

Browse files
authored
test: improve array_distinct test coverage and incompatibility description (#3887)
1 parent 9d3e166 commit 98adfbd

2 files changed

Lines changed: 153 additions & 4 deletions

File tree

spark/src/main/scala/org/apache/comet/serde/arrays.scala

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -189,7 +189,8 @@ object CometArrayContains extends CometExpressionSerde[ArrayContains] {
189189

190190
object CometArrayDistinct extends CometExpressionSerde[ArrayDistinct] {
191191

192-
override def getSupportLevel(expr: ArrayDistinct): SupportLevel = Incompatible(None)
192+
override def getSupportLevel(expr: ArrayDistinct): SupportLevel =
193+
Incompatible(Some("Output elements are sorted rather than preserving insertion order"))
193194

194195
override def convert(
195196
expr: ArrayDistinct,

spark/src/test/resources/sql-tests/expressions/array/array_distinct.sql

Lines changed: 151 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,15 +17,163 @@
1717

1818
-- ConfigMatrix: parquet.enable.dictionary=false,true
1919

20+
-- ===== INT arrays =====
21+
2022
statement
21-
CREATE TABLE test_array_distinct(arr array<int>) USING parquet
23+
CREATE TABLE test_array_distinct_int(arr array<int>) USING parquet
2224

2325
statement
24-
INSERT INTO test_array_distinct VALUES (array(1, 2, 2, 3, 3)), (array()), (NULL), (array(NULL, 1, NULL, 2)), (array(1))
26+
INSERT INTO test_array_distinct_int VALUES
27+
(array(1, 2, 2, 3, 3)),
28+
(array()),
29+
(NULL),
30+
(array(NULL, 1, NULL, 2)),
31+
(array(1)),
32+
(array(NULL, NULL, NULL)),
33+
(array(-2147483648, 2147483647, -2147483648, 0)),
34+
(array(0, -1, -1, 0, 1))
2535

36+
-- column argument
2637
query spark_answer_only
27-
SELECT array_distinct(arr) FROM test_array_distinct
38+
SELECT array_distinct(arr) FROM test_array_distinct_int
2839

2940
-- literal arguments
3041
query spark_answer_only
3142
SELECT array_distinct(array(1, 2, 2, 3, 3))
43+
44+
-- all NULLs
45+
query spark_answer_only
46+
SELECT array_distinct(array(CAST(NULL AS INT), CAST(NULL AS INT)))
47+
48+
-- NULL input
49+
query spark_answer_only
50+
SELECT array_distinct(CAST(NULL AS array<int>))
51+
52+
-- boundary values
53+
query spark_answer_only
54+
SELECT array_distinct(array(-2147483648, 2147483647, -2147483648, 2147483647, 0))
55+
56+
-- ===== LONG arrays =====
57+
58+
statement
59+
CREATE TABLE test_array_distinct_long(arr array<bigint>) USING parquet
60+
61+
statement
62+
INSERT INTO test_array_distinct_long VALUES
63+
(array(1, 2, 2, 3, 3)),
64+
(NULL),
65+
(array(NULL, 1, NULL, 2)),
66+
(array(-9223372036854775808, 9223372036854775807, -9223372036854775808))
67+
68+
query spark_answer_only
69+
SELECT array_distinct(arr) FROM test_array_distinct_long
70+
71+
-- boundary values
72+
query spark_answer_only
73+
SELECT array_distinct(array(CAST(-9223372036854775808 AS BIGINT), CAST(9223372036854775807 AS BIGINT), CAST(-9223372036854775808 AS BIGINT)))
74+
75+
-- ===== STRING arrays =====
76+
77+
statement
78+
CREATE TABLE test_array_distinct_string(arr array<string>) USING parquet
79+
80+
statement
81+
INSERT INTO test_array_distinct_string VALUES
82+
(array('b', 'a', 'a', 'c', 'b')),
83+
(array('')),
84+
(NULL),
85+
(array(NULL, 'a', NULL, 'a')),
86+
(array('', '', NULL, '')),
87+
(array('hello', 'world', 'hello'))
88+
89+
query spark_answer_only
90+
SELECT array_distinct(arr) FROM test_array_distinct_string
91+
92+
-- empty string and NULL distinction
93+
query spark_answer_only
94+
SELECT array_distinct(array('', NULL, '', NULL, 'a'))
95+
96+
-- ===== BOOLEAN arrays =====
97+
98+
statement
99+
CREATE TABLE test_array_distinct_bool(arr array<boolean>) USING parquet
100+
101+
statement
102+
INSERT INTO test_array_distinct_bool VALUES
103+
(array(true, false, false, true)),
104+
(array(true, true)),
105+
(NULL),
106+
(array(NULL, true, NULL, false))
107+
108+
query spark_answer_only
109+
SELECT array_distinct(arr) FROM test_array_distinct_bool
110+
111+
-- ===== DOUBLE arrays =====
112+
113+
statement
114+
CREATE TABLE test_array_distinct_double(arr array<double>) USING parquet
115+
116+
statement
117+
INSERT INTO test_array_distinct_double VALUES
118+
(array(1.123, 0.1234, 1.121, 1.123, 0.1234)),
119+
(NULL),
120+
(array(NULL, 1.0, NULL, 2.0))
121+
122+
query spark_answer_only
123+
SELECT array_distinct(arr) FROM test_array_distinct_double
124+
125+
-- NaN deduplication
126+
query spark_answer_only
127+
SELECT array_distinct(array(CAST('NaN' AS DOUBLE), CAST('NaN' AS DOUBLE), 1.0, 1.0))
128+
129+
-- NaN with NULL
130+
query spark_answer_only
131+
SELECT array_distinct(array(CAST('NaN' AS DOUBLE), NULL, CAST('NaN' AS DOUBLE), NULL, 1.0))
132+
133+
-- Infinity
134+
query spark_answer_only
135+
SELECT array_distinct(array(CAST('Infinity' AS DOUBLE), CAST('-Infinity' AS DOUBLE), CAST('Infinity' AS DOUBLE), 0.0))
136+
137+
-- negative zero
138+
query spark_answer_only
139+
SELECT array_distinct(array(0.0, -0.0, 1.0))
140+
141+
-- ===== FLOAT arrays =====
142+
143+
statement
144+
CREATE TABLE test_array_distinct_float(arr array<float>) USING parquet
145+
146+
statement
147+
INSERT INTO test_array_distinct_float VALUES
148+
(array(CAST(1.123 AS FLOAT), CAST(0.1234 AS FLOAT), CAST(1.121 AS FLOAT), CAST(1.123 AS FLOAT))),
149+
(NULL),
150+
(array(CAST(NULL AS FLOAT), CAST(1.0 AS FLOAT), CAST(NULL AS FLOAT)))
151+
152+
query spark_answer_only
153+
SELECT array_distinct(arr) FROM test_array_distinct_float
154+
155+
-- Float NaN deduplication
156+
query spark_answer_only
157+
SELECT array_distinct(array(CAST('NaN' AS FLOAT), CAST('NaN' AS FLOAT), CAST(1.0 AS FLOAT)))
158+
159+
-- ===== DECIMAL arrays =====
160+
161+
statement
162+
CREATE TABLE test_array_distinct_decimal(arr array<decimal(10,2)>) USING parquet
163+
164+
statement
165+
INSERT INTO test_array_distinct_decimal VALUES
166+
(array(1.10, 2.20, 1.10, 3.30)),
167+
(NULL),
168+
(array(NULL, 1.10, NULL, 1.10))
169+
170+
query spark_answer_only
171+
SELECT array_distinct(arr) FROM test_array_distinct_decimal
172+
173+
-- ===== Nested array (array of arrays) =====
174+
175+
query spark_answer_only
176+
SELECT array_distinct(array(array(1, 2), array(3, 4), array(1, 2), array(3, 4)))
177+
178+
query spark_answer_only
179+
SELECT array_distinct(array(array(1, 2), CAST(NULL AS array<int>), array(1, 2), CAST(NULL AS array<int>)))

0 commit comments

Comments (0)