Skip to content

Commit 4b7e86b

Browse files
committed
PARQUET-2249: Add IEEE-754 total order and nan count for floating types
1 parent 6e2f7bb commit 4b7e86b

File tree

24 files changed

+2176
-72
lines changed

24 files changed

+2176
-72
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 47 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.io.api.Binary;
22+
import org.apache.parquet.schema.ColumnOrder;
23+
import org.apache.parquet.schema.Float16;
24+
import org.apache.parquet.schema.LogicalTypeAnnotation;
2225
import org.apache.parquet.schema.PrimitiveType;
2326
import org.apache.parquet.schema.Types;
2427

@@ -28,6 +31,7 @@ public class BinaryStatistics extends Statistics<Binary> {
2831
private static final PrimitiveType DEFAULT_FAKE_TYPE =
2932
Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named("fake_binary_type");
3033

34+
private final boolean isFloat16;
3135
private Binary max;
3236
private Binary min;
3337

@@ -41,26 +45,51 @@ public BinaryStatistics() {
4145

4246
BinaryStatistics(PrimitiveType type) {
4347
super(type);
48+
this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
49+
if (isFloat16) {
50+
incrementNanCount(0);
51+
}
4452
}
4553

4654
private BinaryStatistics(BinaryStatistics other) {
4755
super(other.type());
56+
this.isFloat16 = other.isFloat16;
4857
if (other.hasNonNullValue()) {
4958
initializeStats(other.min, other.max);
5059
}
5160
setNumNulls(other.getNumNulls());
61+
incrementNanCount(other.getNanCount());
5262
}
5363

5464
@Override
5565
public void updateStats(Binary value) {
66+
if (isFloat16 && Float16.isNaN(value.get2BytesLittleEndian())) {
67+
incrementNanCount();
68+
}
5669
if (!this.hasNonNullValue()) {
5770
min = value.copy();
5871
max = value.copy();
5972
this.markAsNotEmpty();
60-
} else if (comparator().compare(min, value) > 0) {
61-
min = value.copy();
62-
} else if (comparator().compare(max, value) < 0) {
63-
max = value.copy();
73+
} else {
74+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
75+
if (!Float16.isNaN(value.get2BytesLittleEndian())) {
76+
if (Float16.isNaN(min.get2BytesLittleEndian())
77+
|| comparator().compare(min, value) > 0) {
78+
min = value.copy();
79+
}
80+
if (Float16.isNaN(max.get2BytesLittleEndian())
81+
|| comparator().compare(max, value) < 0) {
82+
max = value.copy();
83+
}
84+
}
85+
return;
86+
}
87+
88+
if (comparator().compare(min, value) > 0) {
89+
min = value.copy();
90+
} else if (comparator().compare(max, value) < 0) {
91+
max = value.copy();
92+
}
6493
}
6594
}
6695

@@ -126,6 +155,20 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
126155
*/
127156
@Deprecated
128157
public void updateStats(Binary min_value, Binary max_value) {
158+
if (isFloat16 && type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
159+
if (!Float16.isNaN(min_value.get2BytesLittleEndian())) {
160+
if (Float16.isNaN(min.get2BytesLittleEndian()) || comparator().compare(min, min_value) > 0) {
161+
min = min_value.copy();
162+
}
163+
}
164+
if (!Float16.isNaN(max_value.get2BytesLittleEndian())) {
165+
if (Float16.isNaN(max.get2BytesLittleEndian()) || comparator().compare(max, max_value) < 0) {
166+
max = max_value.copy();
167+
}
168+
}
169+
return;
170+
}
171+
129172
if (comparator().compare(min, min_value) > 0) {
130173
min = min_value.copy();
131174
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -41,6 +42,7 @@ public DoubleStatistics() {
4142

4243
DoubleStatistics(PrimitiveType type) {
4344
super(type);
45+
incrementNanCount(0);
4446
}
4547

4648
private DoubleStatistics(DoubleStatistics other) {
@@ -49,10 +51,14 @@ private DoubleStatistics(DoubleStatistics other) {
4951
initializeStats(other.min, other.max);
5052
}
5153
setNumNulls(other.getNumNulls());
54+
incrementNanCount(other.getNanCount());
5255
}
5356

5457
@Override
5558
public void updateStats(double value) {
59+
if (Double.isNaN(value)) {
60+
incrementNanCount();
61+
}
5662
if (!this.hasNonNullValue()) {
5763
initializeStats(value, value);
5864
} else {
@@ -98,6 +104,20 @@ public boolean isSmallerThan(long size) {
98104
}
99105

100106
public void updateStats(double min_value, double max_value) {
107+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
108+
if (!Double.isNaN(min_value)) {
109+
if (Double.isNaN(min) || comparator().compare(min, min_value) > 0) {
110+
min = min_value;
111+
}
112+
}
113+
if (!Double.isNaN(max_value)) {
114+
if (Double.isNaN(max) || comparator().compare(max, max_value) < 0) {
115+
max = max_value;
116+
}
117+
}
118+
return;
119+
}
120+
101121
if (comparator().compare(min, min_value) > 0) {
102122
min = min_value;
103123
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import org.apache.parquet.bytes.BytesUtils;
22+
import org.apache.parquet.schema.ColumnOrder;
2223
import org.apache.parquet.schema.PrimitiveType;
2324
import org.apache.parquet.schema.Types;
2425

@@ -42,6 +43,7 @@ public FloatStatistics() {
4243

4344
FloatStatistics(PrimitiveType type) {
4445
super(type);
46+
incrementNanCount(0);
4547
}
4648

4749
private FloatStatistics(FloatStatistics other) {
@@ -50,10 +52,14 @@ private FloatStatistics(FloatStatistics other) {
5052
initializeStats(other.min, other.max);
5153
}
5254
setNumNulls(other.getNumNulls());
55+
incrementNanCount(other.getNanCount());
5356
}
5457

5558
@Override
5659
public void updateStats(float value) {
60+
if (Float.isNaN(value)) {
61+
incrementNanCount();
62+
}
5763
if (!this.hasNonNullValue()) {
5864
initializeStats(value, value);
5965
} else {
@@ -99,6 +105,20 @@ public boolean isSmallerThan(long size) {
99105
}
100106

101107
public void updateStats(float min_value, float max_value) {
108+
if (type().columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
109+
if (!Float.isNaN(min_value)) {
110+
if (Float.isNaN(min) || comparator().compare(min, min_value) > 0) {
111+
min = min_value;
112+
}
113+
}
114+
if (!Float.isNaN(max_value)) {
115+
if (Float.isNaN(max) || comparator().compare(max, max_value) < 0) {
116+
max = max_value;
117+
}
118+
}
119+
return;
120+
}
121+
102122
if (comparator().compare(min, min_value) > 0) {
103123
min = min_value;
104124
}

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 83 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
package org.apache.parquet.column.statistics;
2020

2121
import java.util.Arrays;
22+
import org.apache.parquet.Preconditions;
2223
import org.apache.parquet.column.UnknownColumnTypeException;
2324
import org.apache.parquet.io.api.Binary;
25+
import org.apache.parquet.schema.ColumnOrder;
2426
import org.apache.parquet.schema.Float16;
2527
import org.apache.parquet.schema.LogicalTypeAnnotation;
2628
import org.apache.parquet.schema.PrimitiveComparator;
@@ -40,10 +42,11 @@ public abstract class Statistics<T extends Comparable<T>> {
4042
* Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
4143
*/
4244
public static class Builder {
43-
private final PrimitiveType type;
45+
protected final PrimitiveType type;
4446
private byte[] min;
4547
private byte[] max;
4648
private long numNulls = -1;
49+
private long nanCount = -1;
4750

4851
private Builder(PrimitiveType type) {
4952
this.type = type;
@@ -64,12 +67,21 @@ public Builder withNumNulls(long numNulls) {
6467
return this;
6568
}
6669

70+
public Builder withNanCount(long nanCount) {
71+
this.nanCount = nanCount;
72+
return this;
73+
}
74+
6775
public Statistics<?> build() {
6876
Statistics<?> stats = createStats(type);
6977
if (min != null && max != null) {
7078
stats.setMinMaxFromBytes(min, max);
7179
}
7280
stats.num_nulls = this.numNulls;
81+
stats.nan_count = this.nanCount;
82+
Preconditions.checkState(
83+
!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder()) || stats.nan_count >= 0,
84+
"nan_count is required by IEEE 754 column order with type " + type);
7385
return stats;
7486
}
7587
}
@@ -87,10 +99,12 @@ public Statistics<?> build() {
8799
if (stats.hasNonNullValue()) {
88100
Float min = stats.genericGetMin();
89101
Float max = stats.genericGetMax();
90-
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
91102
if (min.isNaN() || max.isNaN()) {
92-
stats.setMinMax(0.0f, 0.0f);
93-
((Statistics<?>) stats).hasNonNullValue = false;
103+
if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
104+
// For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
105+
stats.setMinMax(0.0f, 0.0f);
106+
((Statistics<?>) stats).hasNonNullValue = false;
107+
}
94108
} else {
95109
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
96110
if (Float.compare(min, 0.0f) == 0) {
@@ -120,10 +134,12 @@ public Statistics<?> build() {
120134
if (stats.hasNonNullValue()) {
121135
Double min = stats.genericGetMin();
122136
Double max = stats.genericGetMax();
123-
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
124137
if (min.isNaN() || max.isNaN()) {
125-
stats.setMinMax(0.0, 0.0);
126-
((Statistics<?>) stats).hasNonNullValue = false;
138+
if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
139+
// For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
140+
stats.setMinMax(0.0, 0.0);
141+
((Statistics<?>) stats).hasNonNullValue = false;
142+
}
127143
} else {
128144
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
129145
if (Double.compare(min, 0.0) == 0) {
@@ -156,10 +172,12 @@ public Statistics<?> build() {
156172
Binary bMax = stats.genericGetMax();
157173
short min = bMin.get2BytesLittleEndian();
158174
short max = bMax.get2BytesLittleEndian();
159-
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
160175
if (Float16.isNaN(min) || Float16.isNaN(max)) {
161-
stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
162-
((Statistics<?>) stats).hasNonNullValue = false;
176+
if (!type.columnOrder().equals(ColumnOrder.ieee754TotalOrder())) {
177+
// For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
178+
stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
179+
((Statistics<?>) stats).hasNonNullValue = false;
180+
}
163181
} else {
164182
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
165183
if (min == (short) 0x0000) {
@@ -180,6 +198,7 @@ public Statistics<?> build() {
180198
private final PrimitiveComparator<T> comparator;
181199
private boolean hasNonNullValue;
182200
private long num_nulls;
201+
private long nan_count = -1;
183202
final PrimitiveStringifier stringifier;
184203

185204
Statistics(PrimitiveType type) {
@@ -349,7 +368,8 @@ public boolean equals(Object other) {
349368
return type.equals(stats.type)
350369
&& Arrays.equals(stats.getMaxBytes(), this.getMaxBytes())
351370
&& Arrays.equals(stats.getMinBytes(), this.getMinBytes())
352-
&& stats.getNumNulls() == this.getNumNulls();
371+
&& stats.getNumNulls() == this.getNumNulls()
372+
&& stats.getNanCount() == this.getNanCount();
353373
}
354374

355375
/**
@@ -382,6 +402,11 @@ public void mergeStatistics(Statistics stats) {
382402
mergeStatisticsMinMax(stats);
383403
markAsNotEmpty();
384404
}
405+
if (isNanCountSet() && stats.isNanCountSet()) {
406+
incrementNanCount(stats.getNanCount());
407+
} else {
408+
unsetNanCount();
409+
}
385410
} else {
386411
throw StatisticsClassException.create(this, stats);
387412
}
@@ -533,6 +558,53 @@ public void incrementNumNulls(long increment) {
533558
num_nulls += increment;
534559
}
535560

561+
/**
562+
* Increments the NaN count by one. If nan_count was not set (-1), initializes it to 1.
563+
*/
564+
public void incrementNanCount() {
565+
if (nan_count < 0) {
566+
nan_count = 1;
567+
} else {
568+
nan_count++;
569+
}
570+
}
571+
572+
/**
573+
* Increments the NaN count by the parameter value. If nan_count was not set (-1), initializes it to increment.
574+
*
575+
* @param increment value to increment the NaN count by
576+
*/
577+
public void incrementNanCount(long increment) {
578+
if (nan_count < 0) {
579+
nan_count = increment;
580+
} else {
581+
nan_count += increment;
582+
}
583+
}
584+
585+
/**
586+
* Returns the NaN count
587+
*
588+
* @return NaN count or {@code -1} if the NaN count is not set
589+
*/
590+
public long getNanCount() {
591+
return nan_count;
592+
}
593+
594+
/**
595+
* @return whether nanCount is set and can be used
596+
*/
597+
public boolean isNanCountSet() {
598+
return nan_count >= 0;
599+
}
600+
601+
/**
602+
* Unsets the NaN count to -1.
603+
*/
604+
public void unsetNanCount() {
605+
nan_count = -1;
606+
}
607+
536608
/**
537609
* Returns the null count
538610
*

0 commit comments

Comments
 (0)