1919package org .apache .parquet .column .statistics ;
2020
2121import java .util .Arrays ;
22+ import org .apache .parquet .Preconditions ;
2223import org .apache .parquet .column .UnknownColumnTypeException ;
2324import org .apache .parquet .io .api .Binary ;
25+ import org .apache .parquet .schema .ColumnOrder ;
2426import org .apache .parquet .schema .Float16 ;
2527import org .apache .parquet .schema .LogicalTypeAnnotation ;
2628import org .apache .parquet .schema .PrimitiveComparator ;
@@ -40,10 +42,11 @@ public abstract class Statistics<T extends Comparable<T>> {
4042 * Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
4143 */
4244 public static class Builder {
43- private final PrimitiveType type ;
45+ protected final PrimitiveType type ;
4446 private byte [] min ;
4547 private byte [] max ;
4648 private long numNulls = -1 ;
49+ private long nanCount = -1 ;
4750
4851 private Builder (PrimitiveType type ) {
4952 this .type = type ;
@@ -64,12 +67,21 @@ public Builder withNumNulls(long numNulls) {
6467 return this ;
6568 }
6669
70+ public Builder withNanCount (long nanCount ) {
71+ this .nanCount = nanCount ;
72+ return this ;
73+ }
74+
6775 public Statistics <?> build () {
6876 Statistics <?> stats = createStats (type );
6977 if (min != null && max != null ) {
7078 stats .setMinMaxFromBytes (min , max );
7179 }
7280 stats .num_nulls = this .numNulls ;
81+ stats .nan_count = this .nanCount ;
82+ Preconditions .checkState (
83+ !type .columnOrder ().equals (ColumnOrder .ieee754TotalOrder ()) || stats .nan_count >= 0 ,
84+ "nan_count is required by IEEE 754 column order with type " + type );
7385 return stats ;
7486 }
7587 }
@@ -87,10 +99,12 @@ public Statistics<?> build() {
8799 if (stats .hasNonNullValue ()) {
88100 Float min = stats .genericGetMin ();
89101 Float max = stats .genericGetMax ();
90- // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
91102 if (min .isNaN () || max .isNaN ()) {
92- stats .setMinMax (0.0f , 0.0f );
93- ((Statistics <?>) stats ).hasNonNullValue = false ;
103+ if (!type .columnOrder ().equals (ColumnOrder .ieee754TotalOrder ())) {
104+ // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
105+ stats .setMinMax (0.0f , 0.0f );
106+ ((Statistics <?>) stats ).hasNonNullValue = false ;
107+ }
94108 } else {
95109 // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
96110 if (Float .compare (min , 0.0f ) == 0 ) {
@@ -120,10 +134,12 @@ public Statistics<?> build() {
120134 if (stats .hasNonNullValue ()) {
121135 Double min = stats .genericGetMin ();
122136 Double max = stats .genericGetMax ();
123- // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
124137 if (min .isNaN () || max .isNaN ()) {
125- stats .setMinMax (0.0 , 0.0 );
126- ((Statistics <?>) stats ).hasNonNullValue = false ;
138+ if (!type .columnOrder ().equals (ColumnOrder .ieee754TotalOrder ())) {
139+ // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
140+ stats .setMinMax (0.0 , 0.0 );
141+ ((Statistics <?>) stats ).hasNonNullValue = false ;
142+ }
127143 } else {
128144 // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
129145 if (Double .compare (min , 0.0 ) == 0 ) {
@@ -156,10 +172,12 @@ public Statistics<?> build() {
156172 Binary bMax = stats .genericGetMax ();
157173 short min = bMin .get2BytesLittleEndian ();
158174 short max = bMax .get2BytesLittleEndian ();
159- // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
160175 if (Float16 .isNaN (min ) || Float16 .isNaN (max )) {
161- stats .setMinMax (Float16 .POSITIVE_ZERO_LITTLE_ENDIAN , Float16 .POSITIVE_ZERO_LITTLE_ENDIAN );
162- ((Statistics <?>) stats ).hasNonNullValue = false ;
176+ if (!type .columnOrder ().equals (ColumnOrder .ieee754TotalOrder ())) {
177+ // For TYPE_DEFINED_ORDER: drop min/max values as NaN ordering is undefined
178+ stats .setMinMax (Float16 .POSITIVE_ZERO_LITTLE_ENDIAN , Float16 .POSITIVE_ZERO_LITTLE_ENDIAN );
179+ ((Statistics <?>) stats ).hasNonNullValue = false ;
180+ }
163181 } else {
164182 // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
165183 if (min == (short ) 0x0000 ) {
@@ -180,6 +198,7 @@ public Statistics<?> build() {
180198 private final PrimitiveComparator <T > comparator ;
181199 private boolean hasNonNullValue ;
182200 private long num_nulls ;
201+ private long nan_count = -1 ;
183202 final PrimitiveStringifier stringifier ;
184203
185204 Statistics (PrimitiveType type ) {
@@ -349,7 +368,8 @@ public boolean equals(Object other) {
349368 return type .equals (stats .type )
350369 && Arrays .equals (stats .getMaxBytes (), this .getMaxBytes ())
351370 && Arrays .equals (stats .getMinBytes (), this .getMinBytes ())
352- && stats .getNumNulls () == this .getNumNulls ();
371+ && stats .getNumNulls () == this .getNumNulls ()
372+ && stats .getNanCount () == this .getNanCount ();
353373 }
354374
355375 /**
@@ -382,6 +402,11 @@ public void mergeStatistics(Statistics stats) {
382402 mergeStatisticsMinMax (stats );
383403 markAsNotEmpty ();
384404 }
405+ if (isNanCountSet () && stats .isNanCountSet ()) {
406+ incrementNanCount (stats .getNanCount ());
407+ } else {
408+ unsetNanCount ();
409+ }
385410 } else {
386411 throw StatisticsClassException .create (this , stats );
387412 }
@@ -533,6 +558,53 @@ public void incrementNumNulls(long increment) {
533558 num_nulls += increment ;
534559 }
535560
561+ /**
562+ * Increments the NaN count by one. If nan_count was not set (-1), initializes it to 1.
563+ */
564+ public void incrementNanCount () {
565+ if (nan_count < 0 ) {
566+ nan_count = 1 ;
567+ } else {
568+ nan_count ++;
569+ }
570+ }
571+
572+ /**
573+ * Increments the NaN count by the parameter value. If nan_count was not set (-1), initializes it to increment.
574+ *
575+ * @param increment value to increment the NaN count by
576+ */
577+ public void incrementNanCount (long increment ) {
578+ if (nan_count < 0 ) {
579+ nan_count = increment ;
580+ } else {
581+ nan_count += increment ;
582+ }
583+ }
584+
585+ /**
586+ * Returns the NaN count
587+ *
588+ * @return NaN count or {@code -1} if the NaN count is not set
589+ */
590+ public long getNanCount () {
591+ return nan_count ;
592+ }
593+
594+ /**
595+ * @return whether nanCount is set and can be used
596+ */
597+ public boolean isNanCountSet () {
598+ return nan_count >= 0 ;
599+ }
600+
601+ /**
602+ * Unsets the NaN count to -1.
603+ */
604+ public void unsetNanCount () {
605+ nan_count = -1 ;
606+ }
607+
536608 /**
537609 * Returns the null count
538610 *
0 commit comments