Skip to content

Commit a2170d4

Browse files
author
stanyao
committed
[SPARK-55978][SQL] Add TABLESAMPLE SYSTEM block sampling with DSv2 pushdown
This PR adds support for ANSI SQL `TABLESAMPLE SYSTEM` (block-level sampling) alongside the existing `TABLESAMPLE BERNOULLI` (row-level sampling). **SQL grammar**: Extended `TABLESAMPLE` to accept an optional `SYSTEM` or `BERNOULLI` qualifier before the sample method. Added both as non-reserved keywords. `TABLESAMPLE SYSTEM` only supports `PERCENT` sampling and does not support `REPEATABLE`. **Logical plan**: Introduced a `SampleMethod` sealed trait (`Bernoulli`/`System`) and added it to the `Sample` node. The default is `Bernoulli` for backward compatibility. **DSv2 pushdown**: Added a `SampleMethod` Java enum and extended `SupportsPushDownTableSample.pushTableSample()` with a new overload. Sources that don't override the new method reject SYSTEM sampling by default. SYSTEM pushdown is restricted to direct table scans via `PhysicalOperation`. **Physical planning**: SYSTEM samples that aren't pushed down to a DSv2 source raise an `AnalysisException` — there is no row-level fallback, since block sampling is data-source-dependent.
1 parent 6bba551 commit a2170d4

24 files changed

Lines changed: 336 additions & 24 deletions

File tree

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8097,6 +8097,11 @@
80978097
"message" : [
80988098
"The target table is <table>."
80998099
]
8100+
},
8101+
"TABLESAMPLE_SYSTEM" : {
8102+
"message" : [
8103+
"TABLESAMPLE SYSTEM is only supported by data sources that implement block-level sampling."
8104+
]
81008105
}
81018106
},
81028107
"sqlState" : "42902"

docs/sql-ref-ansi-compliance.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ Below is a list of all the keywords in Spark SQL.
429429
|ATOMIC|non-reserved|non-reserved|non-reserved|
430430
|AUTHORIZATION|reserved|non-reserved|reserved|
431431
|BEGIN|non-reserved|non-reserved|non-reserved|
432+
|BERNOULLI|non-reserved|non-reserved|non-reserved|
432433
|BETWEEN|non-reserved|non-reserved|reserved|
433434
|BIGINT|non-reserved|non-reserved|reserved|
434435
|BINARY|non-reserved|non-reserved|reserved|
@@ -754,6 +755,7 @@ Below is a list of all the keywords in Spark SQL.
754755
|SUBSTR|non-reserved|non-reserved|non-reserved|
755756
|SUBSTRING|non-reserved|non-reserved|non-reserved|
756757
|SYNC|non-reserved|non-reserved|non-reserved|
758+
|SYSTEM|non-reserved|non-reserved|reserved|
757759
|SYSTEM_TIME|non-reserved|non-reserved|non-reserved|
758760
|SYSTEM_VERSION|non-reserved|non-reserved|non-reserved|
759761
|TABLE|reserved|non-reserved|reserved|

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ AT: 'AT';
148148
ATOMIC: 'ATOMIC';
149149
AUTHORIZATION: 'AUTHORIZATION';
150150
BEGIN: 'BEGIN';
151+
BERNOULLI: 'BERNOULLI';
151152
BETWEEN: 'BETWEEN';
152153
BIGINT: 'BIGINT';
153154
BINARY: 'BINARY';
@@ -472,6 +473,7 @@ STRUCT: 'STRUCT' {incComplexTypeLevelCounter();};
472473
SUBSTR: 'SUBSTR';
473474
SUBSTRING: 'SUBSTRING';
474475
SYNC: 'SYNC';
476+
SYSTEM: 'SYSTEM';
475477
SYSTEM_TIME: 'SYSTEM_TIME';
476478
SYSTEM_VERSION: 'SYSTEM_VERSION';
477479
TABLE: 'TABLE';

sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,9 @@ joinCriteria
10531053
;
10541054

10551055
sample
1056-
: TABLESAMPLE LEFT_PAREN sampleMethod? RIGHT_PAREN (REPEATABLE LEFT_PAREN seed=integerValue RIGHT_PAREN)?
1056+
: TABLESAMPLE (sampleType=(SYSTEM | BERNOULLI))?
1057+
LEFT_PAREN sampleMethod? RIGHT_PAREN
1058+
(REPEATABLE LEFT_PAREN seed=integerValue RIGHT_PAREN)?
10571059
;
10581060

10591061
sampleMethod
@@ -1921,6 +1923,7 @@ ansiNonReserved
19211923
| AT
19221924
| ATOMIC
19231925
| BEGIN
1926+
| BERNOULLI
19241927
| BETWEEN
19251928
| BIGINT
19261929
| BINARY
@@ -2187,6 +2190,7 @@ ansiNonReserved
21872190
| SUBSTR
21882191
| SUBSTRING
21892192
| SYNC
2193+
| SYSTEM
21902194
| SYSTEM_TIME
21912195
| SYSTEM_VERSION
21922196
| TABLES
@@ -2291,6 +2295,7 @@ nonReserved
22912295
| ATOMIC
22922296
| AUTHORIZATION
22932297
| BEGIN
2298+
| BERNOULLI
22942299
| BETWEEN
22952300
| BIGINT
22962301
| BINARY
@@ -2604,6 +2609,7 @@ nonReserved
26042609
| SUBSTR
26052610
| SUBSTRING
26062611
| SYNC
2612+
| SYSTEM
26072613
| SYSTEM_TIME
26082614
| SYSTEM_VERSION
26092615
| TABLE
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.connector.read;
19+
20+
import org.apache.spark.annotation.Evolving;
21+
22+
/**
23+
* The sampling method for TABLESAMPLE.
24+
*
25+
* @since 4.1.0
26+
*/
27+
@Evolving
28+
public enum SampleMethod {
29+
/** Row-level sampling (BERNOULLI). Each row is independently selected. */
30+
BERNOULLI,
31+
/** Block-level sampling (SYSTEM). Entire partitions/splits are included or skipped. */
32+
SYSTEM
33+
}

sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsPushDownTableSample.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,28 @@
2929
public interface SupportsPushDownTableSample extends ScanBuilder {
3030

3131
/**
32-
* Pushes down SAMPLE to the data source.
32+
* Pushes down BERNOULLI (row-level) SAMPLE to the data source.
3333
*/
3434
boolean pushTableSample(
3535
double lowerBound,
3636
double upperBound,
3737
boolean withReplacement,
3838
long seed);
39+
40+
/**
41+
* Pushes down SAMPLE to the data source with the specified sampling method.
42+
*/
43+
default boolean pushTableSample(
44+
double lowerBound,
45+
double upperBound,
46+
boolean withReplacement,
47+
long seed,
48+
SampleMethod sampleMethod) {
49+
if (sampleMethod == SampleMethod.SYSTEM) {
50+
// If the data source hasn't overridden this method, it must have not added support
51+
// for SYSTEM sampling. Don't apply sample pushdown.
52+
return false;
53+
}
54+
return pushTableSample(lowerBound, upperBound, withReplacement, seed);
55+
}
3956
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,7 @@ object UnsupportedOperationChecker extends Logging {
530530
throwError("Sorting is not supported on streaming DataFrames/Datasets, unless it is on " +
531531
"aggregated DataFrame/Dataset in Complete output mode")
532532

533-
case Sample(_, _, _, _, child) if child.isStreaming =>
533+
case Sample(_, _, _, _, child, _) if child.isStreaming =>
534534
throwError("Sampling is not supported on streaming DataFrames/Datasets")
535535

536536
case Window(windowExpression, _, _, child, _) if child.isStreaming =>

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1292,7 +1292,7 @@ object CollapseProject extends Rule[LogicalPlan] with AliasHelper {
12921292
limit.copy(child = p2.copy(projectList = newProjectList))
12931293
case Project(l1, r @ Repartition(_, _, p @ Project(l2, _))) if isRenaming(l1, l2) =>
12941294
r.copy(child = p.copy(projectList = buildCleanedProjectList(l1, p.projectList)))
1295-
case Project(l1, s @ Sample(_, _, _, _, p2 @ Project(l2, _))) if isRenaming(l1, l2) =>
1295+
case Project(l1, s @ Sample(_, _, _, _, p2 @ Project(l2, _), _)) if isRenaming(l1, l2) =>
12961296
s.copy(child = p2.copy(projectList = buildCleanedProjectList(l1, p2.projectList)))
12971297
case o => o
12981298
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2381,10 +2381,14 @@ class AstBuilder extends DataTypeAstBuilder
23812381
* - TABLESAMPLE(x ROWS): Sample the table down to the given number of rows.
23822382
* - TABLESAMPLE(x PERCENT) [REPEATABLE (y)]: Sample the table down to the given percentage with
23832383
* seed 'y'. Note that percentages are defined as a number between 0 and 100.
2384+
* - TABLESAMPLE SYSTEM(x PERCENT): Sample by data source dependent blocks or file splits.
23842385
* - TABLESAMPLE(BUCKET x OUT OF y) [REPEATABLE (z)]: Sample the table down to a 'x' divided by
23852386
* 'y' fraction with seed 'z'.
23862387
*/
23872388
private def withSample(ctx: SampleContext, query: LogicalPlan): LogicalPlan = withOrigin(ctx) {
2389+
val isSystem = ctx.sampleType != null &&
2390+
ctx.sampleType.getType == SqlBaseParser.SYSTEM
2391+
23882392
// Create a sampled plan if we need one.
23892393
def sample(fraction: Double, seed: Option[Long]): Sample = {
23902394
// The range of fraction accepted by Sample is [0, 1]. Because Hive's block sampling
@@ -2394,17 +2398,25 @@ class AstBuilder extends DataTypeAstBuilder
23942398
validate(fraction >= 0.0 - eps && fraction <= 1.0 + eps,
23952399
s"Sampling fraction ($fraction) must be on interval [0, 1]",
23962400
ctx)
2397-
Sample(0.0, fraction, withReplacement = false, seed, query)
2401+
val method = if (isSystem) SampleMethod.System else SampleMethod.Bernoulli
2402+
Sample(0.0, fraction, withReplacement = false, seed, query, method)
23982403
}
23992404

24002405
if (ctx.sampleMethod() == null) {
24012406
throw QueryParsingErrors.emptyInputForTableSampleError(ctx)
24022407
}
24032408

2409+
if (isSystem && ctx.seed != null) {
2410+
operationNotAllowed("TABLESAMPLE SYSTEM does not support REPEATABLE", ctx)
2411+
}
2412+
24042413
val seed: Option[Long] = Option(ctx.seed).map(_.getText.toLong)
24052414

24062415
ctx.sampleMethod() match {
24072416
case ctx: SampleByRowsContext =>
2417+
if (isSystem) {
2418+
operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx)
2419+
}
24082420
Limit(expression(ctx.expression), query)
24092421

24102422
case ctx: SampleByPercentileContext =>
@@ -2416,6 +2428,9 @@ class AstBuilder extends DataTypeAstBuilder
24162428
sample(sign * fraction / 100.0d, seed)
24172429

24182430
case ctx: SampleByBytesContext =>
2431+
if (isSystem) {
2432+
operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx)
2433+
}
24192434
val bytesStr = ctx.bytes.getText
24202435
if (bytesStr.matches("[0-9]+[bBkKmMgG]")) {
24212436
throw QueryParsingErrors.tableSampleByBytesUnsupportedError("byteLengthLiteral", ctx)
@@ -2424,6 +2439,9 @@ class AstBuilder extends DataTypeAstBuilder
24242439
}
24252440

24262441
case ctx: SampleByBucketContext if ctx.ON() != null =>
2442+
if (isSystem) {
2443+
operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx)
2444+
}
24272445
if (ctx.identifier != null) {
24282446
throw QueryParsingErrors.tableSampleByBytesUnsupportedError(
24292447
"BUCKET x OUT OF y ON colname", ctx)
@@ -2433,6 +2451,9 @@ class AstBuilder extends DataTypeAstBuilder
24332451
}
24342452

24352453
case ctx: SampleByBucketContext =>
2454+
if (isSystem) {
2455+
operationNotAllowed("TABLESAMPLE SYSTEM only supports PERCENT sampling", ctx)
2456+
}
24362457
sample(ctx.numerator.getText.toDouble / ctx.denominator.getText.toDouble, seed)
24372458
}
24382459
}

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1912,6 +1912,14 @@ object SubqueryAlias {
19121912
}
19131913
}
19141914

1915+
/** The sampling strategy carried by a `Sample` logical plan node. */
sealed trait SampleMethod extends Serializable

object SampleMethod {

  /**
   * BERNOULLI: each row is kept or dropped independently. All data is still read,
   * so it yields no I/O savings.
   */
  case object Bernoulli extends SampleMethod

  /**
   * SYSTEM: entire partitions or file splits are kept or dropped, letting the data
   * source skip reading excluded blocks entirely.
   */
  case object System extends SampleMethod
}
1922+
19151923
object Sample {
19161924
/**
19171925
* Convenience constructor that wraps a concrete seed in [[Some]].
@@ -1926,6 +1934,16 @@ object Sample {
19261934
child: LogicalPlan): Sample = {
19271935
new Sample(lowerBound, upperBound, withReplacement, Some(seed), child)
19281936
}
1937+
1938+
def apply(
1939+
lowerBound: Double,
1940+
upperBound: Double,
1941+
withReplacement: Boolean,
1942+
seed: Long,
1943+
child: LogicalPlan,
1944+
sampleMethod: SampleMethod): Sample = {
1945+
new Sample(lowerBound, upperBound, withReplacement, Some(seed), child, sampleMethod)
1946+
}
19291947
}
19301948

19311949
/**
@@ -1939,13 +1957,15 @@ object Sample {
19391957
* (SQL `REPEATABLE` clause or programmatic API), `None` when no seed was
19401958
* specified and a random seed should be generated at execution time.
19411959
* @param child the LogicalPlan
1960+
* @param sampleMethod the sampling method (Bernoulli or System)
19421961
*/
19431962
case class Sample(
19441963
lowerBound: Double,
19451964
upperBound: Double,
19461965
withReplacement: Boolean,
19471966
seed: Option[Long],
1948-
child: LogicalPlan) extends UnaryNode {
1967+
child: LogicalPlan,
1968+
sampleMethod: SampleMethod = SampleMethod.Bernoulli) extends UnaryNode {
19491969

19501970
val eps = RandomSampler.roundingEpsilon
19511971
val fraction = upperBound - lowerBound

0 commit comments

Comments
 (0)