apache · okumin · Mar 27, 2026 · Jan 28, 2025 · Jan 28, 2025 · Apr 10, 2025
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
@@ -75,7 +75,7 @@ public StringSubstrColStart() {
    * @param len length of the bytes the string holds in the byte array
    * @param substrStart the Start index for the substring operation
    */
-  static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
+  public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
     int end = start + len;
 
     if (substrStart < 0) {

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
@@ -83,10 +83,10 @@ public StringSubstrColStartLen() {
    * @param start start offset of the byte array the string starts at
    * @param len length of the bytes the string holds in the byte array
    * @param substrStart the Start index for the substring operation
-   * @param substrLen the length of the substring
-   * @param offsetArray the array that indexes are populated to. Assume its length >= 2.
+   * @param substrLength the length of the substring
+   * @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}.
    */
-  static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
+  public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
       int substrLength, int[] offsetArray) {
     int curIdx = -1;
     offsetArray[0] = -1;

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
@@ -96,13 +96,12 @@
       return r;
     }
 
-    String s = t.toString();
-    int[] index = makeIndex(pos, len, s.length());
-    if (index == null) {
+    StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index);
+    if (index[0] == -1) {
       return r;
     }
 
-    r.set(s.substring(index[0], index[1]));
+    r.set(t.getBytes(), index[0], index[1]);
     return r;
   }
 
@@ -133,16 +132,44 @@
 
   private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);
 
-  // Even though we are using longs, substr can only deal with ints, so we use
+  //Even though we are using longs, substr can only deal with ints, so we use
   // the maximum int value as the maxValue
   private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);
 
+  private Text evaluateInternal(Text t, int pos) {
+    r.clear();
+
+    int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos));
+    if (offset == -1) {
+      return r;
+    }
+
+    r.set(t.getBytes(), offset, t.getLength() - offset);
+    return r;
+  }
+
   public Text evaluate(Text s, IntWritable pos) {
-    return evaluate(s, pos, maxValue);
+    if ((s == null) || (pos == null)) {
+      return null;
+    }
+
+    return evaluateInternal(s, pos.get());
   }
 
   public Text evaluate(Text s, LongWritable pos) {
-    return evaluate(s, pos, maxLongValue);
+    if ((s == null) || (pos == null)) {
+      return null;
+    }
+
+    long longPos = pos.get();
+    // If an unsupported value is seen, we don't want to return a string
+    // that doesn't match what the user expects, so we return NULL (still
+    // unexpected, of course, but probably better than a bad string).
+    if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
+      return null;
+    }
+
+    return evaluateInternal(s, (int) pos.get());
   }
 
   public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
@@ -198,6 +225,14 @@
     return new SubStrStatEstimator();
   }
 
+  private int adjustStartPos(int pos) {
+    if (pos <= 0) {
+      return pos;
+    }
+
+    return pos - 1;
+  }
+
   private static class SubStrStatEstimator implements StatEstimator {
 
     @Override
@@ -208,7 +243,6 @@
       // 99 rows with 0 length
       // orig avg is 10
       // new avg is 5 (if substr(5)) ; but in reality it will stay ~10
-      Optional<Double> start = getRangeWidth(csList.get(1).getRange());
       Range startRange = csList.get(1).getRange();
       if (startRange != null && startRange.minValue != null) {
         double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue();
@@ -219,23 +253,13 @@
       if (csList.size() > 2) {
         Range lengthRange = csList.get(2).getRange();
         if (lengthRange != null && lengthRange.maxValue != null) {
-          Double w = lengthRange.maxValue.doubleValue();
+          double w = lengthRange.maxValue.doubleValue();
           if (cs.getAvgColLen() > w) {
             cs.setAvgColLen(w);
           }
         }
       }
       return Optional.of(cs);
     }
-
-    private Optional<Double> getRangeWidth(Range range) {
-      if (range != null) {
-        if (range.minValue != null && range.maxValue != null) {
-          return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue());
-        }
-      }
-      return Optional.empty();
-    }
-
-  }
+ }
 }
diff --git a/ql/src/test/queries/clientpositive/udf_substr.q b/ql/src/test/queries/clientpositive/udf_substr.q
@@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
 SELECT
   substr('ABC', cast(2147483649 as bigint))
 FROM src tablesample (1 rows);
+
+--test 4-byte charactor
+set hive.vectorized.execution.enabled=false;
+SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows);
diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out
@@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
 POSTHOOK: Input: default@src
 #### A masked pattern was here ####
 NULL
+PREHOOK: query: SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+あa🤎	🤎いiうu	🤎いiうu