Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public StringSubstrColStart() {
* @param len length of the bytes the string holds in the byte array
* @param substrStart the Start index for the substring operation
*/
static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
int end = start + len;

if (substrStart < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ public StringSubstrColStartLen() {
* @param start start offset of the byte array the string starts at
* @param len length of the bytes the string holds in the byte array
* @param substrStart the Start index for the substring operation
* @param substrLen the length of the substring
* @param offsetArray the array that indexes are populated to. Assume its length >= 2.
* @param substrLength the length of the substring
* @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}.
*/
static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
int substrLength, int[] offsetArray) {
int curIdx = -1;
offsetArray[0] = -1;
Expand Down
64 changes: 44 additions & 20 deletions ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,12 @@
return r;
}

String s = t.toString();
int[] index = makeIndex(pos, len, s.length());
if (index == null) {
StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index);
if (index[0] == -1) {
return r;
}

r.set(s.substring(index[0], index[1]));
r.set(t.getBytes(), index[0], index[1]);
return r;
}

Expand Down Expand Up @@ -133,16 +132,44 @@

private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);

// Even though we are using longs, substr can only deal with ints, so we use
//Even though we are using longs, substr can only deal with ints, so we use
Comment thread
okumin marked this conversation as resolved.
Outdated
// the maximum int value as the maxValue
private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);

private Text evaluateInternal(Text t, int pos) {
r.clear();

int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos));
if (offset == -1) {
return r;
}

r.set(t.getBytes(), offset, t.getLength() - offset);
return r;
}

public Text evaluate(Text s, IntWritable pos) {
return evaluate(s, pos, maxValue);
if ((s == null) || (pos == null)) {
return null;
}

return evaluateInternal(s, pos.get());
}

public Text evaluate(Text s, LongWritable pos) {
return evaluate(s, pos, maxLongValue);
if ((s == null) || (pos == null)) {
return null;
}

long longPos = pos.get();
// If an unsupported value is seen, we don't want to return a string
// that doesn't match what the user expects, so we return NULL (still
// unexpected, of course, but probably better than a bad string).
if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
return null;
}

return evaluateInternal(s, (int) pos.get());
}

public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
Expand Down Expand Up @@ -198,6 +225,14 @@
return new SubStrStatEstimator();
}

private int adjustStartPos(int pos) {
if (pos <= 0) {
return pos;
}

return pos - 1;
}

private static class SubStrStatEstimator implements StatEstimator {

@Override
Expand All @@ -208,7 +243,6 @@
// 99 rows with 0 length
// orig avg is 10
// new avg is 5 (if substr(5)) ; but in reality it will stay ~10
Optional<Double> start = getRangeWidth(csList.get(1).getRange());
Range startRange = csList.get(1).getRange();
if (startRange != null && startRange.minValue != null) {
double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue();
Expand All @@ -219,23 +253,13 @@
if (csList.size() > 2) {
Range lengthRange = csList.get(2).getRange();
if (lengthRange != null && lengthRange.maxValue != null) {
Double w = lengthRange.maxValue.doubleValue();
double w = lengthRange.maxValue.doubleValue();
if (cs.getAvgColLen() > w) {
cs.setAvgColLen(w);
}
}
}
return Optional.of(cs);
}

private Optional<Double> getRangeWidth(Range range) {
if (range != null) {
if (range.minValue != null && range.maxValue != null) {
return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue());
}
}
return Optional.empty();
}

}
}

Check warning on line 264 in ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

'class def rcurly' has incorrect indentation level 1, expected level should be 2.

See more on https://sonarcloud.io/project/issues?id=apache_hive&issues=AZz644QgvjDTsnvOKbfl&open=AZz644QgvjDTsnvOKbfl&pullRequest=6340
Comment thread
ryukobayashi marked this conversation as resolved.
Outdated
}
8 changes: 8 additions & 0 deletions ql/src/test/queries/clientpositive/udf_substr.q
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
SELECT
substr('ABC', cast(2147483649 as bigint))
FROM src tablesample (1 rows);

--test 4-byte charactor
set hive.vectorized.execution.enabled=false;
SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows);
17 changes: 17 additions & 0 deletions ql/src/test/results/clientpositive/llap/udf_substr.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
NULL
PREHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
あa🤎 🤎いiうu 🤎いiうu
Loading