Skip to content

Commit 7583975

Browse files
committed
Introduce Arroyo dialect
1 parent ed41654 commit 7583975

4 files changed

Lines changed: 237 additions & 19 deletions

File tree

src/dialect/arroyo.rs

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
// Licensed under the Apache License, Version 2.0 (the "License");
2+
// you may not use this file except in compliance with the License.
3+
// You may obtain a copy of the License at
4+
//
5+
// http://www.apache.org/licenses/LICENSE-2.0
6+
//
7+
// Unless required by applicable law or agreed to in writing, software
8+
// distributed under the License is distributed on an "AS IS" BASIS,
9+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10+
// See the License for the specific language governing permissions and
11+
// limitations under the License.
12+
use log::debug;
13+
14+
use crate::dialect::{Dialect, Precedence};
15+
use crate::keywords::Keyword;
16+
use crate::parser::{Parser, ParserError};
17+
use crate::tokenizer::Token;
18+
19+
/// A [`Dialect`] for [Arroyo](https://www.arroyo.dev/).
20+
/// This is based on the Postgres dialect
21+
///
22+
/// Currently the Arroyo dialect differs from Postgres in a few respects: it supports
23+
/// Hive/BigQuery/etc. struct syntax (`struct<a INT, b TEXT>`), struct literals, and PartiQL
24+
#[derive(Debug)]
25+
pub struct ArroyoDialect {}
26+
27+
const PERIOD_PREC: u8 = 200;
28+
const DOUBLE_COLON_PREC: u8 = 140;
29+
const BRACKET_PREC: u8 = 130;
30+
const COLLATE_PREC: u8 = 120;
31+
const AT_TZ_PREC: u8 = 110;
32+
const CARET_PREC: u8 = 100;
33+
const MUL_DIV_MOD_OP_PREC: u8 = 90;
34+
const PLUS_MINUS_PREC: u8 = 80;
35+
// there's no XOR operator in PostgreSQL, but support it here to avoid breaking tests
36+
const XOR_PREC: u8 = 75;
37+
const PG_OTHER_PREC: u8 = 70;
38+
const BETWEEN_LIKE_PREC: u8 = 60;
39+
const EQ_PREC: u8 = 50;
40+
const IS_PREC: u8 = 40;
41+
const NOT_PREC: u8 = 30;
42+
const AND_PREC: u8 = 20;
43+
const OR_PREC: u8 = 10;
44+
45+
impl Dialect for ArroyoDialect {
46+
fn identifier_quote_style(&self, _identifier: &str) -> Option<char> {
47+
Some('"')
48+
}
49+
50+
fn is_delimited_identifier_start(&self, ch: char) -> bool {
51+
ch == '"' // Postgres does not support backticks to quote identifiers
52+
}
53+
54+
fn is_identifier_start(&self, ch: char) -> bool {
55+
// See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
56+
// Any Unicode alphabetic character (this includes precomposed letters with
57+
// diacritical marks) or an underscore may start an identifier
58+
ch.is_alphabetic() || ch == '_'
59+
}
60+
61+
fn is_identifier_part(&self, ch: char) -> bool {
62+
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
63+
}
64+
65+
fn supports_unicode_string_literal(&self) -> bool {
66+
true
67+
}
68+
69+
/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
70+
fn is_custom_operator_part(&self, ch: char) -> bool {
71+
matches!(
72+
ch,
73+
'+' | '-'
74+
| '*'
75+
| '/'
76+
| '<'
77+
| '>'
78+
| '='
79+
| '~'
80+
| '!'
81+
| '@'
82+
| '#'
83+
| '%'
84+
| '^'
85+
| '&'
86+
| '|'
87+
| '`'
88+
| '?'
89+
)
90+
}
91+
92+
fn get_next_precedence(&self, parser: &Parser) -> Option<Result<u8, ParserError>> {
93+
let token = parser.peek_token();
94+
debug!("get_next_precedence() {:?}", token);
95+
96+
// we only return some custom value here when the behaviour (not merely the numeric value) differs
97+
// from the default implementation
98+
match token.token {
99+
Token::Word(w) if w.keyword == Keyword::COLLATE => Some(Ok(COLLATE_PREC)),
100+
Token::LBracket => Some(Ok(BRACKET_PREC)),
101+
Token::Arrow
102+
| Token::LongArrow
103+
| Token::HashArrow
104+
| Token::HashLongArrow
105+
| Token::AtArrow
106+
| Token::ArrowAt
107+
| Token::HashMinus
108+
| Token::AtQuestion
109+
| Token::AtAt
110+
| Token::Question
111+
| Token::QuestionAnd
112+
| Token::QuestionPipe
113+
| Token::ExclamationMark
114+
| Token::Overlap
115+
| Token::CaretAt
116+
| Token::StringConcat
117+
| Token::Sharp
118+
| Token::ShiftRight
119+
| Token::ShiftLeft
120+
| Token::CustomBinaryOperator(_) => Some(Ok(PG_OTHER_PREC)),
121+
_ => None,
122+
}
123+
}
124+
125+
126+
fn supports_filter_during_aggregation(&self) -> bool {
127+
true
128+
}
129+
130+
fn supports_group_by_expr(&self) -> bool {
131+
true
132+
}
133+
134+
fn prec_value(&self, prec: Precedence) -> u8 {
135+
match prec {
136+
Precedence::Period => PERIOD_PREC,
137+
Precedence::DoubleColon => DOUBLE_COLON_PREC,
138+
Precedence::AtTz => AT_TZ_PREC,
139+
Precedence::MulDivModOp => MUL_DIV_MOD_OP_PREC,
140+
Precedence::PlusMinus => PLUS_MINUS_PREC,
141+
Precedence::Xor => XOR_PREC,
142+
Precedence::Ampersand => PG_OTHER_PREC,
143+
Precedence::Caret => CARET_PREC,
144+
Precedence::Pipe => PG_OTHER_PREC,
145+
Precedence::Between => BETWEEN_LIKE_PREC,
146+
Precedence::Eq => EQ_PREC,
147+
Precedence::Like => BETWEEN_LIKE_PREC,
148+
Precedence::Is => IS_PREC,
149+
Precedence::PgOther => PG_OTHER_PREC,
150+
Precedence::UnaryNot => NOT_PREC,
151+
Precedence::And => AND_PREC,
152+
Precedence::Or => OR_PREC,
153+
}
154+
}
155+
156+
fn allow_extract_custom(&self) -> bool {
157+
true
158+
}
159+
160+
fn allow_extract_single_quotes(&self) -> bool {
161+
true
162+
}
163+
164+
/// see <https://www.postgresql.org/docs/13/functions-math.html>
165+
fn supports_factorial_operator(&self) -> bool {
166+
true
167+
}
168+
169+
/// see <https://www.postgresql.org/docs/current/sql-comment.html>
170+
fn supports_comment_on(&self) -> bool {
171+
true
172+
}
173+
174+
175+
/// Return true if the dialect supports empty projections in SELECT statements
176+
///
177+
/// Example
178+
/// ```sql
179+
/// SELECT from table_name
180+
/// ```
181+
fn supports_empty_projections(&self) -> bool {
182+
true
183+
}
184+
185+
fn supports_nested_comments(&self) -> bool {
186+
true
187+
}
188+
189+
fn supports_string_escape_constant(&self) -> bool {
190+
true
191+
}
192+
193+
fn supports_numeric_literal_underscores(&self) -> bool {
194+
true
195+
}
196+
197+
/// See: <https://www.postgresql.org/docs/current/arrays.html#ARRAYS-DECLARATION>
198+
fn supports_array_typedef_with_brackets(&self) -> bool {
199+
true
200+
}
201+
202+
fn supports_geometric_types(&self) -> bool {
203+
true
204+
}
205+
206+
// arroyo-specific features
207+
fn supports_partiql(&self) -> bool {
208+
true
209+
}
210+
211+
fn supports_struct_literal(&self) -> bool {
212+
true
213+
}
214+
}

src/dialect/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717

1818
mod ansi;
19+
mod arroyo;
1920
mod bigquery;
2021
mod clickhouse;
2122
mod databricks;
@@ -37,6 +38,7 @@ use core::str::Chars;
3738
use log::debug;
3839

3940
pub use self::ansi::AnsiDialect;
41+
pub use self::arroyo::ArroyoDialect;
4042
pub use self::bigquery::BigQueryDialect;
4143
pub use self::clickhouse::ClickHouseDialect;
4244
pub use self::databricks::DatabricksDialect;

src/parser/mod.rs

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1393,7 +1393,7 @@ impl<'a> Parser<'a> {
13931393
| tok @ Token::PGCubeRoot
13941394
| tok @ Token::AtSign
13951395
| tok @ Token::Tilde
1396-
if dialect_is!(dialect is PostgreSqlDialect) =>
1396+
if dialect_is!(dialect is PostgreSqlDialect | ArroyoDialect) =>
13971397
{
13981398
let op = match tok {
13991399
Token::DoubleExclamationMark => UnaryOperator::PGPrefixFactorial,
@@ -1437,7 +1437,7 @@ impl<'a> Parser<'a> {
14371437
),
14381438
})
14391439
}
1440-
Token::EscapedStringLiteral(_) if dialect_is!(dialect is PostgreSqlDialect | GenericDialect) =>
1440+
Token::EscapedStringLiteral(_) if dialect_is!(dialect is PostgreSqlDialect | GenericDialect | ArroyoDialect) =>
14411441
{
14421442
self.prev_token();
14431443
Ok(Expr::Value(self.parse_value()?))
@@ -3103,7 +3103,7 @@ impl<'a> Parser<'a> {
31033103
Token::Caret => {
31043104
// In PostgreSQL, ^ stands for the exponentiation operation,
31053105
// and # stands for XOR. See https://www.postgresql.org/docs/current/functions-math.html
3106-
if dialect_is!(dialect is PostgreSqlDialect) {
3106+
if dialect_is!(dialect is PostgreSqlDialect | ArroyoDialect) {
31073107
Some(BinaryOperator::PGExp)
31083108
} else {
31093109
Some(BinaryOperator::BitwiseXor)
@@ -3114,22 +3114,22 @@ impl<'a> Parser<'a> {
31143114
Token::DuckIntDiv if dialect_is!(dialect is DuckDbDialect | GenericDialect) => {
31153115
Some(BinaryOperator::DuckIntegerDivide)
31163116
}
3117-
Token::ShiftLeft if dialect_is!(dialect is PostgreSqlDialect | DuckDbDialect | GenericDialect | RedshiftSqlDialect) => {
3117+
Token::ShiftLeft if dialect_is!(dialect is PostgreSqlDialect | DuckDbDialect | ArroyoDialect | GenericDialect | RedshiftSqlDialect) => {
31183118
Some(BinaryOperator::PGBitwiseShiftLeft)
31193119
}
3120-
Token::ShiftRight if dialect_is!(dialect is PostgreSqlDialect | DuckDbDialect | GenericDialect | RedshiftSqlDialect) => {
3120+
Token::ShiftRight if dialect_is!(dialect is PostgreSqlDialect | DuckDbDialect | ArroyoDialect | GenericDialect | RedshiftSqlDialect) => {
31213121
Some(BinaryOperator::PGBitwiseShiftRight)
31223122
}
3123-
Token::Sharp if dialect_is!(dialect is PostgreSqlDialect | RedshiftSqlDialect) => {
3123+
Token::Sharp if dialect_is!(dialect is PostgreSqlDialect | RedshiftSqlDialect | ArroyoDialect) => {
31243124
Some(BinaryOperator::PGBitwiseXor)
31253125
}
31263126
Token::Overlap if dialect_is!(dialect is PostgreSqlDialect | RedshiftSqlDialect) => {
31273127
Some(BinaryOperator::PGOverlap)
31283128
}
3129-
Token::Overlap if dialect_is!(dialect is PostgreSqlDialect | GenericDialect) => {
3129+
Token::Overlap if dialect_is!(dialect is PostgreSqlDialect | ArroyoDialect | GenericDialect) => {
31303130
Some(BinaryOperator::PGOverlap)
31313131
}
3132-
Token::CaretAt if dialect_is!(dialect is PostgreSqlDialect | GenericDialect) => {
3132+
Token::CaretAt if dialect_is!(dialect is PostgreSqlDialect | ArroyoDialect | GenericDialect) => {
31333133
Some(BinaryOperator::PGStartsWith)
31343134
}
31353135
Token::Tilde => Some(BinaryOperator::PGRegexMatch),
@@ -7282,7 +7282,7 @@ impl<'a> Parser<'a> {
72827282
GeneratedAs::ExpStored,
72837283
Some(GeneratedExpressionMode::Stored),
72847284
))
7285-
} else if dialect_of!(self is PostgreSqlDialect) {
7285+
} else if dialect_of!(self is PostgreSqlDialect | ArroyoDialect) {
72867286
// Postgres' AS IDENTITY branches are above, this one needs STORED
72877287
self.expected("STORED", self.peek_token())
72887288
} else if self.parse_keywords(&[Keyword::VIRTUAL]) {
@@ -8793,7 +8793,7 @@ impl<'a> Parser<'a> {
87938793
}) => Ok(value),
87948794
Token::SingleQuotedString(s) => Ok(s),
87958795
Token::DoubleQuotedString(s) => Ok(s),
8796-
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
8796+
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | ArroyoDialect | GenericDialect) => {
87978797
Ok(s)
87988798
}
87998799
Token::UnicodeStringLiteral(s) => Ok(s),
@@ -9115,7 +9115,8 @@ impl<'a> Parser<'a> {
91159115
let field_defs = self.parse_duckdb_struct_type_def()?;
91169116
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
91179117
}
9118-
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
9118+
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | ArroyoDialect | GenericDialect) =>
9119+
{
91199120
self.prev_token();
91209121
let (field_defs, _trailing_bracket) =
91219122
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -11664,7 +11665,7 @@ impl<'a> Parser<'a> {
1166411665
}),
1166511666
alias,
1166611667
})
11667-
} else if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | GenericDialect)
11668+
} else if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | ArroyoDialect | GenericDialect)
1166811669
&& self.parse_keyword(Keyword::UNNEST)
1166911670
{
1167011671
self.expect_token(&Token::LParen)?;
@@ -12919,12 +12920,13 @@ impl<'a> Parser<'a> {
1291912920
let table = self.parse_keyword(Keyword::TABLE);
1292012921
let table_object = self.parse_table_object()?;
1292112922

12922-
let table_alias =
12923-
if dialect_of!(self is PostgreSqlDialect) && self.parse_keyword(Keyword::AS) {
12924-
Some(self.parse_identifier()?)
12925-
} else {
12926-
None
12927-
};
12923+
let table_alias = if dialect_of!(self is PostgreSqlDialect | ArroyoDialect)
12924+
&& self.parse_keyword(Keyword::AS)
12925+
{
12926+
Some(self.parse_identifier()?)
12927+
} else {
12928+
None
12929+
};
1292812930

1292912931
let is_mysql = dialect_of!(self is MySqlDialect);
1293012932

src/tokenizer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use serde::{Deserialize, Serialize};
4040
#[cfg(feature = "visitor")]
4141
use sqlparser_derive::{Visit, VisitMut};
4242

43-
use crate::dialect::Dialect;
43+
use crate::dialect::{Dialect};
4444
use crate::dialect::{
4545
BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
4646
SnowflakeDialect,

0 commit comments

Comments
 (0)