Skip to content

Commit

Permalink
Support underscore separators in numbers for Clickhouse. Fixes #1659 (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
graup authored Jan 28, 2025
1 parent 8de3a62 commit 269967a
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 3 deletions.
4 changes: 4 additions & 0 deletions src/dialect/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ impl Dialect for ClickHouseDialect {
true
}

// ClickHouse accepts `_` as a digit separator in numeric literals, e.g. `10_000`.
fn supports_numeric_literal_underscores(&self) -> bool {
true
}

// ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting
// with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected.
//
Expand Down
5 changes: 5 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,11 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect supports numbers containing underscores, e.g. `10_000_000`.
///
/// When enabled, the tokenizer keeps an underscore separator inside the number token,
/// so `10_000` is lexed as a single number. A leading underscore (`_10_000`) still
/// produces a word, and consecutive underscores (`10___0`) end the number at the
/// first underscore.
///
/// The default implementation returns false.
fn supports_numeric_literal_underscores(&self) -> bool {
false
}

/// Returns true if the dialects supports specifying null treatment
/// as part of a window function's parameter list as opposed
/// to after the parameter list.
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/postgresql.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ impl Dialect for PostgreSqlDialect {
fn supports_string_escape_constant(&self) -> bool {
true
}

// Enables `_` as a digit separator in numeric literals (e.g. `10_000`) for this dialect.
fn supports_numeric_literal_underscores(&self) -> bool {
true
}
}

pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
Expand Down
74 changes: 71 additions & 3 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1136,12 +1136,24 @@ impl<'a> Tokenizer<'a> {
}
// numbers and period
'0'..='9' | '.' => {
let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
// Some dialects support an underscore as a digit separator inside numbers.
// A separator is only consumed when the character right after it is an ASCII hex digit,
// so two underscores in a row are never part of the same number.
let is_number_separator = |ch: char, next_char: Option<char>| {
self.dialect.supports_numeric_literal_underscores()
&& ch == '_'
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
};

let mut s = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// match binary literal that starts with 0x
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
});
return Ok(Some(Token::HexStringLiteral(s2)));
}

Expand All @@ -1150,7 +1162,10 @@ impl<'a> Tokenizer<'a> {
s.push('.');
chars.next();
}
s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});

// No number -> Token::Period
if s == "." {
Expand Down Expand Up @@ -1946,6 +1961,24 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
s
}

/// Variant of `peeking_take_while` whose predicate also receives a one-character lookahead.
///
/// Characters are consumed from `chars` and appended to the result for as long as
/// `predicate(current, following)` returns true, where `following` is the character
/// right after `current` (`None` at end of input). The first rejected character is
/// left unconsumed.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut taken = String::new();
    while let Some(current) = chars.peek().copied() {
        // Look ahead on a clone so the real cursor does not move.
        let following = chars.peekable.clone().nth(1);
        if !predicate(current, following) {
            break;
        }
        chars.next(); // consume
        taken.push(current);
    }
    taken
}

/// Unescapes a single-quoted string read from `chars` by delegating to `Unescape`.
///
/// Returns the result of `Unescape::unescape` unchanged.
/// NOTE(review): presumably `None` means the input could not be unescaped; confirm
/// against the `Unescape` implementation.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
Unescape::new(chars).unescape()
}
Expand Down Expand Up @@ -2227,6 +2260,41 @@ mod tests {
compare(expected, tokens);
}

#[test]
fn tokenize_numeric_literal_underscore() {
    // Without underscore support the literal is cut at the first `_`:
    // the leading digits form a number and the rest is lexed as a word.
    let generic = GenericDialect {};
    let input = String::from("SELECT 10_000");
    let actual = Tokenizer::new(&generic, &input).tokenize().unwrap();
    let expected_without_support = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("10"), false),
        Token::make_word("_000", None),
    ];
    compare(expected_without_support, actual);

    // With underscore support a single underscore between digits stays in the number.
    let expected_with_support = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("10_000"), false),
        Token::Comma,
        Token::Whitespace(Whitespace::Space),
        // A leading underscore makes the whole thing a word (parsed as a column identifier).
        Token::make_word("_10_000", None),
        Token::Comma,
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("10_00"), false),
        // A trailing underscore is split off as a word (a syntax error in some dialects).
        Token::make_word("_", None),
        Token::Comma,
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("10"), false),
        // Repeated underscores end the number; the rest is a word
        // (a syntax error in some dialects).
        Token::make_word("___0", None),
    ];
    all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores())
        .tokenizes_to("SELECT 10_000, _10_000, 10_00_, 10___0", expected_with_support);
}

#[test]
fn tokenize_select_exponent() {
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
Expand Down
15 changes: 15 additions & 0 deletions tests/sqlparser_clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1649,6 +1649,21 @@ fn parse_table_sample() {
clickhouse().verified_stmt("SELECT * FROM tbl SAMPLE 1 / 10 OFFSET 1 / 2");
}

#[test]
fn parse_numbers_with_underscore() {
    let sql = "SELECT 10_000";
    // The expected round-tripped SQL depends on the `bigdecimal` feature: with it the
    // separator is dropped from the output, without it the literal is printed as written.
    let expected_sql = if cfg!(feature = "bigdecimal") {
        "SELECT 10000"
    } else {
        sql
    };

    let select = clickhouse().verified_only_select_with_canonical(sql, expected_sql);

    let expected_projection = vec![SelectItem::UnnamedExpr(Expr::Value(number("10_000")))];
    assert_eq!(select.projection, expected_projection);
}

/// Builds a `TestedDialects` containing only `ClickHouseDialect`, used by the tests
/// in this file to run statements against that single dialect.
fn clickhouse() -> TestedDialects {
TestedDialects::new(vec![Box::new(ClickHouseDialect {})])
}
Expand Down

0 comments on commit 269967a

Please sign in to comment.