Skip to content

Commit

Permalink
Support underscore separators in numbers for Clickhouse. Fixes apache…
Browse files Browse the repository at this point in the history
  • Loading branch information
graup committed Jan 22, 2025
1 parent c7c0de6 commit 09872b4
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 1 deletion.
4 changes: 4 additions & 0 deletions src/dialect/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ impl Dialect for ClickHouseDialect {
true
}

/// Enables underscore separators in numeric literals (e.g. `10_000`)
/// for the ClickHouse dialect.
fn supports_underscore_separator(&self) -> bool {
true
}

// ClickHouse uses this for some FORMAT expressions in `INSERT` context, e.g. when inserting
// with FORMAT JSONEachRow a raw JSON key-value expression is valid and expected.
//
Expand Down
5 changes: 5 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,11 @@ pub trait Dialect: Debug + Any {
false
}

/// Returns true if the dialect supports numbers containing underscores,
/// e.g. `10_000`.
///
/// When enabled, the tokenizer drops the underscores while reading a
/// number, so `10_000` is tokenized as the number `10000`.
/// The default implementation returns `false`.
fn supports_underscore_separator(&self) -> bool {
false
}

/// Returns true if the dialect supports specifying null treatment
/// as part of a window function's parameter list as opposed
/// to after the parameter list.
Expand Down
27 changes: 26 additions & 1 deletion src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1147,7 +1147,16 @@ impl<'a> Tokenizer<'a> {
s.push('.');
chars.next();
}
s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

// Append the next run of ASCII digits to the number being accumulated in `s`.
let is_ascii_digit = |ch: char| ch.is_ascii_digit();
s += &peeking_take_while(chars, is_ascii_digit);
// In some dialects, numbers can include underscores that should be ignored
if self.dialect.supports_underscore_separator() {
// Each `_` is dropped and the digit run after it is appended, so
// `10_000` accumulates as `10000`.
// NOTE(review): the `_` is consumed even when no digit follows it, so a
// trailing or repeated underscore is silently discarded as well — confirm
// this is the intended behavior.
while let Some('_') = chars.peek() {
chars.next();
s += &peeking_take_while(chars, is_ascii_digit);
}
}

// No number -> Token::Period
if s == "." {
Expand Down Expand Up @@ -2223,6 +2232,22 @@ mod tests {
compare(expected, tokens);
}

#[test]
fn tokenize_clickhouse_underscore_separator() {
    // With the ClickHouse dialect the tokenizer drops `_` separators inside
    // a numeric literal, so `10_000` must come back as the number `10000`.
    let dialect = ClickHouseDialect {};
    let sql = "SELECT 10_000".to_string();

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number("10000".to_string(), false),
    ];
    let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

    compare(expected, tokens);
}

#[test]
fn tokenize_select_exponent() {
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
Expand Down
13 changes: 13 additions & 0 deletions tests/sqlparser_clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1646,6 +1646,19 @@ fn parse_table_sample() {
clickhouse().verified_stmt("SELECT * FROM tbl SAMPLE 1 / 10 OFFSET 1 / 2");
}

#[test]
fn parse_numbers_with_underscore() {
    let mut parsed = clickhouse()
        .parse_sql_statements("SELECT 10_000")
        .unwrap();
    let statement = parsed.pop().unwrap();

    // The tokenizer discards underscores inside numbers, so the statement
    // does not round-trip to the original SQL text; it renders without
    // the separator instead.
    assert_eq!(statement.to_string(), "SELECT 10000");
}

/// Builds the `TestedDialects` helper used by the tests in this file,
/// containing only the ClickHouse dialect.
fn clickhouse() -> TestedDialects {
TestedDialects::new(vec![Box::new(ClickHouseDialect {})])
}
Expand Down

0 comments on commit 09872b4

Please sign in to comment.