From 00f2308d1ea81b3dc831cd0e2a8d3cde680e5f33 Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Tue, 15 Oct 2024 17:35:59 +0200 Subject: [PATCH 1/9] added parquet & json read options + added struct field support for glue --- dbt/adapters/duckdb/plugins/glue.py | 12 ++++++- .../macros/materializations/external.sql | 34 +++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/dbt/adapters/duckdb/plugins/glue.py b/dbt/adapters/duckdb/plugins/glue.py index cf447dde..cff007f2 100644 --- a/dbt/adapters/duckdb/plugins/glue.py +++ b/dbt/adapters/duckdb/plugins/glue.py @@ -3,7 +3,7 @@ from typing import List from typing import Optional from typing import Sequence - +import re import boto3 from mypy_boto3_glue import GlueClient from mypy_boto3_glue.type_defs import ColumnTypeDef @@ -65,6 +65,16 @@ def _dbt2glue(dtype: str, ignore_null: bool = False) -> str: # pragma: no cover return "date" if data_type.lower() in ["blob", "bytea", "binary", "varbinary"]: return "binary" + if data_type.lower() in ["struct"]: + struct_fields = re.findall(r"(\w+)\s+(\w+)", dtype[dtype.find("(")+1:dtype.rfind(")")]) + glue_fields = [] + for field_name, field_type in struct_fields: + glue_field_type = _dbt2glue(field_type) + glue_fields.append(f"{field_name}:{glue_field_type}") + struct_schema = f"struct<{','.join(glue_fields)}>" + if dtype.strip().endswith("[]"): + return f"array<{struct_schema}>" + return struct_schema if data_type is None: if ignore_null: return "" diff --git a/dbt/include/duckdb/macros/materializations/external.sql b/dbt/include/duckdb/macros/materializations/external.sql index 91e9b5ab..8ad2c90a 100644 --- a/dbt/include/duckdb/macros/materializations/external.sql +++ b/dbt/include/duckdb/macros/materializations/external.sql @@ -5,6 +5,8 @@ {%- set format = config.get('format', 'parquet') -%} {%- set write_options = adapter.external_write_options(location, rendered_options) -%} {%- set read_location = adapter.external_read_location(location, rendered_options) -%} + {%- set parquet_read_options = config.get('parquet_read_options', {'union_by_name': False}) -%} + {%- set json_read_options = config.get('json_read_options', {'maximum_object_size': 1000000000}) -%} -- set language - python or sql {%- set language = model['language'] -%} @@ -49,9 +51,37 @@ {{ write_to_file(temp_relation, location, write_options) }} -- create a view on top of the location {% call statement('main', language='sql') -%} - create or replace view {{ intermediate_relation }} as ( + {% elif format == 'json' %} + create or replace view {{ intermediate_relation }} as ( + select * from read_json_auto('{{ read_location }}' + {%- for key, value in json_read_options.items() -%} + , {{ key }}= + {%- if value is string -%} + '{{ value }}' + {%- else -%} + {{ value }} + {%- endif -%} + {%- endfor -%} + ) + ); + {% elif format == 'parquet' %} + create or replace view {{ intermediate_relation }} as ( + select * from read_parquet('{{ read_location }}' + {%- for key, value in parquet_read_options.items() -%} + , {{ key }}= + {%- if value is string -%} + '{{ value }}' + {%- else -%} + {{ value }} + {%- endif -%} + {%- endfor -%} + ) + ); + {% else %} + create or replace view {{ intermediate_relation }} as ( select * from '{{ read_location }}' - ); + ); + {% endif %} {%- endcall %} -- cleanup From 0fee2877f045178322610cf8c724f050c4bbba4c Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Tue, 15 Oct 2024 17:42:46 +0200 Subject: [PATCH 2/9] fixed ruff issues --- dbt/adapters/duckdb/plugins/glue.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/adapters/duckdb/plugins/glue.py b/dbt/adapters/duckdb/plugins/glue.py index cff007f2..9c176e18 100644 --- a/dbt/adapters/duckdb/plugins/glue.py +++ b/dbt/adapters/duckdb/plugins/glue.py @@ -1,3 +1,4 @@ +import re from typing import Any from typing import Dict from typing import List @@ -66,7 +67,7 @@ def _dbt2glue(dtype: str, ignore_null: bool = False) -> str: # pragma: no cover if data_type.lower() in ["blob", "bytea", "binary", "varbinary"]: return "binary" if data_type.lower() in ["struct"]: - struct_fields = re.findall(r"(\w+)\s+(\w+)", dtype[dtype.find("(")+1:dtype.rfind(")")]) + struct_fields = re.findall(r"(\w+)\s+(\w+)", dtype[dtype.find("(") + 1 : dtype.rfind(")")]) glue_fields = [] for field_name, field_type in struct_fields: glue_field_type = _dbt2glue(field_type) From 65c7e471d19a46f8610890e270b1aac9ffc98b6f Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Tue, 15 Oct 2024 15:44:33 +0000 Subject: [PATCH 3/9] removed double inport --- dbt/adapters/duckdb/plugins/glue.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dbt/adapters/duckdb/plugins/glue.py b/dbt/adapters/duckdb/plugins/glue.py index 9c176e18..f01a3ff1 100644 --- a/dbt/adapters/duckdb/plugins/glue.py +++ b/dbt/adapters/duckdb/plugins/glue.py @@ -4,7 +4,6 @@ from typing import List from typing import Optional from typing import Sequence -import re import boto3 from mypy_boto3_glue import GlueClient from mypy_boto3_glue.type_defs import ColumnTypeDef From c801217e1d95e34df0db339f0f02c4839869eae7 Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Wed, 16 Oct 2024 09:05:52 +0000 Subject: [PATCH 4/9] set default values for csv/json/parquet read options --- .../macros/materializations/external.sql | 27 ++++++++++++++----- tests/functional/adapter/test_external.py | 4 +-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/dbt/include/duckdb/macros/materializations/external.sql b/dbt/include/duckdb/macros/materializations/external.sql index 8ad2c90a..973f54f5 100644 --- a/dbt/include/duckdb/macros/materializations/external.sql +++ b/dbt/include/duckdb/macros/materializations/external.sql @@ -6,7 +6,8 @@ {%- set write_options = adapter.external_write_options(location, rendered_options) -%} {%- set read_location = adapter.external_read_location(location, rendered_options) -%} {%- set parquet_read_options = config.get('parquet_read_options', {'union_by_name': False}) -%} - {%- set json_read_options = config.get('json_read_options', {'maximum_object_size': 1000000000}) -%} + {%- set json_read_options = config.get('json_read_options', {'auto_detect': False}) -%} + {%- set csv_read_options = config.get('csv_read_options', {'auto_detect': False}) -%} -- set language - python or sql {%- set language = model['language'] -%} @@ -47,13 +48,14 @@ {{- create_table_as(False, temp_relation, compiled_code, language) }} {%- endcall %} - -- write an temp relation into file - {{ write_to_file(temp_relation, location, write_options) }} - -- create a view on top of the location +-- write a temp relation into file +{{ write_to_file(temp_relation, location, write_options) }} + +-- create a view on top of the location {% call statement('main', language='sql') -%} - {% elif format == 'json' %} + {% if format == 'json' %} create or replace view {{ intermediate_relation }} as ( - select * from read_json_auto('{{ read_location }}' + select * from read_json('{{ read_location }}' {%- for key, value in json_read_options.items() -%} , {{ key }}= {%- if value is string -%} @@ -77,6 +79,19 @@ {%- endfor -%} ) ); + {% elif format == 'csv' %} + create or replace view {{ intermediate_relation }} as ( + select * from read_csv('{{ read_location }}' + {%- for key, value in csv_read_options.items() -%} + , {{ key }}= + {%- if value is string -%} + '{{ value }}' + {%- else -%} + {{ value }} + {%- endif -%} + {%- endfor -%} + ) + ); {% else %} create or replace view {{ intermediate_relation }} as ( select * from '{{ read_location }}' diff --git a/tests/functional/adapter/test_external.py b/tests/functional/adapter/test_external.py index 66256cc5..aeac649f 100644 --- a/tests/functional/adapter/test_external.py +++ b/tests/functional/adapter/test_external.py @@ -23,11 +23,11 @@ """ config_materialized_parquet_location = """ - {{ config(materialized="external", location="{{ adapter.external_root() }}/test.parquet") }} + {{ config(materialized="external", location="{{ adapter.external_root() }}/test.parquet", format="parquet") }} """ config_materialized_csv_location_delim = """ - {{ config(materialized="external", location="{{ adapter.external_root() }}/test_delim.csv", delimiter="|") }} + {{ config(materialized="external", location="{{ adapter.external_root() }}/test_delim.csv", delimiter="|", format="csv") }} """ config_json = """ From 2a067efe28a00095704ce17b3e7e59e96db67614 Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Wed, 16 Oct 2024 09:13:42 +0000 Subject: [PATCH 5/9] auto_detect=true default values for json and csv --- dbt/include/duckdb/macros/materializations/external.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/include/duckdb/macros/materializations/external.sql b/dbt/include/duckdb/macros/materializations/external.sql index 973f54f5..0aeb79a8 100644 --- a/dbt/include/duckdb/macros/materializations/external.sql +++ b/dbt/include/duckdb/macros/materializations/external.sql @@ -6,8 +6,8 @@ {%- set write_options = adapter.external_write_options(location, rendered_options) -%} {%- set read_location = adapter.external_read_location(location, rendered_options) -%} {%- set parquet_read_options = config.get('parquet_read_options', {'union_by_name': False}) -%} - {%- set json_read_options = config.get('json_read_options', {'auto_detect': False}) -%} - {%- set csv_read_options = config.get('csv_read_options', {'auto_detect': False}) -%} + {%- set json_read_options = config.get('json_read_options', {'auto_detect': True}) -%} + {%- set csv_read_options = config.get('csv_read_options', {'auto_detect': True}) -%} -- set language - python or sql {%- set language = model['language'] -%} From fdf3987062e168e9e8bbabbec1a16b47e94c2c68 Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Wed, 16 Oct 2024 09:27:21 +0000 Subject: [PATCH 6/9] removed empty line --- dbt/adapters/duckdb/plugins/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/adapters/duckdb/plugins/glue.py b/dbt/adapters/duckdb/plugins/glue.py index f01a3ff1..f4274001 100644 --- a/dbt/adapters/duckdb/plugins/glue.py +++ b/dbt/adapters/duckdb/plugins/glue.py @@ -4,6 +4,7 @@ from typing import List from typing import Optional from typing import Sequence + import boto3 from mypy_boto3_glue import GlueClient from mypy_boto3_glue.type_defs import ColumnTypeDef From 1a5155a2b25c4bf50e33f68816006c690198342f Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Wed, 16 Oct 2024 14:53:47 +0200 Subject: [PATCH 7/9] added glue client fix (local variable 'client' referenced before assignment) --- dbt/adapters/duckdb/plugins/glue.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dbt/adapters/duckdb/plugins/glue.py b/dbt/adapters/duckdb/plugins/glue.py index f4274001..1735bf1b 100644 --- a/dbt/adapters/duckdb/plugins/glue.py +++ b/dbt/adapters/duckdb/plugins/glue.py @@ -278,6 +278,7 @@ def _get_table_def( def _get_glue_client( settings: Dict[str, Any], secrets: Optional[List[Dict[str, Any]]] ) -> "GlueClient": + client = None if secrets is not None: for secret in secrets: if isinstance(secret, Secret) and "config" == str(secret.provider).lower(): @@ -290,16 +291,17 @@ def _get_glue_client( region_name=secret_kwargs.get("region"), ) break - elif settings: - client = boto3.client( - "glue", - aws_access_key_id=settings.get("s3_access_key_id"), - aws_secret_access_key=settings.get("s3_secret_access_key"), - aws_session_token=settings.get("s3_session_token"), - region_name=settings.get("s3_region"), - ) - else: - client = boto3.client("glue") + if client is None: + if settings: + client = boto3.client( + "glue", + aws_access_key_id=settings.get("s3_access_key_id"), + aws_secret_access_key=settings.get("s3_secret_access_key"), + aws_session_token=settings.get("s3_session_token"), + region_name=settings.get("s3_region"), + ) + else: + client = boto3.client("glue") return client From 1cf821eb24b09a2bd7cf77ecede9f686ebca937b Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Thu, 17 Oct 2024 14:08:25 +0000 Subject: [PATCH 8/9] infer type from location, default to parquet if not set --- .../duckdb/macros/materializations/external.sql | 12 +++++++++++- tests/functional/adapter/test_external.py | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dbt/include/duckdb/macros/materializations/external.sql b/dbt/include/duckdb/macros/materializations/external.sql index 0aeb79a8..19c77de9 100644 --- a/dbt/include/duckdb/macros/materializations/external.sql +++ b/dbt/include/duckdb/macros/materializations/external.sql @@ -2,7 +2,17 @@ {%- set location = render(config.get('location', default=external_location(this, config))) -%}) {%- set rendered_options = render_write_options(config) -%} - {%- set format = config.get('format', 'parquet') -%} + + {%- set format = config.get('format') -%} + {%- set allowed_formats = ['csv', 'parquet', 'json'] -%} + + {%- if format -%} + {%- set format = format if format in allowed_formats else 'parquet' -%} + {%- else -%} + {%- set format = location.split('.')[-1] if '.' in location else 'parquet' -%} + {%- set format = format if format in allowed_formats else 'parquet' -%} + {%- endif -%} + {%- set write_options = adapter.external_write_options(location, rendered_options) -%} {%- set read_location = adapter.external_read_location(location, rendered_options) -%} {%- set parquet_read_options = config.get('parquet_read_options', {'union_by_name': False}) -%} diff --git a/tests/functional/adapter/test_external.py b/tests/functional/adapter/test_external.py index aeac649f..66256cc5 100644 --- a/tests/functional/adapter/test_external.py +++ b/tests/functional/adapter/test_external.py @@ -23,11 +23,11 @@ """ config_materialized_parquet_location = """ - {{ config(materialized="external", location="{{ adapter.external_root() }}/test.parquet", format="parquet") }} + {{ config(materialized="external", location="{{ adapter.external_root() }}/test.parquet") }} """ config_materialized_csv_location_delim = """ - {{ config(materialized="external", location="{{ adapter.external_root() }}/test_delim.csv", delimiter="|", format="csv") }} + {{ config(materialized="external", location="{{ adapter.external_root() }}/test_delim.csv", delimiter="|") }} """ config_json = """ From 26253ddbfa20e2138eeb13176c77f87f26b7323f Mon Sep 17 00:00:00 2001 From: Arno Roos Date: Thu, 17 Oct 2024 15:25:08 +0000 Subject: [PATCH 9/9] resolved indentation & format issues --- .../duckdb/macros/materializations/external.sql | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dbt/include/duckdb/macros/materializations/external.sql b/dbt/include/duckdb/macros/materializations/external.sql index 19c77de9..70d7dae9 100644 --- a/dbt/include/duckdb/macros/materializations/external.sql +++ b/dbt/include/duckdb/macros/materializations/external.sql @@ -5,12 +5,13 @@ {%- set format = config.get('format') -%} {%- set allowed_formats = ['csv', 'parquet', 'json'] -%} - {%- if format -%} - {%- set format = format if format in allowed_formats else 'parquet' -%} + {%- if format not in allowed_formats -%} + {{ exceptions.raise_compiler_error("Invalid format: " ~ format ~ ". Allowed formats are: " ~ allowed_formats | join(', ')) }} + {%- endif -%} {%- else -%} - {%- set format = location.split('.')[-1] if '.' in location else 'parquet' -%} - {%- set format = format if format in allowed_formats else 'parquet' -%} + {%- set format = location.split('.')[-1].lower() if '.' in location else 'parquet' -%} + {%- set format = format if format in allowed_formats else 'parquet' -%} {%- endif -%} {%- set write_options = adapter.external_write_options(location, rendered_options) -%} @@ -58,8 +59,8 @@ {{- create_table_as(False, temp_relation, compiled_code, language) }} {%- endcall %} --- write a temp relation into file -{{ write_to_file(temp_relation, location, write_options) }} + -- write a temp relation into file + {{ write_to_file(temp_relation, location, write_options) }} -- create a view on top of the location {% call statement('main', language='sql') -%} @@ -102,10 +103,6 @@ {%- endfor -%} ) ); - {% else %} - create or replace view {{ intermediate_relation }} as ( - select * from '{{ read_location }}' - ); {% endif %} {%- endcall %}