From 149142f77bb5ab16fdbf7a01483b41f56d5d629b Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Mon, 3 Nov 2025 14:42:09 -0500 Subject: [PATCH 1/7] =?UTF-8?q?feat(duckDB):=20Cast=20inputs=20(BLOB=20?= =?UTF-8?q?=E2=86=92=20VARCHAR)=20for=20duckDB=20STARTS=5FWITH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sqlglot/dialects/duckdb.py | 25 +++++++++++++++++++++++++ tests/dialects/test_bigquery.py | 15 +++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index 140a33ced3..9d00f825cf 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1209,6 +1209,31 @@ def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: return self.func("STRUCT_INSERT", this, kv_sql) + def startswith_sql(self, expression: exp.StartsWith) -> str: + this = expression.this + expr = expression.expression + + if not this.type: + from sqlglot.optimizer.annotate_types import annotate_types + + this = annotate_types(this, dialect=self.dialect) + + if not expr.type: + from sqlglot.optimizer.annotate_types import annotate_types + + expr = annotate_types(expr, dialect=self.dialect) + + # DuckDB's starts_with only accepts VARCHAR, not BLOB + if this.is_type(exp.DataType.Type.BINARY): + expression.this.replace(exp.cast(expression.this, exp.DataType.Type.VARCHAR)) + + if expr.is_type(exp.DataType.Type.BINARY): + expression.expression.replace( + exp.cast(expression.expression, exp.DataType.Type.VARCHAR) + ) + + return self.func("STARTS_WITH", expression.this, expression.expression) + def unnest_sql(self, expression: exp.Unnest) -> str: explode_array = expression.args.get("explode_array") if explode_array: diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py index b260d8f859..54c8d5c2a6 100644 --- a/tests/dialects/test_bigquery.py +++ b/tests/dialects/test_bigquery.py @@ -1205,6 +1205,21 @@ def test_bigquery(self): "spark": "CAST(a AS BINARY)", }, ) + # Test STARTS_WITH with BYTES/BLOB handling from BigQuery to DuckDB + self.validate_all( + "STARTS_WITH(CAST('foo' AS BYTES), CAST('f' AS BYTES))", + write={ + "bigquery": "STARTS_WITH(CAST('foo' AS BYTES), CAST('f' AS BYTES))", + "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), CAST(CAST('f' AS BLOB) AS TEXT))", + }, + ) + self.validate_all( + "STARTS_WITH(CAST('foo' AS BYTES), b'f')", + write={ + "bigquery": "STARTS_WITH(CAST('foo' AS BYTES), b'f')", + "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), e'f')", + }, + ) self.validate_all( "CAST(a AS NUMERIC)", write={ From 81ee6f660021ace280bfbfcf8523f4cd5e45ec9b Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Fri, 7 Nov 2025 10:14:14 -0500 Subject: [PATCH 2/7] feat(duckDB): Addressed review comments --- sqlglot/dialects/duckdb.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index 9d00f825cf..cf90ec7efc 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1223,11 +1223,14 @@ def startswith_sql(self, expression: exp.StartsWith) -> str: expr = annotate_types(expr, dialect=self.dialect) - # DuckDB's starts_with only accepts VARCHAR, not BLOB - if this.is_type(exp.DataType.Type.BINARY): + if this.type and not this.is_type( + exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN + ): expression.this.replace(exp.cast(expression.this, exp.DataType.Type.VARCHAR)) - if expr.is_type(exp.DataType.Type.BINARY): + if expr.type and not expr.is_type( + exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN + ): expression.expression.replace( exp.cast(expression.expression, exp.DataType.Type.VARCHAR) ) From 6ea59aba06af5e81972e6b2af1582c5f1f4abde5 Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Fri, 7 Nov 2025 10:21:22 -0500 Subject: [PATCH 3/7] fix: Applied formatting --- sqlglot/dialects/duckdb.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index cf90ec7efc..5678a8f32e 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1223,14 +1223,10 @@ def startswith_sql(self, expression: exp.StartsWith) -> str: expr = annotate_types(expr, dialect=self.dialect) - if this.type and not this.is_type( - exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN - ): + if this.type and not this.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): expression.this.replace(exp.cast(expression.this, exp.DataType.Type.VARCHAR)) - if expr.type and not expr.is_type( - exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN - ): + if expr.type and not expr.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): expression.expression.replace( exp.cast(expression.expression, exp.DataType.Type.VARCHAR) ) From f2f91a6f8aba4cae8137c2e036b91335504407bc Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Fri, 7 Nov 2025 11:03:51 -0500 Subject: [PATCH 4/7] feat(duckDB): Fix code after rebase as bytestring handling was changed in generator --- sqlglot/dialects/duckdb.py | 12 ++++++++++-- tests/dialects/test_bigquery.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index 5678a8f32e..5b3fba2649 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1223,10 +1223,18 @@ def startswith_sql(self, expression: exp.StartsWith) -> str: expr = annotate_types(expr, dialect=self.dialect) - if this.type and not this.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): + if isinstance(expression.this, exp.ByteString): + expression.this.replace(exp.Literal.string(expression.this.this)) + elif this.type and not this.is_type( + exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN + ): expression.this.replace(exp.cast(expression.this, exp.DataType.Type.VARCHAR)) - if expr.type and not expr.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): + if isinstance(expression.expression, exp.ByteString): + expression.expression.replace(exp.Literal.string(expression.expression.this)) + elif expr.type and not expr.is_type( + exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN + ): expression.expression.replace( exp.cast(expression.expression, exp.DataType.Type.VARCHAR) ) diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py index 54c8d5c2a6..caf3ea1bc6 100644 --- a/tests/dialects/test_bigquery.py +++ b/tests/dialects/test_bigquery.py @@ -1217,7 +1217,7 @@ def test_bigquery(self): "STARTS_WITH(CAST('foo' AS BYTES), b'f')", write={ "bigquery": "STARTS_WITH(CAST('foo' AS BYTES), b'f')", - "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), e'f')", + "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), 'f')", }, ) self.validate_all( From fd278a0b36d4f5399b49452a94a15e2996032b02 Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Mon, 10 Nov 2025 12:29:28 -0500 Subject: [PATCH 5/7] feat(duckdb): Added a helper method --- sqlglot/dialects/duckdb.py | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index 5b3fba2649..e425f62c63 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1209,35 +1209,35 @@ def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: return self.func("STRUCT_INSERT", this, kv_sql) - def startswith_sql(self, expression: exp.StartsWith) -> str: - this = expression.this - expr = expression.expression + def _prepare_startswith_arg(self, arg: exp.Expression) -> None: + """Prepare argument for STARTS_WITH by converting to VARCHAR. + + ByteString literals are converted to regular string literals to avoid + BLOB casting by the generator. Non-VARCHAR types are cast to VARCHAR. + """ + # Convert ByteString to String literal before generation + # ByteStrings get typed as UNKNOWN and would be wrapped in CAST(...AS BLOB) by generator + if isinstance(arg, exp.ByteString): + arg.replace(exp.Literal.string(arg.this)) + # Cast non-VARCHAR types to VARCHAR + elif arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): + arg.replace(exp.cast(arg, exp.DataType.Type.VARCHAR)) - if not this.type: + def startswith_sql(self, expression: exp.StartsWith) -> str: + # Annotate types if needed for type-based casting + if not expression.this.type: from sqlglot.optimizer.annotate_types import annotate_types - this = annotate_types(this, dialect=self.dialect) + annotate_types(expression.this, dialect=self.dialect) - if not expr.type: + if not expression.expression.type: from sqlglot.optimizer.annotate_types import annotate_types - expr = annotate_types(expr, dialect=self.dialect) + annotate_types(expression.expression, dialect=self.dialect) - if isinstance(expression.this, exp.ByteString): - expression.this.replace(exp.Literal.string(expression.this.this)) - elif this.type and not this.is_type( - exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN - ): - expression.this.replace(exp.cast(expression.this, exp.DataType.Type.VARCHAR)) - - if isinstance(expression.expression, exp.ByteString): - expression.expression.replace(exp.Literal.string(expression.expression.this)) - elif expr.type and not expr.is_type( - exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN - ): - expression.expression.replace( - exp.cast(expression.expression, exp.DataType.Type.VARCHAR) - ) + # Prepare both arguments for STARTS_WITH + self._prepare_startswith_arg(expression.this) + self._prepare_startswith_arg(expression.expression) return self.func("STARTS_WITH", expression.this, expression.expression) From b0c7ee65e790d1f5eddece408465d7082239e63e Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Mon, 10 Nov 2025 12:45:50 -0500 Subject: [PATCH 6/7] feat(duckDB): Refactored code --- sqlglot/dialects/duckdb.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index e425f62c63..e0eb99b63c 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1215,6 +1215,12 @@ def _prepare_startswith_arg(self, arg: exp.Expression) -> None: ByteString literals are converted to regular string literals to avoid BLOB casting by the generator. Non-VARCHAR types are cast to VARCHAR. """ + # Annotate types if needed for type-based casting + if not arg.type: + from sqlglot.optimizer.annotate_types import annotate_types + + annotate_types(arg, dialect=self.dialect) + # Convert ByteString to String literal before generation # ByteStrings get typed as UNKNOWN and would be wrapped in CAST(...AS BLOB) by generator if isinstance(arg, exp.ByteString): @@ -1224,18 +1230,7 @@ def _prepare_startswith_arg(self, arg: exp.Expression) -> None: arg.replace(exp.cast(arg, exp.DataType.Type.VARCHAR)) def startswith_sql(self, expression: exp.StartsWith) -> str: - # Annotate types if needed for type-based casting - if not expression.this.type: - from sqlglot.optimizer.annotate_types import annotate_types - - annotate_types(expression.this, dialect=self.dialect) - - if not expression.expression.type: - from sqlglot.optimizer.annotate_types import annotate_types - - annotate_types(expression.expression, dialect=self.dialect) - - # Prepare both arguments for STARTS_WITH + # Prepare both arguments for STARTS_WITH (annotates types and converts to VARCHAR) self._prepare_startswith_arg(expression.this) self._prepare_startswith_arg(expression.expression) From 76a47b63e0525cc8906d7080cfa88b9f46629d8f Mon Sep 17 00:00:00 2001 From: fivetran-amrutabhimsenayachit Date: Tue, 11 Nov 2025 10:46:44 -0500 Subject: [PATCH 7/7] feat(duckdb): Removed bytestring check, following double-cast --- sqlglot/dialects/duckdb.py | 22 ++++------------------ tests/dialects/test_bigquery.py | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py index e0eb99b63c..578bc9483c 100644 --- a/sqlglot/dialects/duckdb.py +++ b/sqlglot/dialects/duckdb.py @@ -1210,27 +1210,13 @@ def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: return self.func("STRUCT_INSERT", this, kv_sql) def _prepare_startswith_arg(self, arg: exp.Expression) -> None: - """Prepare argument for STARTS_WITH by converting to VARCHAR. - - ByteString literals are converted to regular string literals to avoid - BLOB casting by the generator. Non-VARCHAR types are cast to VARCHAR. - """ - # Annotate types if needed for type-based casting - if not arg.type: - from sqlglot.optimizer.annotate_types import annotate_types - - annotate_types(arg, dialect=self.dialect) - - # Convert ByteString to String literal before generation - # ByteStrings get typed as UNKNOWN and would be wrapped in CAST(...AS BLOB) by generator - if isinstance(arg, exp.ByteString): - arg.replace(exp.Literal.string(arg.this)) - # Cast non-VARCHAR types to VARCHAR - elif arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): + """Prepare argument for STARTS_WITH by converting to VARCHAR.""" + # Cast non-VARCHAR types to VARCHAR (includes double-cast for BLOB types) + if arg.type and not arg.is_type(exp.DataType.Type.VARCHAR, exp.DataType.Type.UNKNOWN): arg.replace(exp.cast(arg, exp.DataType.Type.VARCHAR)) def startswith_sql(self, expression: exp.StartsWith) -> str: - # Prepare both arguments for STARTS_WITH (annotates types and converts to VARCHAR) + # Prepare both arguments for STARTS_WITH (converts to VARCHAR) self._prepare_startswith_arg(expression.this) self._prepare_startswith_arg(expression.expression) diff --git a/tests/dialects/test_bigquery.py b/tests/dialects/test_bigquery.py index caf3ea1bc6..4f8822f1ee 100644 --- a/tests/dialects/test_bigquery.py +++ b/tests/dialects/test_bigquery.py @@ -1206,19 +1206,19 @@ def test_bigquery(self): }, ) # Test STARTS_WITH with BYTES/BLOB handling from BigQuery to DuckDB - self.validate_all( - "STARTS_WITH(CAST('foo' AS BYTES), CAST('f' AS BYTES))", - write={ - "bigquery": "STARTS_WITH(CAST('foo' AS BYTES), CAST('f' AS BYTES))", - "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), CAST(CAST('f' AS BLOB) AS TEXT))", - }, + # Requires type annotation for proper BLOB -> VARCHAR casting + expr = self.parse_one("STARTS_WITH(CAST('foo' AS BYTES), CAST('f' AS BYTES))") + annotated = annotate_types(expr, dialect="bigquery") + self.assertEqual( + annotated.sql("duckdb"), + "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), CAST(CAST('f' AS BLOB) AS TEXT))", ) - self.validate_all( - "STARTS_WITH(CAST('foo' AS BYTES), b'f')", - write={ - "bigquery": "STARTS_WITH(CAST('foo' AS BYTES), b'f')", - "duckdb": "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), 'f')", - }, + + expr = self.parse_one("STARTS_WITH(CAST('foo' AS BYTES), b'f')") + annotated = annotate_types(expr, dialect="bigquery") + self.assertEqual( + annotated.sql("duckdb"), + "STARTS_WITH(CAST(CAST('foo' AS BLOB) AS TEXT), CAST(CAST(e'f' AS BLOB) AS TEXT))", ) self.validate_all( "CAST(a AS NUMERIC)",