Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions sqlglot/expressions/temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ class TsOrDsDiff(Expression, Func, TimeUnit):
arg_types = {"this": True, "expression": True, "unit": False}


class DateAdd2(Expression, Func):
    """Two-argument ``date_add(startDate, numDays)`` as found in Databricks/Spark.

    Kept distinct from TsOrDsAdd so that generators can render the bare
    two-argument call (``DATE_ADD(start, days)`` on Databricks, or
    ``DATEADD(DAY, val, start)`` on T-SQL) without the CAST wrapping that
    TsOrDsAdd requires for T-SQL type safety. The result type is always DATE.
    """

    arg_types = dict(this=True, expression=True)
Comment on lines +158 to +167
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this got a bit too creative 😆 We don't want to split this into multiple AST nodes, the arity is expressed through the optionality (boolean value) of arg_types.

If we go back to the previous version I think we only need to fix the transpilation of Spark/DBX -> T-SQL. This can be done by adding generation support for exp.TimestampAdd in T-SQL e.g:

  # sqlglot.generators.tsql
  TRANSFORMS = {
    ...,
    exp.TimestampAdd: date_delta_sql("DATEADD")
  }

This is mostly what I meant by fixing the regression.

From a quick search, the APIs are the following:

| Function | Spark | Databricks |
| --- | --- | --- |
| `date_add(start, days)` (2-arg) | DATE | DATE |
| `dateadd(start, days)` (2-arg) | DATE (alias) | DATE (alias) |
| `date_add(unit, value, expr)` (3-arg) | TIMESTAMP (Spark 3.4+) | TIMESTAMP |
| `dateadd(unit, value, expr)` (3-arg) | TIMESTAMP (alias) | TIMESTAMP (alias) |
| `timestampadd(unit, value, expr)` | TIMESTAMP | TIMESTAMP |

And this is what happens on main today afaict, Spark looks more valid than DBX so the latter should consolidate to the former imo:

| SQL | read="spark" | read="databricks" |
| --- | --- | --- |
| `date_add(e, 24)` (2-arg) | TsOrDsAdd(unit=DAY) | DateAdd(unit=DAY) |
| `dateadd(e, 24)` (2-arg) | TsOrDsAdd(unit=DAY) | DateAdd(unit=DAY) |
| `date_add(MONTH, 1, e)` (3-arg) | TimestampAdd(unit=MONTH) | DateAdd(unit=MONTH) |
| `dateadd(MONTH, 1, e)` (3-arg) | TimestampAdd(unit=MONTH) | DateAdd(unit=MONTH) |
| `timestampadd(MONTH, 1, e)` | TimestampAdd(unit=MONTH) | TimestampAdd(unit=MONTH) |



# Truncation


Expand Down
1 change: 1 addition & 0 deletions sqlglot/generators/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class DatabricksGenerator(SparkGenerator):
**SparkGenerator.TRANSFORMS,
exp.CurrentVersion: lambda *_: "CURRENT_VERSION()",
exp.DateAdd: date_delta_sql("DATEADD"),
exp.DateAdd2: lambda self, e: self.func("DATE_ADD", e.this, e.expression),
exp.DateDiff: date_delta_sql("DATEDIFF"),
exp.DatetimeAdd: lambda self, e: self.func(
"TIMESTAMPADD", e.unit, e.expression, e.this
Expand Down
2 changes: 2 additions & 0 deletions sqlglot/generators/tsql.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ class TSQLGenerator(generator.Generator):
exp.Ceil: rename_func("CEILING"),
exp.Chr: rename_func("CHAR"),
exp.DateAdd: date_delta_sql("DATEADD"),
exp.DateAdd2: date_delta_sql("DATEADD"),
exp.TimestampAdd: date_delta_sql("DATEADD"),
exp.CTE: transforms.preprocess([qualify_derived_table_outputs]),
exp.CurrentDate: rename_func("GETDATE"),
exp.CurrentTimestamp: rename_func("GETDATE"),
Expand Down
19 changes: 17 additions & 2 deletions sqlglot/parsers/databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,21 @@
from sqlglot.tokens import TokenType


def _build_date_add(args: list) -> exp.Expr:
    # Databricks treats `date_add` and `dateadd` as full aliases; the number of
    # arguments selects the semantic:
    #   - 2-arg (startDate, numDays): always returns DATE -> exp.DateAdd2
    #   - 3-arg (unit, value, expr): preserves expr type -> exp.DateAdd
    #
    # Routing to distinct nodes keeps the Databricks generator's existing
    # exp.DateAdd -> DATEADD entry round-trippable, while cross-dialect sources
    # (Spark, Redshift, BigQuery) that produce exp.TimestampAdd or exp.TsOrDsAdd
    # keep emitting DATE_ADD via the inherited SparkGenerator.
    if len(args) != 2:
        return build_date_delta(exp.DateAdd)(args)

    start, num_days = seq_get(args, 0), seq_get(args, 1)
    return exp.DateAdd2(this=start, expression=num_days)


class DatabricksParser(SparkParser):
LOG_DEFAULTS_TO_LN = True
STRICT_CAST = True
Expand All @@ -16,8 +31,8 @@ class DatabricksParser(SparkParser):
**SparkParser.FUNCTIONS,
"IFF": exp.If.from_arg_list,
"GETDATE": exp.CurrentTimestamp.from_arg_list,
"DATEADD": build_date_delta(exp.DateAdd),
"DATE_ADD": build_date_delta(exp.DateAdd),
"DATEADD": _build_date_add,
"DATE_ADD": _build_date_add,
"DATEDIFF": build_date_delta(exp.DateDiff),
"DATE_DIFF": build_date_delta(exp.DateDiff),
"NOW": exp.CurrentTimestamp.from_arg_list,
Expand Down
4 changes: 4 additions & 0 deletions sqlglot/typing/spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,8 @@
exp.Localtimestamp: {"returns": exp.DType.TIMESTAMPNTZ},
exp.ToBinary: {"returns": exp.DType.BINARY},
exp.DateFromUnixDate: {"returns": exp.DType.DATE},
# 2-arg `date_add(startDate, numDays)` returns DATE in both Spark and Databricks.
# Hive/Spark parsers produce TsOrDsAdd; the Databricks parser produces DateAdd2.
exp.TsOrDsAdd: {"returns": exp.DType.DATE},
exp.DateAdd2: {"returns": exp.DType.DATE},
}
15 changes: 14 additions & 1 deletion tests/dialects/test_databricks.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@ def test_datediff(self):
)

def test_add_date(self):
# 3-arg unit form (Databricks extension; Spark OSS uses TIMESTAMPADD).
# Parses to exp.DateAdd; Databricks and T-SQL both emit DATEADD.
self.validate_all(
"SELECT DATEADD(year, 1, '2020-01-01')",
write={
Expand All @@ -393,11 +395,22 @@ def test_add_date(self):
"SELECT DATEDIFF('end', 'start')",
write={"databricks": "SELECT DATEDIFF(DAY, 'start', 'end')"},
)
# 2-arg form: parses to exp.DateAdd2; always returns DATE.
self.validate_all(
"SELECT DATE_ADD('2020-01-01', 1)",
write={
"tsql": "SELECT DATEADD(DAY, 1, '2020-01-01')",
"databricks": "SELECT DATEADD(DAY, 1, '2020-01-01')",
"databricks": "SELECT DATE_ADD('2020-01-01', 1)",
},
)
# 3-arg round-trip via the alternate name.
self.validate_identity("SELECT DATEADD(MONTH, 1, '2020-01-01')")
# 2-arg alias: `dateadd` and `date_add` are interchangeable in Databricks.
self.validate_all(
"SELECT DATEADD(e, 24) FROM t",
write={
"tsql": "SELECT DATEADD(DAY, 24, e) FROM t",
"databricks": "SELECT DATE_ADD(e, 24) FROM t",
},
)

Expand Down
51 changes: 51 additions & 0 deletions tests/test_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,57 @@ def test_interval_math_annotation(self):
self.assertEqual(expected_type, expression.expressions[0].type.this)
self.assertEqual(sql, expression.sql())

def test_hive_chain_date_add_descent(self):
# Hive/Spark parse 2-arg DATE_ADD to TsOrDsAdd (parsers/hive.py); the
# Databricks parser overrides this to produce DateAdd2 instead, so the
# two semantics (DATE return vs operand-type-preserving 3-arg form) map
# to distinct AST nodes. The DATE annotator is registered at Spark level
# for both nodes; Hive stays UNKNOWN because older Hive returned STRING.
# Schema must be built per-dialect since TypeAnnotator dispatches on
# schema.dialect, not the call-site dialect.
UNKNOWN, DATE = exp.DataType.Type.UNKNOWN, exp.DataType.Type.DATE
sql = "SELECT date_add(e, 24) AS r FROM t"
for dialect, expected_class, expected_type in [
("hive", exp.TsOrDsAdd, UNKNOWN),
("spark", exp.TsOrDsAdd, DATE),
("databricks", exp.DateAdd2, DATE),
]:
with self.subTest(dialect):
schema = MappingSchema({"t": {"e": "TIMESTAMP"}}, dialect=dialect)
ast = optimizer.qualify.qualify(
parse_one(sql, read=dialect), schema=schema, dialect=dialect
)
annotated = annotate_types(ast, schema=schema, dialect=dialect)
projected = annotated.selects[0].this
self.assertIsInstance(projected, expected_class)
self.assertEqual(expected_type, projected.type.this)

def test_databricks_date_add_annotation(self):
# `date_add` and `dateadd` are aliases in Databricks; arity selects the
# semantic. DatabricksParser._build_date_add routes:
# - 2-arg → DateAdd2 (always returns DATE)
# - 3-arg → DateAdd (preserves operand type via _annotate_timeunit)
DateAdd2, DateAdd = exp.DateAdd2, exp.DateAdd
DATE, TIMESTAMPTZ = exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ
schema = MappingSchema({"t": {"e": "TIMESTAMP"}}, dialect="databricks")
for sql, expected_class, expected_type in [
("SELECT date_add(e, 24) AS r FROM t", DateAdd2, DATE),
("SELECT dateadd(e, 24) AS r FROM t", DateAdd2, DATE),
("SELECT date_add(month, 1, e) AS r FROM t", DateAdd, TIMESTAMPTZ),
("SELECT dateadd(day, 24, e) AS r FROM t", DateAdd, TIMESTAMPTZ),
]:
with self.subTest(sql):
expression = annotate_types(
optimizer.qualify.qualify(
parse_one(sql, read="databricks"), schema=schema, dialect="databricks"
),
schema=schema,
dialect="databricks",
)
projected = expression.selects[0].this
self.assertIsInstance(projected, expected_class)
self.assertEqual(expected_type, projected.type.this)

def test_lateral_annotation(self):
expression = optimizer.optimize(
parse_one("SELECT c FROM (select 1 a) as x LATERAL VIEW EXPLODE (a) AS c")
Expand Down
Loading