def _table_to_sql(table: "_BQ_TABLE_TYPES") -> str:
    """Return ``table`` as a backtick-quoted ``project.dataset.table`` path."""
    return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"


def _field_to_template_value(name: str, value: Any) -> str:
    """Convert value to SQL text that can be embedded in a SQL string.

    Args:
        name (str):
            Variable name. Only used to build the error message when
            ``value`` has an unsupported type.
        value (Any):
            Value to render. BigQuery table objects become backtick-quoted
            table paths; every other supported value is passed to
            ``bigframes.core.sql.simple_literal``.

    Returns:
        str: SQL text representing ``value``.

    Raises:
        TypeError: if ``value`` is not of a supported type.
    """
    import bigframes.core.sql  # Avoid circular imports

    _validate_type(name, value)

    table_types = typing.get_args(_BQ_TABLE_TYPES)
    if isinstance(value, table_types):
        return _table_to_sql(value)

    # TODO(tswast): convert DataFrame objects to gbq tables or a literals subquery.
    return bigframes.core.sql.simple_literal(value)


def _validate_type(name: str, value: Any) -> None:
    """Raise TypeError if value can't be rendered into a SQL template.

    Args:
        name (str):
            Variable name, reported in the error message.
        value (Any):
            Candidate value. ``None`` is always accepted.

    Raises:
        TypeError: if ``value`` is neither ``None``, a BigQuery table object,
            nor one of ``bigframes.core.sql.SIMPLE_LITERAL_TYPES``.
    """
    import bigframes.core.sql  # Avoid circular imports

    if value is None:
        return  # None can't be used in isinstance, but is a valid literal.

    supported_types = typing.get_args(_BQ_TABLE_TYPES) + typing.get_args(
        bigframes.core.sql.SIMPLE_LITERAL_TYPES
    )
    if not isinstance(value, supported_types):
        raise TypeError(
            f"{name} has unsupported type: {type(value)}. "
            f"Only {supported_types} are supported."
        )


def _parse_fields(sql_template: str) -> list[str]:
    """Return the replacement-field names referenced by ``sql_template``.

    Names are returned in order of appearance, once per occurrence. Fields
    nested inside a format spec (such as ``w`` in ``{x:{w}}``) are included,
    because ``str.format`` looks those up as well. Escaped braces
    (``{{`` / ``}}``) produce no field.

    Field names are reported verbatim as ``string.Formatter.parse`` yields
    them, so ``{t.attr}`` is reported as ``"t.attr"`` and ``{}`` as ``""``.
    """
    fields: list[str] = []
    for _, field_name, format_spec, _ in string.Formatter().parse(sql_template):
        if field_name is None:
            continue  # Trailing literal text with no replacement field.
        fields.append(field_name)
        if format_spec:
            # The parser hands back a nested spec such as "{w}" unparsed.
            fields.extend(_parse_fields(format_spec))
    return fields


def pyformat(
    sql_template: str,
    *,
    pyformat_args: dict,
    # TODO: add dry_run parameter to avoid expensive API calls in conversion
    # TODO: and session to upload data / convert to table if necessary
) -> str:
    """Python-style string formatting of a SQL string.

    Only some data types are supported. Each referenced variable is rendered
    to SQL text before it is substituted: BigQuery table objects become
    backtick-quoted table paths and all other supported values are rendered
    by ``bigframes.core.sql.simple_literal`` (for example, a ``str`` becomes
    a quoted SQL string literal). As a consequence a plain ``str`` can't be
    used as a table identifier; pass a table object instead.

    Variables present in ``pyformat_args`` but not referenced by the
    template are ignored, whatever their type.

    Only plain ``{var_name}`` fields are supported. Positional fields and
    attribute / index access are looked up verbatim in ``pyformat_args`` and
    so normally raise ``KeyError``.

    Args:
        sql_template (str):
            SQL string with 0+ {var_name}-style format options.
        pyformat_args (dict):
            Variable namespace to use for formatting.

    Returns:
        str: ``sql_template`` with every referenced variable replaced.

    Raises:
        TypeError: if a referenced variable is not of a supported type.
        KeyError: if a referenced variable is not found.
    """
    # Look up every referenced name before converting anything, so a missing
    # variable is reported without paying for conversions first. Collecting
    # into a dict also means a variable used several times is converted once.
    referenced = {name: pyformat_args[name] for name in _parse_fields(sql_template)}

    format_kwargs = {
        name: _field_to_template_value(name, value)
        for name, value in referenced.items()
    }

    return sql_template.format(**format_kwargs)
def test_read_gbq_colab_includes_formatted_scalars(session):
    """Referenced scalars are substituted and ``{{...}}`` is unescaped."""
    text_value = "This could be dangerous, but we esape it"
    query = """
        SELECT {some_integer} as some_integer,
        {some_string} as some_string,
        '{{escaped}}' as escaped
        """
    variables = {
        "some_integer": 123,
        "some_string": text_value,
        # This is not a supported type, but ignored if not referenced.
        "some_object": object(),
    }

    actual = session._read_gbq_colab(query, pyformat_args=variables).to_pandas()

    expected = pandas.DataFrame(
        {
            "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
            "some_string": pandas.Series([text_value], dtype="string[pyarrow]"),
            "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
        }
    )
    pandas.testing.assert_frame_equal(actual, expected)


def test_read_gbq_colab_includes_formatted_bigframes_dataframe(session):
    """Placeholder for DataFrame variables; currently exercises scalars only."""
    text_value = "This could be dangerous, but we esape it"
    query = """
        SELECT {some_integer} as some_integer,
        {some_string} as some_string,
        '{{escaped}}' as escaped
        """
    variables = {
        # TODO: put a bigframes DataFrame here.
        "some_integer": 123,
        "some_string": text_value,
        # This is not a supported type, but ignored if not referenced.
        "some_object": object(),
    }

    actual = session._read_gbq_colab(query, pyformat_args=variables).to_pandas()

    expected = pandas.DataFrame(
        {
            "some_integer": pandas.Series([123], dtype=pandas.Int64Dtype()),
            "some_string": pandas.Series([text_value], dtype="string[pyarrow]"),
            "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"),
        }
    )
    pandas.testing.assert_frame_equal(actual, expected)
@pytest.mark.parametrize(
    ("sql_template", "expected"),
    (
        (
            "{my_project}.{my_dataset}.{my_table}",
            ["my_project", "my_dataset", "my_table"],
        ),
        (
            "{{not a format variable}}",
            [],
        ),
    ),
)
def test_parse_fields(sql_template: str, expected: List[str]):
    """Field names are extracted; escaped braces yield no field."""
    fields = pyformat._parse_fields(sql_template)
    # Compare sorted copies: sorting in place would mutate the list object
    # owned by the parametrize decorator.
    assert sorted(fields) == sorted(expected)


def test_pyformat_with_unsupported_type_raises_typeerror():
    """A referenced variable of an unsupported type raises TypeError."""
    pyformat_args = {"my_object": object()}
    sql = "SELECT {my_object}"

    with pytest.raises(TypeError, match="my_object has unsupported type: "):
        pyformat.pyformat(sql, pyformat_args=pyformat_args)


def test_pyformat_with_missing_variable_raises_keyerror():
    """A referenced variable absent from the namespace raises KeyError."""
    pyformat_args: Dict[str, Any] = {}
    sql = "SELECT {my_object}"

    with pytest.raises(KeyError, match="my_object"):
        pyformat.pyformat(sql, pyformat_args=pyformat_args)


def test_pyformat_with_no_variables():
    """Without fields, only the ``{{`` / ``}}`` escapes are processed."""
    pyformat_args: Dict[str, Any] = {}
    sql = "SELECT '{{escaped curly brackets}}'"
    expected_sql = "SELECT '{escaped curly brackets}'"
    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
    assert got_sql == expected_sql


def test_pyformat_with_query_string_replaces_variables():
    """Scalars become SQL literals; ``@params`` and escapes are left alone."""
    pyformat_args = {
        "my_string": "some string value",
        "max_value": 2.25,
        "year": 2025,
        "null_value": None,
        # Unreferenced values of unsupported type shouldn't cause issues.
        "my_object": object(),
    }

    sql = """
    SELECT {year} - year AS age,
      @myparam AS myparam,
      '{{my_string}}' AS escaped_string,
      {my_string} AS my_string,
      {null_value} AS null_value,
    FROM my_dataset.my_table
    WHERE height < {max_value}
    """.strip()

    expected_sql = """
    SELECT 2025 - year AS age,
      @myparam AS myparam,
      '{my_string}' AS escaped_string,
      'some string value' AS my_string,
      NULL AS null_value,
    FROM my_dataset.my_table
    WHERE height < 2.25
    """.strip()

    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
    assert got_sql == expected_sql


@pytest.mark.parametrize(
    ("table", "expected_sql"),
    (
        (
            google.cloud.bigquery.Table("my-project.my_dataset.my_table"),
            "SELECT * FROM `my-project`.`my_dataset`.`my_table`",
        ),
        (
            google.cloud.bigquery.TableReference(
                google.cloud.bigquery.DatasetReference("some-project", "some_dataset"),
                "some_table",
            ),
            "SELECT * FROM `some-project`.`some_dataset`.`some_table`",
        ),
        (
            google.cloud.bigquery.table.TableListItem(
                {
                    "tableReference": {
                        "projectId": "ListedProject",
                        "datasetId": "ListedDataset",
                        "tableId": "ListedTable",
                    }
                }
            ),
            "SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`",
        ),
    ),
)
def test_pyformat_with_table_replaces_variables(table, expected_sql):
    """BigQuery table objects are rendered as backtick-quoted table paths."""
    pyformat_args = {
        "table": table,
        # Unreferenced values of unsupported type shouldn't cause issues.
        "my_object": object(),
    }
    sql = "SELECT * FROM {table}"
    got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args)
    assert got_sql == expected_sql