Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 073d62d

Browse files
authored
Merge branch 'main' into udf-refa
2 parents 1a78ddf + fed8039 commit 073d62d

18 files changed

Lines changed: 698 additions & 95 deletions

File tree

bigframes/core/block_transforms.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,24 +355,28 @@ def value_counts(
355355
normalize: bool = False,
356356
sort: bool = True,
357357
ascending: bool = False,
358-
dropna: bool = True,
358+
drop_na: bool = True,
359+
grouping_keys: typing.Sequence[str] = (),
359360
):
360-
block, dummy = block.create_constant(1)
361+
if grouping_keys and drop_na:
362+
# only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
363+
block = dropna(block, columns, how="any")
361364
block, agg_ids = block.aggregate(
362-
by_column_ids=columns,
363-
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
364-
dropna=dropna,
365+
by_column_ids=(*grouping_keys, *columns),
366+
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
367+
dropna=drop_na and not grouping_keys,
365368
)
366369
count_id = agg_ids[0]
367370
if normalize:
368-
unbound_window = windows.unbound()
371+
unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
369372
block, total_count_id = block.apply_window_op(
370373
count_id, agg_ops.sum_op, unbound_window
371374
)
372375
block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
373376

374377
if sort:
375-
block = block.order_by(
378+
order_parts = [ordering.ascending_over(id) for id in grouping_keys]
379+
order_parts.extend(
376380
[
377381
ordering.OrderingExpression(
378382
ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
382386
)
383387
]
384388
)
389+
block = block.order_by(order_parts)
385390
return block.select_column(count_id).with_column_labels(
386391
["proportion" if normalize else "count"]
387392
)

bigframes/core/compile/sqlglot/expressions/binary_compiler.py

Lines changed: 78 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
import bigframes_vendored.constants as constants
1718
import sqlglot.expressions as sge
1819

1920
from bigframes import dtypes
@@ -35,8 +36,83 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
3536
# String addition
3637
return sge.Concat(expressions=[left.expr, right.expr])
3738

38-
# Numerical addition
39-
return sge.Add(this=left.expr, expression=right.expr)
39+
if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype):
40+
left_expr = left.expr
41+
if left.dtype == dtypes.BOOL_DTYPE:
42+
left_expr = sge.Cast(this=left_expr, to="INT64")
43+
right_expr = right.expr
44+
if right.dtype == dtypes.BOOL_DTYPE:
45+
right_expr = sge.Cast(this=right_expr, to="INT64")
46+
return sge.Add(this=left_expr, expression=right_expr)
47+
48+
if (
49+
dtypes.is_time_or_date_like(left.dtype)
50+
and right.dtype == dtypes.TIMEDELTA_DTYPE
51+
):
52+
left_expr = left.expr
53+
if left.dtype == dtypes.DATE_DTYPE:
54+
left_expr = sge.Cast(this=left_expr, to="DATETIME")
55+
return sge.TimestampAdd(
56+
this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND")
57+
)
58+
if (
59+
dtypes.is_time_or_date_like(right.dtype)
60+
and left.dtype == dtypes.TIMEDELTA_DTYPE
61+
):
62+
right_expr = right.expr
63+
if right.dtype == dtypes.DATE_DTYPE:
64+
right_expr = sge.Cast(this=right_expr, to="DATETIME")
65+
return sge.TimestampAdd(
66+
this=right_expr, expression=left.expr, unit=sge.Var(this="MICROSECOND")
67+
)
68+
if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE:
69+
return sge.Add(this=left.expr, expression=right.expr)
70+
71+
raise TypeError(
72+
f"Cannot add type {left.dtype} and {right.dtype}. {constants.FEEDBACK_LINK}"
73+
)
74+
75+
76+
@BINARY_OP_REGISTRATION.register(ops.sub_op)
77+
def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
78+
if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype):
79+
left_expr = left.expr
80+
if left.dtype == dtypes.BOOL_DTYPE:
81+
left_expr = sge.Cast(this=left_expr, to="INT64")
82+
right_expr = right.expr
83+
if right.dtype == dtypes.BOOL_DTYPE:
84+
right_expr = sge.Cast(this=right_expr, to="INT64")
85+
return sge.Sub(this=left_expr, expression=right_expr)
86+
87+
if (
88+
dtypes.is_time_or_date_like(left.dtype)
89+
and right.dtype == dtypes.TIMEDELTA_DTYPE
90+
):
91+
left_expr = left.expr
92+
if left.dtype == dtypes.DATE_DTYPE:
93+
left_expr = sge.Cast(this=left_expr, to="DATETIME")
94+
return sge.TimestampSub(
95+
this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND")
96+
)
97+
if dtypes.is_time_or_date_like(left.dtype) and dtypes.is_time_or_date_like(
98+
right.dtype
99+
):
100+
left_expr = left.expr
101+
if left.dtype == dtypes.DATE_DTYPE:
102+
left_expr = sge.Cast(this=left_expr, to="DATETIME")
103+
right_expr = right.expr
104+
if right.dtype == dtypes.DATE_DTYPE:
105+
right_expr = sge.Cast(this=right_expr, to="DATETIME")
106+
return sge.TimestampDiff(
107+
this=left_expr, expression=right_expr, unit=sge.Var(this="MICROSECOND")
108+
)
109+
110+
if left.dtype == dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.TIMEDELTA_DTYPE:
111+
return sge.Sub(this=left.expr, expression=right.expr)
112+
113+
raise TypeError(
114+
f"Cannot subtract type {left.dtype} and {right.dtype}. {constants.FEEDBACK_LINK}"
115+
)
40116

41117

42118
@BINARY_OP_REGISTRATION.register(ops.ge_op)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Tuple, Union
19+
from typing import Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -372,6 +372,39 @@ def diff(self, periods=1) -> series.Series:
372372
)
373373
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
374374

375+
def value_counts(
376+
self,
377+
subset: Optional[Sequence[blocks.Label]] = None,
378+
normalize: bool = False,
379+
sort: bool = True,
380+
ascending: bool = False,
381+
dropna: bool = True,
382+
) -> Union[df.DataFrame, series.Series]:
383+
if subset is None:
384+
columns = self._selected_cols
385+
else:
386+
columns = [
387+
column
388+
for column in self._block.value_columns
389+
if self._block.col_id_to_label[column] in subset
390+
]
391+
block = self._block
392+
if self._dropna: # this drops null grouping columns
393+
block = block_ops.dropna(block, self._by_col_ids)
394+
block = block_ops.value_counts(
395+
block,
396+
columns,
397+
normalize=normalize,
398+
sort=sort,
399+
ascending=ascending,
400+
drop_na=dropna, # this drops null value columns
401+
grouping_keys=self._by_col_ids,
402+
)
403+
if self._as_index:
404+
return series.Series(block)
405+
else:
406+
return series.Series(block).to_frame().reset_index(drop=False)
407+
375408
@validations.requires_ordering()
376409
def rolling(
377410
self,

bigframes/core/groupby/series_group_by.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
244244

245245
aggregate = agg
246246

247+
def value_counts(
248+
self,
249+
normalize: bool = False,
250+
sort: bool = True,
251+
ascending: bool = False,
252+
dropna: bool = True,
253+
) -> Union[df.DataFrame, series.Series]:
254+
columns = [self._value_column]
255+
block = self._block
256+
if self._dropna: # this drops null grouping columns
257+
block = block_ops.dropna(block, self._by_col_ids)
258+
block = block_ops.value_counts(
259+
block,
260+
columns,
261+
normalize=normalize,
262+
sort=sort,
263+
ascending=ascending,
264+
drop_na=dropna, # this drops null value columns
265+
grouping_keys=self._by_col_ids,
266+
)
267+
# TODO: once as_index=False supported, return DataFrame instead by resetting index
268+
# with .to_frame().reset_index(drop=False)
269+
return series.Series(block)
270+
247271
@validations.requires_ordering()
248272
def cumsum(self, *args, **kwargs) -> series.Series:
249273
return self._apply_window_op(

bigframes/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def value_counts(
489489
self._block.index_columns,
490490
normalize=normalize,
491491
ascending=ascending,
492-
dropna=dropna,
492+
drop_na=dropna,
493493
)
494494
import bigframes.series as series
495495

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2475,7 +2475,7 @@ def value_counts(
24752475
normalize=normalize,
24762476
sort=sort,
24772477
ascending=ascending,
2478-
dropna=dropna,
2478+
drop_na=dropna,
24792479
)
24802480
return bigframes.series.Series(block)
24812481

bigframes/dtypes.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,10 @@ def is_time_like(type_: ExpressionType) -> bool:
289289
return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE)
290290

291291

292+
def is_time_or_date_like(type_: ExpressionType) -> bool:
293+
return type_ in (DATE_DTYPE, DATETIME_DTYPE, TIME_DTYPE, TIMESTAMP_DTYPE)
294+
295+
292296
def is_geo_like(type_: ExpressionType) -> bool:
293297
return type_ in (GEO_DTYPE,)
294298

bigframes/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1631,7 +1631,7 @@ def value_counts(
16311631
[self._value_column],
16321632
normalize=normalize,
16331633
ascending=ascending,
1634-
dropna=dropna,
1634+
drop_na=dropna,
16351635
)
16361636
return Series(block)
16371637

samples/snippets/quickstart.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
def run_quickstart(project_id: str) -> None:
1717
your_gcp_project_id = project_id
1818

19-
# [START bigquery_bigframes_quickstart]
19+
# [START bigquery_bigframes_quickstart_create_dataframe]
2020
import bigframes.pandas as bpd
2121

2222
# Set BigQuery DataFrames options
@@ -37,12 +37,16 @@ def run_quickstart(project_id: str) -> None:
3737

3838
# Efficiently preview the results using the .peek() method.
3939
df.peek()
40+
# [END bigquery_bigframes_quickstart_create_dataframe]
4041

42+
# [START bigquery_bigframes_quickstart_calculate_print]
4143
# Use the DataFrame just as you would a pandas DataFrame, but calculations
4244
# happen in the BigQuery query engine instead of the local system.
4345
average_body_mass = df["body_mass_g"].mean()
4446
print(f"average_body_mass: {average_body_mass}")
47+
# [END bigquery_bigframes_quickstart_calculate_print]
4548

49+
# [START bigquery_bigframes_quickstart_eval_metrics]
4650
# Create the Linear Regression model
4751
from bigframes.ml.linear_model import LinearRegression
4852

@@ -70,7 +74,7 @@ def run_quickstart(project_id: str) -> None:
7074
model = LinearRegression(fit_intercept=False)
7175
model.fit(X, y)
7276
model.score(X, y)
73-
# [END bigquery_bigframes_quickstart]
77+
# [END bigquery_bigframes_quickstart_eval_metrics]
7478

7579
# close session and reset option so not to affect other tests
7680
bpd.close_session()

tests/system/small/engines/test_numeric_ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def apply_op_pairwise(
5353
return new_arr
5454

5555

56-
@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
56+
@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
5757
def test_engines_project_add(
5858
scalars_array_value: array_value.ArrayValue,
5959
engine,
@@ -62,7 +62,7 @@ def test_engines_project_add(
6262
assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine)
6363

6464

65-
@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True)
65+
@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True)
6666
def test_engines_project_sub(
6767
scalars_array_value: array_value.ArrayValue,
6868
engine,

0 commit comments

Comments
 (0)