Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit 2e53e8f

Browse files
Add any, all, prod to dataframe.
Change-Id: I0d7d53c0a7174f9a1251bb8b6261d855db69fdf2
1 parent da17791 commit 2e53e8f

6 files changed

Lines changed: 150 additions & 13 deletions

File tree

bigframes/core/__init__.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -797,15 +797,20 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
797797
)
798798

799799
def transpose_single_row(
800-
self, labels, *, index_col_id: str = "index", value_col_id: str = "values"
800+
self,
801+
labels,
802+
*,
803+
index_col_id: str = "index",
804+
value_col_id: str = "values",
805+
dtype=pandas.Float64Dtype(),
801806
) -> ArrayValue:
802-
"""Pivot a single row into a 3 column expression with index, values and offsets. Only works if all values can be cast to float."""
807+
"""Pivot a single row into a 3 column expression with index, values and offsets. Only works if all values can be cast to a common type."""
803808
table = self.to_ibis_expr(ordering_mode="unordered")
804809
sub_expressions = []
805810
for i, col_id in enumerate(self._column_names.keys()):
806811
sub_expr = table.select(
807812
ibis_types.literal(labels[i]).name(index_col_id),
808-
_numeric_to_float(table[col_id]).name(value_col_id),
813+
ops.AsTypeOp(dtype)._as_ibis(table[col_id]).name(value_col_id),
809814
ibis_types.literal(i).name(ORDER_ID_COLUMN),
810815
)
811816
sub_expressions.append(sub_expr)
@@ -998,12 +1003,3 @@ def _as_identity(value: ibis_types.Value):
9981003
if value.type().is_float64() or value.type().is_geospatial():
9991004
return value.cast(ibis_dtypes.str)
10001005
return value
1001-
1002-
1003-
def _numeric_to_float(value: ibis_types.Value):
1004-
if value.type().is_float64():
1005-
return value
1006-
if value.type().is_boolean():
1007-
return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64)
1008-
else:
1009-
return value.cast(ibis_dtypes.float64)

bigframes/core/blocks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,12 +571,16 @@ def aggregate_all_and_pivot(
571571
*,
572572
value_col_id: str = "values",
573573
dropna: bool = True,
574+
dtype=pd.Float64Dtype(),
574575
) -> Block:
575576
aggregations = [(col_id, operation, col_id) for col_id in self.value_columns]
576577
result_expr = self.expr.aggregate(
577578
aggregations, dropna=dropna
578579
).transpose_single_row(
579-
labels=self.column_labels, index_col_id="index", value_col_id=value_col_id
580+
labels=self.column_labels,
581+
index_col_id="index",
582+
value_col_id=value_col_id,
583+
dtype=dtype,
580584
)
581585
return Block(result_expr, index_columns=["index"], column_labels=[None])
582586

bigframes/dataframe.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,30 @@ def dropna(self) -> DataFrame:
892892

893893
return DataFrame(block)
894894

895+
def any(
    self,
    *,
    bool_only: bool = False,
) -> bigframes.series.Series:
    # Per-column "any" reduction. With bool_only=True the non-boolean
    # columns are dropped up front; otherwise a non-boolean column makes
    # _raise_on_non_boolean raise NotImplementedError.
    if bool_only:
        bool_frame = self._drop_non_bool()
    else:
        bool_frame = self._raise_on_non_boolean("any")
    # Aggregate every remaining column and pivot the single result row into
    # a "values" column cast to the nullable boolean dtype.
    pivoted = bool_frame._block.aggregate_all_and_pivot(
        agg_ops.any_op, dtype=pd.BooleanDtype()
    )
    values = pivoted.select_column("values")
    return bigframes.series.Series(values)
908+
909+
def all(self, *, bool_only: bool = False) -> bigframes.series.Series:
    # Per-column "all" reduction. With bool_only=True the non-boolean
    # columns are dropped up front; otherwise a non-boolean column makes
    # _raise_on_non_boolean raise NotImplementedError.
    if bool_only:
        bool_frame = self._drop_non_bool()
    else:
        bool_frame = self._raise_on_non_boolean("all")
    # Aggregate every remaining column and pivot the single result row into
    # a "values" column cast to the nullable boolean dtype.
    pivoted = bool_frame._block.aggregate_all_and_pivot(
        agg_ops.all_op, dtype=pd.BooleanDtype()
    )
    values = pivoted.select_column("values")
    return bigframes.series.Series(values)
918+
895919
def sum(self, *, numeric_only: bool = False) -> bigframes.series.Series:
896920
if not numeric_only:
897921
frame = self._raise_on_non_numeric("sum")
@@ -940,6 +964,16 @@ def max(self, *, numeric_only: bool = False) -> bigframes.series.Series:
940964
block = frame._block.aggregate_all_and_pivot(agg_ops.max_op)
941965
return bigframes.series.Series(block.select_column("values"))
942966

967+
def prod(self, *, numeric_only: bool = False) -> bigframes.series.Series:
    # Per-column product. With numeric_only=True the non-numeric columns are
    # dropped up front; otherwise the frame is validated by
    # _raise_on_non_numeric before aggregating.
    if numeric_only:
        numeric_frame = self._drop_non_numeric()
    else:
        numeric_frame = self._raise_on_non_numeric("prod")
    # No dtype is passed, so aggregate_all_and_pivot uses its own default
    # for the pivoted "values" column.
    pivoted = numeric_frame._block.aggregate_all_and_pivot(agg_ops.product_op)
    values = pivoted.select_column("values")
    return bigframes.series.Series(values)


# `product` is exposed as a second name for `prod`.
product = prod
976+
943977
def count(self, *, numeric_only: bool = False) -> bigframes.series.Series:
944978
if not numeric_only:
945979
frame = self
@@ -960,6 +994,14 @@ def _drop_non_numeric(self) -> DataFrame:
960994
]
961995
return DataFrame(self._block.drop_columns(non_numeric_cols))
962996

997+
def _drop_non_bool(self) -> DataFrame:
    # Collect the ids of value columns whose dtype is not one of the
    # recognised boolean dtypes, then return a new frame without them.
    columns_to_drop = []
    for column_id, column_dtype in zip(
        self._block.value_columns, self._block.dtypes
    ):
        if column_dtype in bigframes.dtypes.BOOL_BIGFRAMES_TYPES:
            continue
        columns_to_drop.append(column_id)
    return DataFrame(self._block.drop_columns(columns_to_drop))
1004+
9631005
def _raise_on_non_numeric(self, op: str):
9641006
if not all(
9651007
dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES
@@ -970,6 +1012,16 @@ def _raise_on_non_numeric(self, op: str):
9701012
)
9711013
return self
9721014

1015+
def _raise_on_non_boolean(self, op: str):
    # Guard used by the boolean reductions: reject the frame if any column
    # dtype is outside BOOL_BIGFRAMES_TYPES, otherwise hand back `self` so
    # the caller can keep chaining. `op` is only used in the error message.
    has_non_bool_column = any(
        column_dtype not in bigframes.dtypes.BOOL_BIGFRAMES_TYPES
        for column_dtype in self._block.dtypes
    )
    if has_non_bool_column:
        raise NotImplementedError(
            f"'{op}' does not support non-bool columns. Set 'bool_only'=True to ignore non-bool columns"
        )
    return self
1024+
9731025
def merge(
9741026
self,
9751027
right: DataFrame,

bigframes/dtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@
6161
ibis_dtypes.Timestamp,
6262
]
6363

64+
# Dtypes accepted as boolean columns by bool-restricted operations
# (e.g. DataFrame.any / DataFrame.all).
BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()]
65+
6466
# Several operations are restricted to these types.
6567
NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()]
6668

@@ -296,4 +298,7 @@ def cast_ibis_value(value: ibis_types.Value, to_type: IbisDtype) -> ibis_types.V
296298
if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.string:
297299
return typing.cast(ibis_types.StringValue, value.cast(to_type)).capitalize()
298300

301+
if value.type() == ibis_dtypes.bool and to_type == ibis_dtypes.float64:
302+
return value.cast(ibis_dtypes.int64).cast(ibis_dtypes.float64)
303+
299304
raise TypeError(f"Unsupported cast {value.type()} to {to_type}")

tests/system/small/test_dataframe.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,39 @@ def test_dataframe_aggregates(scalars_df_index, scalars_pandas_df_index, op):
13051305
pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False)
13061306

13071307

1308+
@pytest.mark.parametrize(
    ("op"),
    [
        (lambda x: x.all(bool_only=True)),
        (lambda x: x.any(bool_only=True)),
    ],
    ids=["all", "any"],
)
def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op):
    # With bool_only=True pandas would skip a nullable 'boolean' column, so
    # the pandas side uses a plain 'bool' column (nulls filled with False)
    # and the expected result is cast back to 'boolean' for the comparison.
    plain_bool_col = scalars_pandas_df_index.bool_col.fillna(False).astype("bool")
    pandas_df = scalars_pandas_df_index.assign(bool_col=plain_bool_col)

    bf_series = op(scalars_df_index)
    expected = op(pandas_df).astype("boolean")
    actual = bf_series.compute()

    # The pandas result has an object-typed index, so index types are not compared.
    pd.testing.assert_series_equal(expected, actual, check_index_type=False)
1327+
1328+
1329+
def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index):
    numeric_cols = ["int64_too", "float64_col"]
    bf_series = scalars_df_index[numeric_cols].prod()
    # Pandas may produce narrower numeric types, but bigframes always
    # produces Float64, so the expected series is widened to match.
    expected = scalars_pandas_df_index[numeric_cols].prod().astype("Float64")
    actual = bf_series.compute()

    # The pandas result has an object-typed index, so index types are not compared.
    pd.testing.assert_series_equal(expected, actual, check_index_type=False)
1339+
1340+
13081341
@pytest.mark.parametrize(
13091342
("frac", "n", "random_state"),
13101343
[

third_party/bigframes_vendored/pandas/core/frame.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,53 @@ def merge(
874874
# ----------------------------------------------------------------------
875875
# ndarray-like stats methods
876876

877+
def any(self, *, bool_only: bool = False):
    """
    Return whether any element is True, potentially over an axis.

    Returns False unless there is at least one element within a Series or
    along a DataFrame axis that is True or equivalent (e.g. non-zero or
    non-empty).

    Args:
        bool_only (bool, default False):
            Include only boolean columns.

    Returns:
        Series
    """
    # Documentation-only stub: this body just signals an abstract method.
    raise NotImplementedError("abstract method")
893+
894+
def all(self, *, bool_only: bool = False):
    """
    Return whether all elements are True, potentially over an axis.

    Returns True unless there is at least one element within a Series or
    along a DataFrame axis that is False or equivalent (e.g. zero or
    empty).

    Args:
        bool_only (bool, default False):
            Include only boolean columns.

    Returns:
        Series
    """
    # Documentation-only stub: this body just signals an abstract method.
    raise NotImplementedError("abstract method")
910+
911+
def prod(self, *, numeric_only: bool = False):
    """
    Return the product of the values over the requested axis.

    Args:
        numeric_only (bool, default False):
            Include only float, int, boolean columns.

    Returns:
        Series
    """
    # Documentation-only stub: this body just signals an abstract method.
    raise NotImplementedError("abstract method")
923+
877924
def min(self, *, numeric_only: bool = False):
878925
"""Return the minimum of the values over the requested axis.
879926

0 commit comments

Comments
 (0)