This repository was archived by the owner on Apr 1, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 69
Expand file tree
/
Copy pathgbq.py
More file actions
180 lines (153 loc) · 8.5 KB
/
gbq.py
File metadata and controls
180 lines (153 loc) · 8.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/io/gbq.py
""" Google BigQuery support """
from __future__ import annotations
from typing import Any, Dict, Iterable, Literal, Optional, Tuple, Union
from bigframes import constants
import bigframes.enums
# Comparison operators accepted in the middle position of a filter tuple.
FilterOps = Literal["in", "not in", "<", "<=", "==", "!=", ">=", ">", "LIKE"]
# A single filter predicate: (column_name, operator, value).
FilterType = Tuple[str, FilterOps, Any]
# Either a flat iterable of predicates (implicitly AND-ed) or an iterable of
# such groups (groups are OR-ed together; predicates within a group are AND-ed).
FiltersType = Union[Iterable[FilterType], Iterable[Iterable[FilterType]]]
class GBQIOMixin:
    """Mixin declaring the ``read_gbq`` entry point for BigQuery I/O.

    Concrete sessions/engines mix this in and override :meth:`read_gbq`;
    the implementation here only documents the contract and raises
    ``NotImplementedError``.
    """

    def read_gbq(
        self,
        query_or_table: str,
        *,
        index_col: Union[Iterable[str], str, bigframes.enums.DefaultIndexKind] = (),
        columns: Iterable[str] = (),
        configuration: Optional[Dict] = None,
        max_results: Optional[int] = None,
        filters: FiltersType = (),
        use_cache: Optional[bool] = None,
        col_order: Iterable[str] = (),
        allow_large_results: bool = True,
    ):
        """Loads a DataFrame from BigQuery.

        BigQuery tables are an unordered, unindexed data source. To add
        support for pandas-compatibility, the following indexing options
        are supported via the ``index_col`` parameter:

        * (Empty iterable, default) A default index. **Behavior may change.**
          Explicitly set ``index_col`` if your application makes use of
          specific index values.

          If a table has primary key(s), those are used as the index,
          otherwise a sequential index is generated.
        * (:attr:`bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64`) Add an
          arbitrary sequential index and ordering. **Warning** This uses an
          analytic windowed operation that prevents filtering push down. Avoid
          using on large clustered or partitioned tables.
        * (Recommended) Set the ``index_col`` argument to one or more columns.
          Unique values for the row labels are recommended. Duplicate labels
          are possible, but note that joins on a non-unique index can duplicate
          rows via pandas-compatible outer join behavior.

        .. note::
            By default, even SQL query inputs with an ORDER BY clause create a
            DataFrame with an arbitrary ordering. Use ``row_number() OVER
            (ORDER BY ...) AS rowindex`` in your SQL query and set
            ``index_col='rowindex'`` to preserve the desired ordering.

            If your query doesn't have an ordering, select ``GENERATE_UUID() AS
            rowindex`` in your SQL and set ``index_col='rowindex'`` for the
            best performance.

        **Examples:**

            >>> import bigframes.pandas as bpd
            >>> bpd.options.display.progress_bar = None

        If the input is a table ID:

            >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

        Read table path with wildcard suffix and filters:

            >>> df = bpd.read_gbq_table("bigquery-public-data.noaa_gsod.gsod19*", filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")])

        Preserve ordering in a query input.

            >>> df = bpd.read_gbq('''
            ...     SELECT
            ...        -- Instead of an ORDER BY clause on the query, use
            ...        -- ROW_NUMBER() to create an ordered DataFrame.
            ...        ROW_NUMBER() OVER (ORDER BY AVG(pitchSpeed) DESC)
            ...          AS rowindex,
            ...
            ...        pitcherFirstName,
            ...        pitcherLastName,
            ...        AVG(pitchSpeed) AS averagePitchSpeed
            ...     FROM `bigquery-public-data.baseball.games_wide`
            ...     WHERE year = 2016
            ...     GROUP BY pitcherFirstName, pitcherLastName
            ... ''', index_col="rowindex")
            >>> df.head(2)
                     pitcherFirstName pitcherLastName  averagePitchSpeed
            rowindex
            1                Albertin         Chapman          96.514113
            2                 Zachary         Britton          94.591039
            <BLANKLINE>
            [2 rows x 3 columns]

        Reading data with `columns` and `filters` parameters:

            >>> columns = ['pitcherFirstName', 'pitcherLastName', 'year', 'pitchSpeed']
            >>> filters = [('year', '==', 2016), ('pitcherFirstName', 'in', ['John', 'Doe']), ('pitcherLastName', 'in', ['Gant']), ('pitchSpeed', '>', 94)]
            >>> df = bpd.read_gbq(
            ...             "bigquery-public-data.baseball.games_wide",
            ...             columns=columns,
            ...             filters=filters,
            ...         )
            >>> df.head(1)
              pitcherFirstName pitcherLastName  year  pitchSpeed
            0             John            Gant  2016          95
            <BLANKLINE>
            [1 rows x 4 columns]

        Args:
            query_or_table (str):
                A SQL string to be executed or a BigQuery table to be read. The
                table must be specified in the format of
                `project.dataset.tablename` or `dataset.tablename`.
                Can also take wildcard table name, such as `project.dataset.table_prefix*`.
                In that case, all the matched tables will be read as one DataFrame.
            index_col (Iterable[str], str, bigframes.enums.DefaultIndexKind):
                Name of result column(s) to use for index in results DataFrame.

                If an empty iterable, such as ``()``, a default index is
                generated. Do not depend on specific index values in this case.

                **New in bigframes version 1.3.0**: If ``index_cols`` is not
                set, the primary key(s) of the table are used as the index.

                **New in bigframes version 1.4.0**: Support
                :class:`bigframes.enums.DefaultIndexKind` to override default index
                behavior.
            columns (Iterable[str]):
                List of BigQuery column names in the desired order for results
                DataFrame.
            configuration (dict, optional):
                Query config parameters for job processing.
                For example: configuration = {'query': {'useQueryCache': False}}.
                For more information see `BigQuery REST API Reference
                <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
            max_results (Optional[int], default None):
                If set, limit the maximum number of rows to fetch from the
                query results.
            filters (Union[Iterable[FilterType], Iterable[Iterable[FilterType]]], default ()): To
                filter out data. Filter syntax: [[(column, op, val), …],…] where
                op is [==, >, >=, <, <=, !=, in, not in, LIKE]. The innermost tuples
                are transposed into a set of filters applied through an AND
                operation. The outer Iterable combines these sets of filters
                through an OR operation. A single Iterable of tuples can also
                be used, meaning that no OR operation between set of filters
                is to be conducted.

                If using wildcard table suffix in query_or_table, can specify
                '_table_suffix' pseudo column to filter the tables to be read
                into the DataFrame.
            use_cache (Optional[bool], default None):
                Caches query results if set to `True`. When `None`, it behaves
                as `True`, but should not be combined with `useQueryCache` in
                `configuration` to avoid conflicts.
            col_order (Iterable[str]):
                Alias for columns, retained for backwards compatibility.
            allow_large_results (bool, optional):
                Whether to allow large query results. If ``True``, the query
                results can be larger than the maximum response size. This
                option is only applicable when ``query_or_table`` is a query.
                Defaults to ``True``.

        Raises:
            bigframes.exceptions.DefaultIndexWarning:
                Using the default index is discouraged, such as with clustered
                or partitioned tables without primary keys.
            ValueError:
                When both ``columns`` and ``col_order`` are specified.
            ValueError:
                If ``configuration`` is specified when directly reading
                from a table.

        Returns:
            bigframes.pandas.DataFrame:
                A DataFrame representing results of the query or table.
        """
        # Abstract hook: subclasses (concrete sessions) provide the real
        # implementation; this mixin only defines the shared contract.
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)