Skip to content

Commit 37666e4

Browse files
GarrettWu, tswast, and gcf-owl-bot[bot]
authored
feat!: add required param 'engine' to multimodal functions (#1834)
* feat: add required param 'engine' to multimodal functions * add missing engine to exif test * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Tim Sweña (Swast) <[email protected]> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Release-As: 2.8.0
1 parent c670f9d commit 37666e4

File tree

4 files changed

+72
-19
lines changed

4 files changed

+72
-19
lines changed

bigframes/operations/blob.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ def get_runtime_json_str(
303303
def exif(
304304
self,
305305
*,
306+
engine: Literal[None, "pillow"] = None,
306307
connection: Optional[str] = None,
307308
max_batching_rows: int = 8192,
308309
container_cpu: Union[float, int] = 0.33,
@@ -311,6 +312,7 @@ def exif(
311312
"""Extract EXIF data. Currently only image types are supported.
312313
313314
Args:
315+
engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
314316
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
315317
max_batching_rows (int, default 8,192): Max number of rows per batch sent to Cloud Run to execute the function.
316318
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
@@ -319,6 +321,8 @@ def exif(
319321
Returns:
320322
bigframes.series.Series: JSON series of key-value pairs.
321323
"""
324+
if engine is None or engine.casefold() != "pillow":
325+
raise ValueError("Must specify the engine, supported value is 'pillow'.")
322326

323327
import bigframes.bigquery as bbq
324328
import bigframes.blob._functions as blob_func
@@ -344,6 +348,7 @@ def image_blur(
344348
self,
345349
ksize: tuple[int, int],
346350
*,
351+
engine: Literal[None, "opencv"] = None,
347352
dst: Optional[Union[str, bigframes.series.Series]] = None,
348353
connection: Optional[str] = None,
349354
max_batching_rows: int = 8192,
@@ -354,6 +359,7 @@ def image_blur(
354359
355360
Args:
356361
ksize (tuple(int, int)): Kernel size.
362+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
357363
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
358364
str: GCS folder str. The output filenames are the same as the input files.
359365
blob Series: The output file paths are determined by the uris of the blob Series.
@@ -367,6 +373,9 @@ def image_blur(
367373
Returns:
368374
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
369375
"""
376+
if engine is None or engine.casefold() != "opencv":
377+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
378+
370379
import bigframes.blob._functions as blob_func
371380

372381
connection = self._resolve_connection(connection)
@@ -424,6 +433,7 @@ def image_resize(
424433
self,
425434
dsize: tuple[int, int] = (0, 0),
426435
*,
436+
engine: Literal[None, "opencv"] = None,
427437
fx: float = 0.0,
428438
fy: float = 0.0,
429439
dst: Optional[Union[str, bigframes.series.Series]] = None,
@@ -436,6 +446,7 @@ def image_resize(
436446
437447
Args:
438448
dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size.
449+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
439450
fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size.
440451
fy (float, default 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size.
441452
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
@@ -451,6 +462,9 @@ def image_resize(
451462
Returns:
452463
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
453464
"""
465+
if engine is None or engine.casefold() != "opencv":
466+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
467+
454468
dsize_set = dsize[0] > 0 and dsize[1] > 0
455469
fsize_set = fx > 0.0 and fy > 0.0
456470
if not dsize_set ^ fsize_set:
@@ -516,6 +530,7 @@ def image_resize(
516530
def image_normalize(
517531
self,
518532
*,
533+
engine: Literal[None, "opencv"] = None,
519534
alpha: float = 1.0,
520535
beta: float = 0.0,
521536
norm_type: str = "l2",
@@ -528,6 +543,7 @@ def image_normalize(
528543
"""Normalize images.
529544
530545
Args:
546+
engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
531547
alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
532548
beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
533549
norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
@@ -544,6 +560,9 @@ def image_normalize(
544560
Returns:
545561
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
546562
"""
563+
if engine is None or engine.casefold() != "opencv":
564+
raise ValueError("Must specify the engine, supported value is 'opencv'.")
565+
547566
import bigframes.blob._functions as blob_func
548567

549568
connection = self._resolve_connection(connection)
@@ -604,6 +623,7 @@ def image_normalize(
604623
def pdf_extract(
605624
self,
606625
*,
626+
engine: Literal[None, "pypdf"] = None,
607627
connection: Optional[str] = None,
608628
max_batching_rows: int = 1,
609629
container_cpu: Union[float, int] = 2,
@@ -613,6 +633,7 @@ def pdf_extract(
613633
"""Extracts text from PDF URLs and saves the text as string.
614634
615635
Args:
636+
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
616637
connection (str or None, default None): BQ connection used for
617638
function internet transactions, and the output blob if "dst"
618639
is str. If None, uses default connection of the session.
@@ -631,6 +652,9 @@ def pdf_extract(
631652
Contains the extracted text from the PDF file.
632653
Includes error messages if verbosity is enabled.
633654
"""
655+
if engine is None or engine.casefold() != "pypdf":
656+
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
657+
634658
import bigframes.bigquery as bbq
635659
import bigframes.blob._functions as blob_func
636660
import bigframes.pandas as bpd
@@ -663,6 +687,7 @@ def pdf_extract(
663687
def pdf_chunk(
664688
self,
665689
*,
690+
engine: Literal[None, "pypdf"] = None,
666691
connection: Optional[str] = None,
667692
chunk_size: int = 2000,
668693
overlap_size: int = 200,
@@ -675,6 +700,7 @@ def pdf_chunk(
675700
arrays of strings.
676701
677702
Args:
703+
engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
678704
connection (str or None, default None): BQ connection used for
679705
function internet transactions, and the output blob if "dst"
680706
is str. If None, uses default connection of the session.
@@ -698,6 +724,8 @@ def pdf_chunk(
698724
where each string is a chunk of text extracted from PDF.
699725
Includes error messages if verbosity is enabled.
700726
"""
727+
if engine is None or engine.casefold() != "pypdf":
728+
raise ValueError("Must specify the engine, supported value is 'pypdf'.")
701729

702730
import bigframes.bigquery as bbq
703731
import bigframes.blob._functions as blob_func
@@ -740,6 +768,7 @@ def pdf_chunk(
740768
def audio_transcribe(
741769
self,
742770
*,
771+
engine: Literal["bigquery"] = "bigquery",
743772
connection: Optional[str] = None,
744773
model_name: Optional[
745774
Literal[
@@ -753,6 +782,7 @@ def audio_transcribe(
753782
Transcribe audio content using a Gemini multimodal model.
754783
755784
Args:
785+
engine ('bigquery'): The engine (bigquery or third party library) used for the function.
756786
connection (str or None, default None): BQ connection used for
757787
function internet transactions, and the output blob if "dst"
758788
is str. If None, uses default connection of the session.
@@ -770,6 +800,9 @@ def audio_transcribe(
770800
Contains the transcribed text from the audio file.
771801
Includes error messages if verbosity is enabled.
772802
"""
803+
if engine.casefold() != "bigquery":
804+
raise ValueError("Must specify the engine, supported value is 'bigquery'.")
805+
773806
import bigframes.bigquery as bbq
774807
import bigframes.ml.llm as llm
775808
import bigframes.pandas as bpd

notebooks/multimodal/multimodal_dataframe.ipynb

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,16 +254,17 @@
254254
"outputs": [],
255255
"source": [
256256
"df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n",
257-
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n",
257+
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n",
258258
")\n",
259259
"df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n",
260-
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n",
260+
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n",
261261
")\n",
262262
"df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n",
263263
" alpha=50.0,\n",
264264
" beta=150.0,\n",
265265
" norm_type=\"minmax\",\n",
266266
" dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n",
267+
" engine=\"opencv\",\n",
267268
")"
268269
]
269270
},
@@ -280,7 +281,7 @@
280281
"outputs": [],
281282
"source": [
282283
"# You can also chain functions together\n",
283-
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")"
284+
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")"
284285
]
285286
},
286287
{
@@ -419,7 +420,7 @@
419420
},
420421
"outputs": [],
421422
"source": [
422-
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()"
423+
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
423424
]
424425
},
425426
{

samples/snippets/multimodal_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
5656

5757
# [START bigquery_dataframes_multimodal_dataframe_image_transform]
5858
df_image["blurred"] = df_image["image"].blob.image_blur(
59-
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
59+
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv"
6060
)
6161
df_image["resized"] = df_image["image"].blob.image_resize(
62-
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
62+
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv"
6363
)
6464
df_image["normalized"] = df_image["image"].blob.image_normalize(
6565
alpha=50.0,
6666
beta=150.0,
6767
norm_type="minmax",
6868
dst=f"{dst_bucket}/image_normalize_transformed/",
69+
engine="opencv",
6970
)
7071

7172
# You can also chain functions together
7273
df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
73-
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
74+
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv"
7475
)
7576
df_image
7677
# [END bigquery_dataframes_multimodal_dataframe_image_transform]
@@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
113114
df_pdf = bpd.from_glob_path(
114115
"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
115116
)
116-
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
117+
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf")
117118
chunked = df_pdf["chunked"].explode()
118119
chunked
119120
# [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]

tests/system/large/blob/test_function.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ def test_blob_exif(
6161
connection=bq_connection,
6262
)
6363

64-
actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection)
64+
actual = exif_image_df["blob_col"].blob.exif(
65+
engine="pillow", connection=bq_connection
66+
)
6567
expected = bpd.Series(
6668
['{"ExifOffset": 47, "Make": "MyCamera"}'],
6769
session=session,
@@ -86,7 +88,7 @@ def test_blob_image_blur_to_series(
8688
)
8789

8890
actual = images_mm_df["blob_col"].blob.image_blur(
89-
(8, 8), dst=series, connection=bq_connection
91+
(8, 8), dst=series, connection=bq_connection, engine="opencv"
9092
)
9193
expected_df = pd.DataFrame(
9294
{
@@ -114,7 +116,7 @@ def test_blob_image_blur_to_folder(
114116
images_output_uris: list[str],
115117
):
116118
actual = images_mm_df["blob_col"].blob.image_blur(
117-
(8, 8), dst=images_output_folder, connection=bq_connection
119+
(8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv"
118120
)
119121
expected_df = pd.DataFrame(
120122
{
@@ -136,7 +138,9 @@ def test_blob_image_blur_to_folder(
136138

137139

138140
def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
139-
actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection)
141+
actual = images_mm_df["blob_col"].blob.image_blur(
142+
(8, 8), connection=bq_connection, engine="opencv"
143+
)
140144

141145
assert isinstance(actual, bpd.Series)
142146
assert len(actual) == 2
@@ -154,7 +158,7 @@ def test_blob_image_resize_to_series(
154158
)
155159

156160
actual = images_mm_df["blob_col"].blob.image_resize(
157-
(200, 300), dst=series, connection=bq_connection
161+
(200, 300), dst=series, connection=bq_connection, engine="opencv"
158162
)
159163
expected_df = pd.DataFrame(
160164
{
@@ -182,7 +186,7 @@ def test_blob_image_resize_to_folder(
182186
images_output_uris: list[str],
183187
):
184188
actual = images_mm_df["blob_col"].blob.image_resize(
185-
(200, 300), dst=images_output_folder, connection=bq_connection
189+
(200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv"
186190
)
187191
expected_df = pd.DataFrame(
188192
{
@@ -205,7 +209,7 @@ def test_blob_image_resize_to_folder(
205209

206210
def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
207211
actual = images_mm_df["blob_col"].blob.image_resize(
208-
(200, 300), connection=bq_connection
212+
(200, 300), connection=bq_connection, engine="opencv"
209213
)
210214

211215
assert isinstance(actual, bpd.Series)
@@ -224,7 +228,12 @@ def test_blob_image_normalize_to_series(
224228
)
225229

226230
actual = images_mm_df["blob_col"].blob.image_normalize(
227-
alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection
231+
alpha=50.0,
232+
beta=150.0,
233+
norm_type="minmax",
234+
dst=series,
235+
connection=bq_connection,
236+
engine="opencv",
228237
)
229238
expected_df = pd.DataFrame(
230239
{
@@ -257,6 +266,7 @@ def test_blob_image_normalize_to_folder(
257266
norm_type="minmax",
258267
dst=images_output_folder,
259268
connection=bq_connection,
269+
engine="opencv",
260270
)
261271
expected_df = pd.DataFrame(
262272
{
@@ -279,7 +289,11 @@ def test_blob_image_normalize_to_folder(
279289

280290
def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
281291
actual = images_mm_df["blob_col"].blob.image_normalize(
282-
alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection
292+
alpha=50.0,
293+
beta=150.0,
294+
norm_type="minmax",
295+
connection=bq_connection,
296+
engine="opencv",
283297
)
284298

285299
assert isinstance(actual, bpd.Series)
@@ -322,7 +336,7 @@ def test_blob_pdf_extract(
322336
):
323337
actual = (
324338
pdf_mm_df["pdf"]
325-
.blob.pdf_extract(connection=bq_connection, verbose=verbose)
339+
.blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf")
326340
.explode()
327341
.to_pandas()
328342
)
@@ -373,7 +387,11 @@ def test_blob_pdf_chunk(
373387
actual = (
374388
pdf_mm_df["pdf"]
375389
.blob.pdf_chunk(
376-
connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
390+
connection=bq_connection,
391+
chunk_size=50,
392+
overlap_size=10,
393+
verbose=verbose,
394+
engine="pypdf",
377395
)
378396
.explode()
379397
.to_pandas()

0 commit comments

Comments
 (0)