feat!: add required param 'engine' to multimodal functions (#1834)

GarrettWu · tswast · gcf-owl-bot[bot] · web-flow · commit 37666e4c137d · 2025-06-18T09:42:47.000-05:00
* feat: add required param 'engine' to multimodal functions
* add missing engine to exif test
* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Tim Sweña (Swast) <tswast@gmail.com>
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Release-As: 2.8.0
diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
@@ -303,6 +303,7 @@ def get_runtime_json_str(
     def exif(
         self,
         *,
+        engine: Literal[None, "pillow"] = None,
         connection: Optional[str] = None,
         max_batching_rows: int = 8192,
         container_cpu: Union[float, int] = 0.33,
@@ -311,6 +312,7 @@ def exif(
         """Extract EXIF data. Now only support image types.
 
         Args:
+            engine ('pillow' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
             max_batching_rows (int, default 8,192): Max number of rows per batch sent to cloud run to execute the function.
             container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
@@ -319,6 +321,8 @@ def exif(
         Returns:
             bigframes.series.Series: JSON series of key-value pairs.
         """
+        if engine is None or engine.casefold() != "pillow":
+            raise ValueError("Must specify the engine, supported value is 'pillow'.")
 
         import bigframes.bigquery as bbq
         import bigframes.blob._functions as blob_func
@@ -344,6 +348,7 @@ def image_blur(
         self,
         ksize: tuple[int, int],
         *,
+        engine: Literal[None, "opencv"] = None,
         dst: Optional[Union[str, bigframes.series.Series]] = None,
         connection: Optional[str] = None,
         max_batching_rows: int = 8192,
@@ -354,6 +359,7 @@ def image_blur(
 
         Args:
             ksize (tuple(int, int)): Kernel size.
+            engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
                 str: GCS folder str. The output filenames are the same as the input files.
                 blob Series: The output file paths are determined by the uris of the blob Series.
@@ -367,6 +373,9 @@ def image_blur(
         Returns:
             bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
         """
+        if engine is None or engine.casefold() != "opencv":
+            raise ValueError("Must specify the engine, supported value is 'opencv'.")
+
         import bigframes.blob._functions as blob_func
 
         connection = self._resolve_connection(connection)
@@ -424,6 +433,7 @@ def image_resize(
         self,
         dsize: tuple[int, int] = (0, 0),
         *,
+        engine: Literal[None, "opencv"] = None,
         fx: float = 0.0,
         fy: float = 0.0,
         dst: Optional[Union[str, bigframes.series.Series]] = None,
@@ -436,6 +446,7 @@ def image_resize(
 
         Args:
             dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size.
+            engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size.
             fy (float, default 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size.
             dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
@@ -451,6 +462,9 @@ def image_resize(
         Returns:
             bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
         """
+        if engine is None or engine.casefold() != "opencv":
+            raise ValueError("Must specify the engine, supported value is 'opencv'.")
+
         dsize_set = dsize[0] > 0 and dsize[1] > 0
         fsize_set = fx > 0.0 and fy > 0.0
         if not dsize_set ^ fsize_set:
@@ -516,6 +530,7 @@ def image_resize(
     def image_normalize(
         self,
         *,
+        engine: Literal[None, "opencv"] = None,
         alpha: float = 1.0,
         beta: float = 0.0,
         norm_type: str = "l2",
@@ -528,6 +543,7 @@ def image_normalize(
         """Normalize images.
 
         Args:
+            engine ('opencv' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
             beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
             norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
@@ -544,6 +560,9 @@ def image_normalize(
         Returns:
             bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
         """
+        if engine is None or engine.casefold() != "opencv":
+            raise ValueError("Must specify the engine, supported value is 'opencv'.")
+
         import bigframes.blob._functions as blob_func
 
         connection = self._resolve_connection(connection)
@@ -604,6 +623,7 @@ def image_normalize(
     def pdf_extract(
         self,
         *,
+        engine: Literal[None, "pypdf"] = None,
         connection: Optional[str] = None,
         max_batching_rows: int = 1,
         container_cpu: Union[float, int] = 2,
@@ -613,6 +633,7 @@ def pdf_extract(
         """Extracts text from PDF URLs and saves the text as string.
 
         Args:
+            engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             connection (str or None, default None): BQ connection used for
                 function internet transactions, and the output blob if "dst"
                 is str. If None, uses default connection of the session.
@@ -631,6 +652,9 @@ def pdf_extract(
                 Contains the extracted text from the PDF file.
                 Includes error messages if verbosity is enabled.
         """
+        if engine is None or engine.casefold() != "pypdf":
+            raise ValueError("Must specify the engine, supported value is 'pypdf'.")
+
         import bigframes.bigquery as bbq
         import bigframes.blob._functions as blob_func
         import bigframes.pandas as bpd
@@ -663,6 +687,7 @@ def pdf_extract(
     def pdf_chunk(
         self,
         *,
+        engine: Literal[None, "pypdf"] = None,
         connection: Optional[str] = None,
         chunk_size: int = 2000,
         overlap_size: int = 200,
@@ -675,6 +700,7 @@ def pdf_chunk(
            arrays of strings.
 
         Args:
+            engine ('pypdf' or None, default None): The engine (bigquery or third party library) used for the function. The value must be specified.
             connection (str or None, default None): BQ connection used for
                 function internet transactions, and the output blob if "dst"
                 is str. If None, uses default connection of the session.
@@ -698,6 +724,8 @@ def pdf_chunk(
                 where each string is a chunk of text extracted from PDF.
                 Includes error messages if verbosity is enabled.
         """
+        if engine is None or engine.casefold() != "pypdf":
+            raise ValueError("Must specify the engine, supported value is 'pypdf'.")
 
         import bigframes.bigquery as bbq
         import bigframes.blob._functions as blob_func
@@ -740,6 +768,7 @@ def pdf_chunk(
     def audio_transcribe(
         self,
         *,
+        engine: Literal["bigquery"] = "bigquery",
         connection: Optional[str] = None,
         model_name: Optional[
             Literal[
@@ -753,6 +782,7 @@ def audio_transcribe(
         Transcribe audio content using a Gemini multimodal model.
 
         Args:
+            engine ('bigquery'): The engine (bigquery or third party library) used for the function.
             connection (str or None, default None): BQ connection used for
                 function internet transactions, and the output blob if "dst"
                 is str. If None, uses default connection of the session.
@@ -770,6 +800,9 @@ def audio_transcribe(
                 Contains the transcribed text from the audio file.
                 Includes error messages if verbosity is enabled.
         """
+        if engine.casefold() != "bigquery":
+            raise ValueError("Must specify the engine, supported value is 'bigquery'.")
+
         import bigframes.bigquery as bbq
         import bigframes.ml.llm as llm
         import bigframes.pandas as bpd
diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb
@@ -254,16 +254,17 @@
       "outputs": [],
       "source": [
         "df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n",
-        "    (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n",
+        "    (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n",
         ")\n",
         "df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n",
-        "    (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n",
+        "    (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n",
         ")\n",
         "df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n",
         "    alpha=50.0,\n",
         "    beta=150.0,\n",
         "    norm_type=\"minmax\",\n",
         "    dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n",
+        "    engine=\"opencv\",\n",
         ")"
       ]
     },
@@ -280,7 +281,7 @@
       "outputs": [],
       "source": [
         "# You can also chain functions together\n",
-        "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")"
+        "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")"
       ]
     },
     {
@@ -419,7 +420,7 @@
       },
       "outputs": [],
       "source": [
-        "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()"
+        "df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
       ]
     },
     {
diff --git a/samples/snippets/multimodal_test.py b/samples/snippets/multimodal_test.py
@@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
 
     # [START bigquery_dataframes_multimodal_dataframe_image_transform]
     df_image["blurred"] = df_image["image"].blob.image_blur(
-        (20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
+        (20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv"
     )
     df_image["resized"] = df_image["image"].blob.image_resize(
-        (300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
+        (300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv"
     )
     df_image["normalized"] = df_image["image"].blob.image_normalize(
         alpha=50.0,
         beta=150.0,
         norm_type="minmax",
         dst=f"{dst_bucket}/image_normalize_transformed/",
+        engine="opencv",
     )
 
     # You can also chain functions together
     df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
-        (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
+        (300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv"
     )
     df_image
     # [END bigquery_dataframes_multimodal_dataframe_image_transform]
@@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
     df_pdf = bpd.from_glob_path(
         "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
     )
-    df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
+    df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf")
     chunked = df_pdf["chunked"].explode()
     chunked
     # [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
@@ -61,7 +61,9 @@ def test_blob_exif(
         connection=bq_connection,
     )
 
-    actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection)
+    actual = exif_image_df["blob_col"].blob.exif(
+        engine="pillow", connection=bq_connection
+    )
     expected = bpd.Series(
         ['{"ExifOffset": 47, "Make": "MyCamera"}'],
         session=session,
@@ -86,7 +88,7 @@ def test_blob_image_blur_to_series(
     )
 
     actual = images_mm_df["blob_col"].blob.image_blur(
-        (8, 8), dst=series, connection=bq_connection
+        (8, 8), dst=series, connection=bq_connection, engine="opencv"
     )
     expected_df = pd.DataFrame(
         {
@@ -114,7 +116,7 @@ def test_blob_image_blur_to_folder(
     images_output_uris: list[str],
 ):
     actual = images_mm_df["blob_col"].blob.image_blur(
-        (8, 8), dst=images_output_folder, connection=bq_connection
+        (8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv"
     )
     expected_df = pd.DataFrame(
         {
@@ -136,7 +138,9 @@ def test_blob_image_blur_to_folder(
 
 
 def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
-    actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection)
+    actual = images_mm_df["blob_col"].blob.image_blur(
+        (8, 8), connection=bq_connection, engine="opencv"
+    )
 
     assert isinstance(actual, bpd.Series)
     assert len(actual) == 2
@@ -154,7 +158,7 @@ def test_blob_image_resize_to_series(
     )
 
     actual = images_mm_df["blob_col"].blob.image_resize(
-        (200, 300), dst=series, connection=bq_connection
+        (200, 300), dst=series, connection=bq_connection, engine="opencv"
     )
     expected_df = pd.DataFrame(
         {
@@ -182,7 +186,7 @@ def test_blob_image_resize_to_folder(
     images_output_uris: list[str],
 ):
     actual = images_mm_df["blob_col"].blob.image_resize(
-        (200, 300), dst=images_output_folder, connection=bq_connection
+        (200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv"
     )
     expected_df = pd.DataFrame(
         {
@@ -205,7 +209,7 @@ def test_blob_image_resize_to_folder(
 
 def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
     actual = images_mm_df["blob_col"].blob.image_resize(
-        (200, 300), connection=bq_connection
+        (200, 300), connection=bq_connection, engine="opencv"
     )
 
     assert isinstance(actual, bpd.Series)
@@ -224,7 +228,12 @@ def test_blob_image_normalize_to_series(
     )
 
     actual = images_mm_df["blob_col"].blob.image_normalize(
-        alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection
+        alpha=50.0,
+        beta=150.0,
+        norm_type="minmax",
+        dst=series,
+        connection=bq_connection,
+        engine="opencv",
     )
     expected_df = pd.DataFrame(
         {
@@ -257,6 +266,7 @@ def test_blob_image_normalize_to_folder(
         norm_type="minmax",
         dst=images_output_folder,
         connection=bq_connection,
+        engine="opencv",
     )
     expected_df = pd.DataFrame(
         {
@@ -279,7 +289,11 @@ def test_blob_image_normalize_to_folder(
 
 def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
     actual = images_mm_df["blob_col"].blob.image_normalize(
-        alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection
+        alpha=50.0,
+        beta=150.0,
+        norm_type="minmax",
+        connection=bq_connection,
+        engine="opencv",
     )
 
     assert isinstance(actual, bpd.Series)
@@ -322,7 +336,7 @@ def test_blob_pdf_extract(
 ):
     actual = (
         pdf_mm_df["pdf"]
-        .blob.pdf_extract(connection=bq_connection, verbose=verbose)
+        .blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf")
         .explode()
         .to_pandas()
     )
@@ -373,7 +387,11 @@ def test_blob_pdf_chunk(
     actual = (
         pdf_mm_df["pdf"]
         .blob.pdf_chunk(
-            connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
+            connection=bq_connection,
+            chunk_size=50,
+            overlap_size=10,
+            verbose=verbose,
+            engine="pypdf",
         )
         .explode()
         .to_pandas()

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,9 @@ def test_blob_exif(`
`61`	`61`	`connection=bq_connection,`
`62`	`62`	`)`
`63`	`63`
`64`		`- actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection)`
	`64`	`+ actual = exif_image_df["blob_col"].blob.exif(`
	`65`	`+ engine="pillow", connection=bq_connection`
	`66`	`+ )`
`65`	`67`	`expected = bpd.Series(`
`66`	`68`	`['{"ExifOffset": 47, "Make": "MyCamera"}'],`
`67`	`69`	`session=session,`
`@@ -86,7 +88,7 @@ def test_blob_image_blur_to_series(`
`86`	`88`	`)`
`87`	`89`
`88`	`90`	`actual = images_mm_df["blob_col"].blob.image_blur(`
`89`		`- (8, 8), dst=series, connection=bq_connection`
	`91`	`+ (8, 8), dst=series, connection=bq_connection, engine="opencv"`
`90`	`92`	`)`
`91`	`93`	`expected_df = pd.DataFrame(`
`92`	`94`	`{`
`@@ -114,7 +116,7 @@ def test_blob_image_blur_to_folder(`
`114`	`116`	`images_output_uris: list[str],`
`115`	`117`	`):`
`116`	`118`	`actual = images_mm_df["blob_col"].blob.image_blur(`
`117`		`- (8, 8), dst=images_output_folder, connection=bq_connection`
	`119`	`+ (8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv"`
`118`	`120`	`)`
`119`	`121`	`expected_df = pd.DataFrame(`
`120`	`122`	`{`
`@@ -136,7 +138,9 @@ def test_blob_image_blur_to_folder(`
`136`	`138`
`137`	`139`
`138`	`140`	`def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):`
`139`		`- actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection)`
	`141`	`+ actual = images_mm_df["blob_col"].blob.image_blur(`
	`142`	`+ (8, 8), connection=bq_connection, engine="opencv"`
	`143`	`+ )`
`140`	`144`
`141`	`145`	`assert isinstance(actual, bpd.Series)`
`142`	`146`	`assert len(actual) == 2`
`@@ -154,7 +158,7 @@ def test_blob_image_resize_to_series(`
`154`	`158`	`)`
`155`	`159`
`156`	`160`	`actual = images_mm_df["blob_col"].blob.image_resize(`
`157`		`- (200, 300), dst=series, connection=bq_connection`
	`161`	`+ (200, 300), dst=series, connection=bq_connection, engine="opencv"`
`158`	`162`	`)`
`159`	`163`	`expected_df = pd.DataFrame(`
`160`	`164`	`{`
`@@ -182,7 +186,7 @@ def test_blob_image_resize_to_folder(`
`182`	`186`	`images_output_uris: list[str],`
`183`	`187`	`):`
`184`	`188`	`actual = images_mm_df["blob_col"].blob.image_resize(`
`185`		`- (200, 300), dst=images_output_folder, connection=bq_connection`
	`189`	`+ (200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv"`
`186`	`190`	`)`
`187`	`191`	`expected_df = pd.DataFrame(`
`188`	`192`	`{`
`@@ -205,7 +209,7 @@ def test_blob_image_resize_to_folder(`
`205`	`209`
`206`	`210`	`def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):`
`207`	`211`	`actual = images_mm_df["blob_col"].blob.image_resize(`
`208`		`- (200, 300), connection=bq_connection`
	`212`	`+ (200, 300), connection=bq_connection, engine="opencv"`
`209`	`213`	`)`
`210`	`214`
`211`	`215`	`assert isinstance(actual, bpd.Series)`
`@@ -224,7 +228,12 @@ def test_blob_image_normalize_to_series(`
`224`	`228`	`)`
`225`	`229`
`226`	`230`	`actual = images_mm_df["blob_col"].blob.image_normalize(`
`227`		`- alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection`
	`231`	`+ alpha=50.0,`
	`232`	`+ beta=150.0,`
	`233`	`+ norm_type="minmax",`
	`234`	`+ dst=series,`
	`235`	`+ connection=bq_connection,`
	`236`	`+ engine="opencv",`
`228`	`237`	`)`
`229`	`238`	`expected_df = pd.DataFrame(`
`230`	`239`	`{`
`@@ -257,6 +266,7 @@ def test_blob_image_normalize_to_folder(`
`257`	`266`	`norm_type="minmax",`
`258`	`267`	`dst=images_output_folder,`
`259`	`268`	`connection=bq_connection,`
	`269`	`+ engine="opencv",`
`260`	`270`	`)`
`261`	`271`	`expected_df = pd.DataFrame(`
`262`	`272`	`{`
`@@ -279,7 +289,11 @@ def test_blob_image_normalize_to_folder(`
`279`	`289`
`280`	`290`	`def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):`
`281`	`291`	`actual = images_mm_df["blob_col"].blob.image_normalize(`
`282`		`- alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection`
	`292`	`+ alpha=50.0,`
	`293`	`+ beta=150.0,`
	`294`	`+ norm_type="minmax",`
	`295`	`+ connection=bq_connection,`
	`296`	`+ engine="opencv",`
`283`	`297`	`)`
`284`	`298`
`285`	`299`	`assert isinstance(actual, bpd.Series)`
`@@ -322,7 +336,7 @@ def test_blob_pdf_extract(`
`322`	`336`	`):`
`323`	`337`	`actual = (`
`324`	`338`	`pdf_mm_df["pdf"]`
`325`		`- .blob.pdf_extract(connection=bq_connection, verbose=verbose)`
	`339`	`+ .blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf")`
`326`	`340`	`.explode()`
`327`	`341`	`.to_pandas()`
`328`	`342`	`)`
`@@ -373,7 +387,11 @@ def test_blob_pdf_chunk(`
`373`	`387`	`actual = (`
`374`	`388`	`pdf_mm_df["pdf"]`
`375`	`389`	`.blob.pdf_chunk(`
`376`		`- connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose`
	`390`	`+ connection=bq_connection,`
	`391`	`+ chunk_size=50,`
	`392`	`+ overlap_size=10,`
	`393`	`+ verbose=verbose,`
	`394`	`+ engine="pypdf",`
`377`	`395`	`)`
`378`	`396`	`.explode()`
`379`	`397`	`.to_pandas()`