diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 0bbbc418e6..62c79b5615 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -57,6 +57,7 @@ to_json, to_json_string, ) +from bigframes.bigquery._operations.mathematical import rand from bigframes.bigquery._operations.search import create_vector_index, vector_search from bigframes.bigquery._operations.sql import sql_scalar from bigframes.bigquery._operations.struct import struct @@ -97,6 +98,8 @@ parse_json, to_json, to_json_string, + # mathematical ops + rand, # search ops create_vector_index, vector_search, @@ -148,6 +151,8 @@ "parse_json", "to_json", "to_json_string", + # mathematical ops + "rand", # search ops "create_vector_index", "vector_search", diff --git a/bigframes/bigquery/_operations/mathematical.py b/bigframes/bigquery/_operations/mathematical.py new file mode 100644 index 0000000000..a5d6f9abd3 --- /dev/null +++ b/bigframes/bigquery/_operations/mathematical.py @@ -0,0 +1,68 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def rand(input_data: Union[series.Series, dataframe.DataFrame]) -> series.Series:
    """
    Produce a Series of pseudo-random FLOAT64 values drawn from [0, 1).

    Zero is a possible result; one is not.

    .. warning::
        The generated expression is non-deterministic. Reading the same
        column twice may yield different values.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"a": [1, 2, 3]})
        >>> df['random'] = bbq.rand(df)
        >>> # Resulting column 'random' will contain random floats between 0 and 1.

    Args:
        input_data (bigframes.pandas.Series or bigframes.pandas.DataFrame):
            Object the result is anchored to, determining the number of rows
            and the index of the result. For a DataFrame the first column is
            used. The values themselves are ignored: the ``RAND()`` template
            references no input.

    Returns:
        bigframes.pandas.Series: A new Series of random float values.

    Raises:
        ValueError: If ``input_data`` is a DataFrame without any columns.
        TypeError: If ``input_data`` is neither a Series nor a DataFrame.
    """
    if isinstance(input_data, series.Series):
        row_source = input_data
    elif isinstance(input_data, dataframe.DataFrame):
        column_count = len(input_data.columns)
        if column_count == 0:
            raise ValueError("Input DataFrame must have at least one column.")
        # Any column would do; the first one is picked as the anchor.
        row_source = input_data.iloc[:, 0]
    else:
        raise TypeError(
            f"Unsupported type {type(input_data)}. "
            "Expected bigframes.pandas.Series or bigframes.pandas.DataFrame."
        )

    # Flag the op as non-deterministic, since RAND() differs per evaluation.
    rand_op = ops.SqlScalarOp(
        sql_template="RAND()",
        _output_type=dtypes.FLOAT_DTYPE,
        is_deterministic=False,
    )
    return row_source._apply_nary_op(rand_op, [])
def test_rand(scalars_df_index):
    """Check that bbq.rand yields one value in [0, 1) per input row."""
    source_df = scalars_df_index

    # Build the random column and materialize it locally.
    random_values = bbq.rand(source_df).to_pandas()

    # One random value per source row.
    assert len(random_values) == len(source_df)

    # Every value lies in the half-open interval [0, 1).
    assert random_values.ge(0).all()
    assert random_values.lt(1).all()

    # With more than one row, identical draws everywhere would be
    # astronomically unlikely for a working random generator.
    if len(random_values) > 1:
        assert random_values.nunique() > 1
def _assert_rand_op_applied(anchor):
    """Assert that ``anchor`` received exactly one ``RAND()`` scalar op.

    Shared by both tests so the Series and DataFrame code paths are held to
    the same contract: FLOAT output, non-deterministic, no extra operands.
    """
    anchor._apply_nary_op.assert_called_once()
    args, _ = anchor._apply_nary_op.call_args
    op = args[0]
    assert isinstance(op, ops.SqlScalarOp)
    assert op.sql_template == "RAND()"
    assert op._output_type == dtypes.FLOAT_DTYPE
    assert op.deterministic is False
    assert args[1] == []


def test_rand_calls_apply_nary_op():
    """rand() on a Series applies the RAND() op directly to that Series."""
    mock_series = mock.create_autospec(series.Series, instance=True)

    bbq.rand(mock_series)

    _assert_rand_op_applied(mock_series)


def test_rand_with_dataframe():
    """rand() on a DataFrame anchors the RAND() op on the first column."""
    mock_df = mock.create_autospec(dataframe.DataFrame, instance=True)
    # rand() rejects column-less frames, so expose a single column.
    mock_df.columns = ["col1"]
    mock_series = mock.create_autospec(series.Series, instance=True)
    # iloc is a property returning an indexer, so patch the property on the
    # mock's type and have the indexer's __getitem__ return the anchor Series.
    mock_indexer = mock.MagicMock()
    mock_indexer.__getitem__.return_value = mock_series
    type(mock_df).iloc = mock.PropertyMock(return_value=mock_indexer)

    bbq.rand(mock_df)

    # The anchor must be the first column, i.e. ``iloc[:, 0]``.
    mock_indexer.__getitem__.assert_called_once_with((slice(None), 0))
    _assert_rand_op_applied(mock_series)