Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,4 @@ system_tests/local_test_setup
# Make sure a generated file isn't accidentally committed.
pylintrc
pylintrc.test
dummy.pkl
37 changes: 37 additions & 0 deletions conftest.py
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a simplified version of the file at #2147

Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

import bigframes._config
import bigframes.pandas as bpd


@pytest.fixture(autouse=True)
def default_doctest_imports(doctest_namespace):
    """
    Pre-populate the doctest namespace with the usual module aliases.

    Because the fixture is ``autouse``, every collected doctest can refer to
    ``np``, ``pd``, ``pa`` and ``bpd`` without importing them in the example
    itself. The BigQuery DataFrames ``display.progress_bar`` option is also
    set to ``None`` (presumably so progress output never ends up in the
    expected doctest output -- confirm against the display options docs).

    See: https://docs.pytest.org/en/stable/how-to/doctest.html#doctest-namespace-fixture
    """
    # Alias -> module pairs made available to every docstring example.
    injected_aliases = (
        ("np", np),
        ("pd", pd),
        ("pa", pa),
        ("bpd", bpd),
    )
    for alias, module in injected_aliases:
        doctest_namespace[alias] = module

    # Global option; it is set on every test and not restored afterwards.
    bigframes._config.options.display.progress_bar = None
54 changes: 29 additions & 25 deletions third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6978,7 +6978,7 @@ def query(self, expr: str) -> DataFrame | None:

def interpolate(self, method: str = "linear"):
"""
Fill NaN values using an interpolation method.
Fill NA (NULL in BigQuery) values using an interpolation method.

**Examples:**

Expand Down Expand Up @@ -7028,35 +7028,39 @@ def interpolate(self, method: str = "linear"):

def fillna(self, value):
"""
Fill NA/NaN values using the specified method.
Fill NA (NULL in BigQuery) values using the specified method.

**Examples:**
Note that empty strings ``''``, :attr:`numpy.inf`, and
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
logic differs from numpy, but it is the same as BigQuery and the
:class:`pandas.ArrowDtype`.

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
**Examples:**

>>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
... [3, 4, np.nan, 1],
... [np.nan, np.nan, np.nan, np.nan],
... [np.nan, 3, np.nan, 4]],
... columns=list("ABCD")).astype("Float64")
>>> df = bpd.DataFrame(
... [
... pa.array([np.nan, 2, None, 0], type=pa.float64()),
... pa.array([3, np.nan, None, 1], type=pa.float64()),
... pa.array([None, None, np.nan, None], type=pa.float64()),
... pa.array([4, 5, None, np.nan], type=pa.float64()),
... ], columns=list("ABCD"), dtype=pd.ArrowDtype(pa.float64()))
>>> df
A B C D
0 <NA> 2.0 <NA> 0.0
1 3.0 4.0 <NA> 1.0
2 <NA> <NA> <NA> <NA>
3 <NA> 3.0 <NA> 4.0
A B C D
0 NaN 2.0 <NA> 0.0
1 3.0 NaN <NA> 1.0
2 <NA> <NA> NaN <NA>
3 4.0 5.0 <NA> NaN
<BLANKLINE>
[4 rows x 4 columns]

Replace all NA elements with 0s.
Replace all NA (NULL) elements with 0s.

>>> df.fillna(0)
A B C D
0 0.0 2.0 0.0 0.0
1 3.0 4.0 0.0 1.0
2 0.0 0.0 0.0 0.0
3 0.0 3.0 0.0 4.0
0 NaN 2.0 0.0 0.0
1 3.0 NaN 0.0 1.0
2 0.0 0.0 NaN 0.0
3 4.0 5.0 0.0 NaN
<BLANKLINE>
[4 rows x 4 columns]

Expand All @@ -7072,11 +7076,11 @@ def fillna(self, value):
<BLANKLINE>
[3 rows x 4 columns]
>>> df.fillna(df_fill)
A B C D
0 0.0 2.0 2.0 0.0
1 3.0 4.0 6.0 1.0
2 8.0 9.0 10.0 11.0
3 <NA> 3.0 <NA> 4.0
A B C D
0 NaN 2.0 2.0 0.0
1 3.0 NaN 6.0 1.0
2 8.0 9.0 NaN 11.0
3 4.0 5.0 <NA> NaN
<BLANKLINE>
[4 rows x 4 columns]

Expand Down
87 changes: 50 additions & 37 deletions third_party/bigframes_vendored/pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -816,75 +816,88 @@ def bfill(self, *, limit: Optional[int] = None):
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def isna(self) -> NDFrame:
"""Detect missing values.
"""Detect missing (NULL) values.

Return a boolean same-sized object indicating if the values are NA.
NA values get mapped to True values. Everything else gets mapped to
False values. Characters such as empty strings ``''`` or
:attr:`numpy.inf` are not considered NA values.
Return a boolean same-sized object indicating if the values are NA
(NULL in BigQuery). NA/NULL values get mapped to True values.
Everything else gets mapped to False values.

**Examples:**
Note that empty strings ``''``, :attr:`numpy.inf`, and
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
logic differs from numpy, but it is the same as BigQuery and the
:class:`pandas.ArrowDtype`.

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> import numpy as np
**Examples:**

>>> df = bpd.DataFrame(dict(
... age=[5, 6, np.nan],
... born=[bpd.NA, "1940-04-25", "1940-04-25"],
... name=['Alfred', 'Batman', ''],
... toy=[None, 'Batmobile', 'Joker'],
... age=pd.Series(pa.array(
... [5, 6, None, 4],
... type=pa.int64(),
... ), dtype=pd.ArrowDtype(pa.int64())),
... born=pd.to_datetime([pd.NA, "1940-04-25", "1940-04-25", "1941-08-25"]),
... name=['Alfred', 'Batman', '', 'Plastic Man'],
... toy=[None, 'Batmobile', 'Joker', 'Play dough'],
... height=pd.Series(pa.array(
... [6.1, 5.9, None, np.nan],
... type=pa.float64(),
... ), dtype=pd.ArrowDtype(pa.float64())),
... ))
>>> df
age born name toy
0 5.0 <NA> Alfred <NA>
1 6.0 1940-04-25 Batman Batmobile
2 <NA> 1940-04-25 Joker
age born name toy height
0 5 <NA> Alfred <NA> 6.1
1 6 1940-04-25 00:00:00 Batman Batmobile 5.9
2 <NA> 1940-04-25 00:00:00 Joker <NA>
3 4 1941-08-25 00:00:00 Plastic Man Play dough NaN
<BLANKLINE>
[3 rows x 4 columns]
[4 rows x 5 columns]

Show which entries in a DataFrame are NA:
Show which entries in a DataFrame are NA (NULL in BigQuery):

>>> df.isna()
age born name toy
0 False True False True
1 False False False False
2 True False False False
age born name toy height
0 False True False True False
1 False False False False False
2 True False False False True
3 False False False False False
<BLANKLINE>
[3 rows x 4 columns]
[4 rows x 5 columns]

>>> df.isnull()
age born name toy
0 False True False True
1 False False False False
2 True False False False
age born name toy height
0 False True False True False
1 False False False False False
2 True False False False True
3 False False False False False
<BLANKLINE>
[3 rows x 4 columns]
[4 rows x 5 columns]

Show which entries in a Series are NA:
Show which entries in a Series are NA (NULL in BigQuery):

>>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
>>> ser = bpd.Series(pa.array(
... [5, None, 6, np.nan, None],
... type=pa.float64(),
... ), dtype=pd.ArrowDtype(pa.float64()))
>>> ser
0 5
0 5.0
1 <NA>
2 6
3 <NA>
2 6.0
3 NaN
4 <NA>
dtype: Int64
dtype: Float64

>>> ser.isna()
0 False
1 True
2 False
3 True
3 False
4 True
dtype: boolean

>>> ser.isnull()
0 False
1 True
2 False
3 True
3 False
4 True
dtype: boolean

Expand Down
20 changes: 13 additions & 7 deletions third_party/bigframes_vendored/pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,17 +957,23 @@ def value_counts(

def fillna(self, value) -> Index:
"""
Fill NA/NaN values with the specified value.
Fill NA (NULL in BigQuery) values with the specified value.

**Examples:**
Note that empty strings ``''``, :attr:`numpy.inf`, and
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
logic differs from numpy, but it is the same as BigQuery and the
:class:`pandas.ArrowDtype`.

>>> import bigframes.pandas as bpd
>>> import numpy as np
>>> bpd.options.display.progress_bar = None
**Examples:**

>>> idx = bpd.Index([np.nan, np.nan, 3])
>>> idx = bpd.Index(
... pa.array([None, np.nan, 3, None], type=pa.float64()),
... dtype=pd.ArrowDtype(pa.float64()),
... )
>>> idx
Index([<NA>, nan, 3.0, <NA>], dtype='Float64')
>>> idx.fillna(0)
Index([0.0, 0.0, 3.0], dtype='Float64')
Index([0.0, nan, 3.0, 0.0], dtype='Float64')

Args:
value (scalar):
Expand Down
26 changes: 15 additions & 11 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2410,26 +2410,30 @@ def fillna(
value=None,
) -> Series | None:
"""
Fill NA/NaN values using the specified method.
Fill NA (NULL in BigQuery) values using the specified method.

**Examples:**
Note that empty strings ``''``, :attr:`numpy.inf`, and
:attr:`numpy.nan` are **not** considered NA values. This NA/NULL
logic differs from numpy, but it is the same as BigQuery and the
:class:`pandas.ArrowDtype`.

>>> import bigframes.pandas as bpd
>>> import numpy as np
>>> bpd.options.display.progress_bar = None
**Examples:**

>>> s = bpd.Series([np.nan, 2, np.nan, -1])
>>> s = bpd.Series(
... pa.array([np.nan, 2, None, -1], type=pa.float64()),
... dtype=pd.ArrowDtype(pa.float64()),
... )
>>> s
0 <NA>
0 NaN
1 2.0
2 <NA>
3 -1.0
dtype: Float64

Replace all NA elements with 0s.
Replace all NA (NULL) elements with 0s.

>>> s.fillna(0)
0 0.0
0 NaN
1 2.0
2 0.0
3 -1.0
Expand All @@ -2439,7 +2443,7 @@ def fillna(

>>> s_fill = bpd.Series([11, 22, 33])
>>> s.fillna(s_fill)
0 11.0
0 NaN
1 2.0
2 33.0
3 -1.0
Expand Down Expand Up @@ -4482,7 +4486,7 @@ def update(self, other) -> None:
2 6
dtype: Int64

If ``other`` contains NaNs the corresponding values are not updated
If ``other`` contains NA (NULL) values, the corresponding values are not updated
in the original Series.

>>> s = bpd.Series([1, 2, 3])
Expand Down