Commit 1649a58

Use pytest-memray to check for block serialization error issues (#1330)
1 parent e5045b2 commit 1649a58

File tree

3 files changed: +245 -1 lines changed


pydatalab/pyproject.toml

Lines changed: 2 additions & 1 deletion

@@ -119,6 +119,7 @@ dev-dependencies = [
     "pytest ~= 8.2",
     "pytest-cov ~= 5.0",
     "pytest-dependency ~= 0.6",
+    "pytest-memray ~= 1.8",
     "pre-commit ~= 4.0",
     "mongomock ~= 4.1",
     "mkdocs ~= 1.6",
@@ -134,7 +135,7 @@ dev-dependencies = [
 datalab-app-plugin-insitu = { git = "https://github.com/datalab-org/datalab-app-plugin-insitu.git", rev = "v0.2.0" }
 
 [tool.pytest.ini_options]
-addopts = "--cov-report=term --cov-report=xml --cov ./src/pydatalab"
+addopts = "--cov-report=xml --cov ./src/pydatalab"
 filterwarnings = [
     "error",
     "ignore:.*np.bool8*:DeprecationWarning",

pydatalab/tests/server/test_blocks.py

Lines changed: 112 additions & 0 deletions

@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pytest
 
 from pydatalab.apps import BLOCK_TYPES, BLOCKS
@@ -544,3 +546,113 @@ def test_create_sample_with_example_files(
     if block_type == "xrd":
         doc = database.items.find_one({"item_id": sample_id}, projection={"blocks_obj": 1})
         assert doc["blocks_obj"][block_id]["computed"]["peak_data"] is not None
+
+
+@pytest.fixture()
+def create_large_xye_file(tmpdir):
+    """Create a relatively large .xye file for testing tabular block serialization
+    and memory usage, kept as a separate fixture to avoid the file creation being
+    counted in the memray profile."""
+
+    fname = Path(tmpdir / "large_table.xye")
+
+    # Make a dataframe of 3 float64 columns and 50,000 rows,
+    # totalling ~1.2 MB of raw floats
+    import numpy as np
+    import pandas as pd
+
+    N = 50_000
+
+    pd.DataFrame(
+        {
+            "two_theta": np.array(np.linspace(5, 85, N), dtype=np.float64),
+            "intensity": np.array(np.random.rand(N), dtype=np.float64),
+            "error": np.array(0.1 * np.random.rand(N), dtype=np.float64),
+        }
+    ).to_csv(fname, sep=",", index=False)
+
+    yield fname
+
+
+@pytest.mark.limit_memory("130MB")
+def test_large_fake_xrd_data_block_serialization(
+    admin_client, default_sample_dict, tmpdir, create_large_xye_file
+):
+    """Make a fake .xye file with relatively large data and test the memory
+    usage of its serialization in particular.
+
+    As of the time of writing, we get a breakdown like:
+
+    > Allocation results for tests/server/test_blocks.py::test_large_fake_xrd_data_block_serialization at the high watermark
+    >
+    > 📦 Total memory allocated: 128.4MiB
+    > 📏 Total allocations: 382
+    > 📊 Histogram of allocation sizes: |▁▃█    |
+    > 🥇 Biggest allocating functions:
+    > - lstsq:./pydatalab/.venv/lib/python3.11/site-packages/numpy/linalg/linalg.py:2326 -> 32.0MiB
+    > - raw_decode:/home/mevans/.local/share/uv/python/cpython-3.11.10-linux-x86_64-gnu/lib/python3.11/json/decoder.py:353 -> 20.3MiB
+    > - raw_decode:/home/mevans/.local/share/uv/python/cpython-3.11.10-linux-x86_64-gnu/lib/python3.11/json/decoder.py:353 -> 19.3MiB
+    > - encode:/home/mevans/.local/share/uv/python/cpython-3.11.10-linux-x86_64-gnu/lib/python3.11/json/encoder.py:203 -> 14.0MiB
+    > - _iterencode_list:/home/mevans/.local/share/uv/python/cpython-3.11.10-linux-x86_64-gnu/lib/python3.11/json/encoder.py:303 -> 14.0MiB
+
+    """
+    import gc
+
+    gc.collect()
+    gc.collect()
+
+    block_type = "xrd"
+
+    sample_id = "test_sample_with_large_table"
+    sample_data = default_sample_dict.copy()
+    sample_data["item_id"] = sample_id
+
+    response = admin_client.post("/new-sample/", json=sample_data)
+    assert response.status_code == 201, f"Failed to create sample for {block_type}: {response.json}"
+    assert response.json["status"] == "success"
+
+    with open(create_large_xye_file, "rb") as f:
+        response = admin_client.post(
+            "/upload-file/",
+            buffered=True,
+            content_type="multipart/form-data",
+            data={
+                "item_id": sample_id,
+                "file": [(f, create_large_xye_file.name)],
+                "type": "application/octet-stream",
+                "replace_file": "null",
+                "relativePath": "null",
+            },
+        )
+    assert response.status_code == 201, f"Failed to upload {create_large_xye_file}"
+    assert response.json["status"] == "success"
+    file_id = response.json["file_id"]
+
+    response = admin_client.post(
+        "/add-data-block/",
+        json={
+            "block_type": block_type,
+            "item_id": sample_id,
+            "index": 0,
+        },
+    )
+
+    block_id = response.json["new_block_obj"]["block_id"]
+
+    gc.collect()
+
+    response = admin_client.post(
+        "/update-block/",
+        json={
+            "block_data": {
+                "blocktype": "tabular",
+                "item_id": sample_id,
+                "file_id": file_id,
+                "block_id": block_id,
+            },
+        },
+    )
+
+    assert response.status_code == 200, f"Failed to update tabular block: {response.json}"
+    assert response.json["new_block_data"]["bokeh_plot_data"]
+
+    gc.collect()

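Two details of the test above are worth noting: the expensive file creation lives in a fixture so that, per its docstring, it is not counted in the memray profile, and `gc.collect()` is called before the measured work to flush garbage left over from earlier tests. A minimal sketch of the same pattern, with hypothetical names and a hypothetical 20 MB cap:

```python
# Sketch of the fixture + limit_memory pattern used above; the names and
# the cap are illustrative assumptions, not from this commit.
import gc

import pytest


@pytest.fixture()
def big_input(tmp_path):
    # Heavy setup happens here, outside the memory-profiled test body,
    # mirroring create_large_xye_file above.
    path = tmp_path / "big_input.csv"
    path.write_text("\n".join(str(i) for i in range(50_000)))
    return path


@pytest.mark.limit_memory("20MB")
def test_processing_big_input_is_bounded(big_input):
    gc.collect()  # reduce noise from earlier tests before measuring
    rows = big_input.read_text().splitlines()
    assert len(rows) == 50_000
```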
pydatalab/uv.lock

Lines changed: 131 additions & 0 deletions
Some generated files are not rendered by default.
