Commit 185eacb

POC: updates from ruff formatter
1 parent 798d410 commit 185eacb

File tree

9 files changed: +56 -38 lines changed

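Editor's note: most of the churn below comes from the ruff formatter formatting the code inside f-string replacement fields — spaces around binary operators, spaces after commas, and double-quote normalization for inner string literals. A minimal before/after sketch (illustration only, not part of the commit; assumes a recent ruff release where f-string formatting is enabled by default):

import time

start_time = time.perf_counter()

# before: f"elapsed: {round(time.perf_counter()-start_time,2)}s"
# after `ruff format`:
print(f"elapsed: {round(time.perf_counter() - start_time, 2)}s")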

migrations/001_2025_05_30_backfill_run_timestamp_column.py

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@ def backfill_dataset(location: str, *, dry_run: bool = False) -> None:
         logger.info(json.dumps(result))
 
     logger.info(
-        f"Backfill complete. Elapsed: {time.perf_counter()-start_time}, "
+        f"Backfill complete. Elapsed: {time.perf_counter() - start_time}, "
         f"Success: {success_count}, Skipped: {skip_count}, Errors: {error_count}"
     )

migrations/002_2025_06_25_consistent_run_timestamp_per_etl_run.py

Lines changed: 1 addition & 2 deletions

@@ -70,7 +70,6 @@ def fix_backfilled_run_timestamps(location: str, *, dry_run: bool = False) -> None:
     error_count = 0
 
     for idx, row in parquet_to_run_timestamp_df.iterrows():
-
         if row.status == "OK":
             continue
 
@@ -94,7 +93,7 @@ def fix_backfilled_run_timestamps(location: str, *, dry_run: bool = False) -> None:
         logger.info(json.dumps(result))
 
     logger.info(
-        f"Backfill complete. Elapsed: {time.perf_counter()-start_time}, "
+        f"Backfill complete. Elapsed: {time.perf_counter() - start_time}, "
        f"Success: {success_count}, Skipped: {skip_count}, Errors: {error_count}"
     )
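The deleted blank line in the first hunk reflects a second formatter rule: empty lines immediately after a block opener (`for`, `while`, `class`, `def`) are removed. A small sketch of the behavior, assuming ruff's default black-compatible style:

# the input had a blank line right after the `for` header; `ruff format` drops it
rows = ["OK", "ERROR"]
for row in rows:
    if row == "OK":
        continue
    print(row)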

tests/test_dataset.py

Lines changed: 5 additions & 2 deletions

@@ -248,12 +248,15 @@ def test_dataset_duckdb_context_created_on_init(timdex_dataset):
 
 
 def test_dataset_duckdb_context_creates_data_schema(timdex_dataset):
-    assert timdex_dataset.conn.query("""
+    assert (
+        timdex_dataset.conn.query("""
         select count(*)
         from information_schema.schemata
         where catalog_name = 'memory'
         and schema_name = 'data';
-    """).fetchone()[0] == 1
+    """).fetchone()[0]
+        == 1
+    )
 
 
 def test_dataset_preload_current_records_default_false(timdex_dataset):
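When one side of a comparison is a multiline call, the formatter wraps the whole expression in parentheses and gives the operator its own line, as in the test above. A runnable sketch of the resulting shape (assumes the duckdb package; the connection and query here are placeholders, not the project's fixtures):

import duckdb

conn = duckdb.connect()
# the multiline SQL string is left untouched; `== 1` moves to its own line
assert (
    conn.query("""
    select 1;
    """).fetchone()[0]
    == 1
)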

tests/test_metadata.py

Lines changed: 25 additions & 10 deletions

@@ -55,12 +55,15 @@ def test_tdm_init_metadata_file_found_success(timdex_metadata):
 
 
 def test_tdm_duckdb_context_creates_metadata_schema(timdex_metadata):
-    assert timdex_metadata.conn.query("""
+    assert (
+        timdex_metadata.conn.query("""
         select count(*)
         from information_schema.schemata
         where catalog_name = 'memory'
         and schema_name = 'metadata';
-    """).fetchone()[0] == 1
+    """).fetchone()[0]
+        == 1
+    )
 
 
 def test_tdm_connection_has_static_database_attached(timdex_metadata):
@@ -285,13 +288,13 @@ def test_tdm_merge_append_deltas_adds_records_to_static_db(
 ):
     append_deltas = timdex_metadata_with_deltas.conn.query(f"""
         select
-            {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+            {",".join(ORDERED_METADATA_COLUMN_NAMES)}
         from metadata.append_deltas
     """).to_df()
 
     merged_static_db = timdex_metadata_merged_deltas.conn.query(f"""
         select
-            {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+            {",".join(ORDERED_METADATA_COLUMN_NAMES)}
         from static_db.records
     """).to_df()
 
@@ -317,12 +320,16 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_and_valid(
     monkeypatch.setenv("HOME", str(preset_home))
 
     td = TIMDEXDataset(timdex_dataset_with_runs.location)
-    df = td.conn.query("""
+    df = (
+        td.conn.query("""
         select
             current_setting('secret_directory') as secret_directory,
             current_setting('extension_directory') as extension_directory
         ;
-    """).to_df().iloc[0]
+    """)
+        .to_df()
+        .iloc[0]
+    )
     assert "my-account" in df.secret_directory
     assert df.extension_directory == ""  # expected and okay when HOME set
 
@@ -334,12 +341,16 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_unset(
 
     td = TIMDEXDataset(timdex_dataset_with_runs.location)
 
-    df = td.conn.query("""
+    df = (
+        td.conn.query("""
         select
             current_setting('secret_directory') as secret_directory,
             current_setting('extension_directory') as extension_directory
         ;
-    """).to_df().iloc[0]
+    """)
+        .to_df()
+        .iloc[0]
+    )
     assert df.secret_directory == "/tmp/.duckdb/secrets"
     assert df.extension_directory == "/tmp/.duckdb/extensions"
 
@@ -351,12 +362,16 @@ def test_td_prepare_duckdb_secret_and_extensions_home_env_var_set_but_empty(
 
     td = TIMDEXDataset(timdex_dataset_with_runs.location)
 
-    df = td.conn.query("""
+    df = (
+        td.conn.query("""
         select
             current_setting('secret_directory') as secret_directory,
             current_setting('extension_directory') as extension_directory
         ;
-    """).to_df().iloc[0]
+    """)
+        .to_df()
+        .iloc[0]
+    )
     assert df.secret_directory == "/tmp/.duckdb/secrets"
     assert df.extension_directory == "/tmp/.duckdb/extensions"
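The repeated `','.join(...)` → `",".join(...)` edits are quote normalization: once ruff formats f-string interiors, inner string literals get the default double-quote style. Inside these triple-quoted f-strings that is valid on any supported Python; only same-quote nesting in single-line f-strings requires Python 3.12 (PEP 701). A sketch with placeholder column names:

ORDERED_METADATA_COLUMN_NAMES = ["timdex_record_id", "run_timestamp"]  # placeholder values

# before: f"""select {','.join(ORDERED_METADATA_COLUMN_NAMES)} from t"""
sql = f"""select {",".join(ORDERED_METADATA_COLUMN_NAMES)} from t"""
print(sql)  # select timdex_record_id,run_timestamp from t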

tests/utils.py

Lines changed: 1 addition & 1 deletion

@@ -118,7 +118,7 @@ def generate_sample_embeddings_for_run(
     for _idx, record in records_metadata.iterrows():
         embedding_vector = [random.random() for _ in range(embedding_dimensions)]
         embedding_object = json.dumps(
-            {f"token{x+1}": (x + 1) / 10 for x in range(embedding_dimensions)}
+            {f"token{x + 1}": (x + 1) / 10 for x in range(embedding_dimensions)}
         ).encode()
 
         yield DatasetEmbedding(

timdex_dataset_api/dataset.py

Lines changed: 5 additions & 6 deletions

@@ -111,7 +111,6 @@ class TIMDEXDatasetConfig:
 
 
 class TIMDEXDataset:
-
     def __init__(
         self,
         location: str,
@@ -211,7 +210,7 @@ def load_pyarrow_dataset(self, parquet_files: list[str] | None = None) -> ds.Dataset:
 
         logger.info(
             f"Dataset successfully loaded: '{self.data_records_root}', "
-            f"{round(time.perf_counter()-start_time, 2)}s"
+            f"{round(time.perf_counter() - start_time, 2)}s"
         )
 
         return dataset
@@ -496,12 +495,12 @@ def read_batches_iter(
 
             batch_rps = int(batch_yield_count / (time.perf_counter() - batch_time))
             logger.debug(
-                f"read_batches_iter batch {i+1}, yielded: {batch_yield_count} "
+                f"read_batches_iter batch {i + 1}, yielded: {batch_yield_count} "
                 f"@ {batch_rps} records/second, total yielded: {total_yield_count}"
             )
 
         logger.debug(
-            f"read_batches_iter() elapsed: {round(time.perf_counter()-start_time, 2)}s"
+            f"read_batches_iter() elapsed: {round(time.perf_counter() - start_time, 2)}s"
         )
 
     def _iter_meta_chunks(
@@ -525,7 +524,6 @@ def _iter_meta_chunks(
 
         total_yielded = 0
         while True:
-
            # enforce limit if passed
            if limit is not None:
                remaining = limit - total_yielded
@@ -586,7 +584,8 @@ def _build_data_query_for_chunk(
        filenames = list(meta_chunk_df["filename"].unique())
        if self.location_scheme == "s3":
            filenames = [
-                f"s3://{f.removeprefix('s3://')}" for f in filenames  # type: ignore[union-attr]
+                f"s3://{f.removeprefix('s3://')}"
+                for f in filenames  # type: ignore[union-attr]
            ]
        parquet_list_sql = "[" + ",".join(f"'{f}'" for f in filenames) + "]"
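The last hunk in this file is a line-length fix rather than an f-string change: with the trailing `# type: ignore[union-attr]` comment attached, the one-line comprehension overflowed, so the formatter splits the element expression from the `for` clause. A sketch with placeholder data:

filenames = ["bucket/a.parquet", "s3://bucket/b.parquet"]  # placeholder paths

# ruff splits the comprehension so the trailing comment fits on the `for` line
filenames = [
    f"s3://{f.removeprefix('s3://')}"
    for f in filenames  # a trailing comment rides along with the `for` clause
]
print(filenames)  # ['s3://bucket/a.parquet', 's3://bucket/b.parquet']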

timdex_dataset_api/embeddings.py

Lines changed: 2 additions & 3 deletions

@@ -136,7 +136,6 @@ def to_dict(
 
 
 class TIMDEXEmbeddings:
-
     def __init__(self, timdex_dataset: "TIMDEXDataset"):
         """Init TIMDEXEmbeddings.
 
@@ -174,7 +173,7 @@ def _setup_embeddings_views(self) -> None:
 
         logger.debug(
             "Embeddings views setup for TIMDEXEmbeddings, "
-            f"{round(time.perf_counter()-start_time,2)}s"
+            f"{round(time.perf_counter() - start_time, 2)}s"
         )
 
     def _create_embeddings_view(self, conn: DuckDBPyConnection) -> None:
@@ -377,7 +376,7 @@ def read_batches_iter(
             rows_per_batch=self.timdex_dataset.config.read_batch_size
         )
 
-        logger.debug(f"read() elapsed: {round(time.perf_counter()-start_time, 2)}s")
+        logger.debug(f"read() elapsed: {round(time.perf_counter() - start_time, 2)}s")
 
     def _build_query(
         self,

timdex_dataset_api/metadata.py

Lines changed: 15 additions & 12 deletions

@@ -37,7 +37,6 @@
 
 
 class TIMDEXDatasetMetadata:
-
     def __init__(self, timdex_dataset: "TIMDEXDataset") -> None:
         """Init TIMDEXDatasetMetadata.
 
@@ -174,7 +173,7 @@ def _create_full_dataset_table(self, conn: DuckDBPyConnection) -> None:
         query = f"""
         create or replace table records as (
             select
-                {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+                {",".join(ORDERED_METADATA_COLUMN_NAMES)}
             from read_parquet(
                 '{self.location}/data/records/**/*.parquet',
                 hive_partitioning=true,
@@ -215,7 +214,7 @@ def _setup_metadata_schema(self) -> None:
 
         logger.debug(
             "Metadata schema setup for TIMDEXDatasetMetadata, "
-            f"{round(time.perf_counter()-start_time,2)}s"
+            f"{round(time.perf_counter() - start_time, 2)}s"
         )
 
     def _attach_database_file(self, conn: DuckDBPyConnection) -> None:
@@ -278,11 +277,11 @@ def _create_records_union_view(self, conn: DuckDBPyConnection) -> None:
         create or replace view metadata.records as
         (
             select
-                {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+                {",".join(ORDERED_METADATA_COLUMN_NAMES)}
             from static_db.records
             union all
             select
-                {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+                {",".join(ORDERED_METADATA_COLUMN_NAMES)}
             from metadata.append_deltas
         );
         """)
@@ -380,10 +379,14 @@ def merge_append_deltas(self) -> None:
         s3_client = S3Client()
 
         # get filenames of append deltas
-        append_delta_filenames = self.conn.query("""
+        append_delta_filenames = (
+            self.conn.query("""
             select distinct(append_delta_filename)
             from metadata.append_deltas
-        """).to_df()["append_delta_filename"].to_list()
+        """)
+            .to_df()["append_delta_filename"]
+            .to_list()
+        )
 
         if len(append_delta_filenames) == 0:
             logger.info("no append deltas found")
@@ -408,7 +411,7 @@ def merge_append_deltas(self) -> None:
         self.conn.execute(f"""
             insert into local_static_db.records
             select
-                {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+                {",".join(ORDERED_METADATA_COLUMN_NAMES)}
             from metadata.append_deltas
         """)
 
@@ -433,7 +436,7 @@ def merge_append_deltas(self) -> None:
 
         logger.debug(
             "append deltas merged into the static metadata database file: "
-            f"{self.metadata_database_path}, {time.perf_counter()-start_time}s"
+            f"{self.metadata_database_path}, {time.perf_counter() - start_time}s"
         )
 
     def write_append_delta_duckdb(self, filepath: str) -> None:
@@ -451,13 +454,13 @@ def write_append_delta_duckdb(self, filepath: str) -> None:
 
         # ensure s3:// schema prefix is present
         if self.location_scheme == "s3":
-            filepath = f"s3://{filepath.removeprefix("s3://")}"
+            filepath = f"s3://{filepath.removeprefix('s3://')}"
 
         # perform query + write as one SQL statement
         sql = f"""
         copy (
             select
-                {','.join(ORDERED_METADATA_COLUMN_NAMES)}
+                {",".join(ORDERED_METADATA_COLUMN_NAMES)}
             from read_parquet(
                 '{filepath}',
                 hive_partitioning=true,
@@ -469,7 +472,7 @@ def write_append_delta_duckdb(self, filepath: str) -> None:
         self.conn.execute(sql)
 
         logger.debug(
-            f"Append delta written: {output_path}, {time.perf_counter()-start_time}s"
+            f"Append delta written: {output_path}, {time.perf_counter() - start_time}s"
         )
 
     def build_keyset_paginated_metadata_query(
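One change in `write_append_delta_duckdb` is more than cosmetic: the old `f"s3://{filepath.removeprefix("s3://")}"` reuses double quotes inside a single-line double-quoted f-string, which only parses on Python 3.12+ (PEP 701); the formatter's single-quoted rewrite parses on any supported version. A sketch:

filepath = "bucket/prefix/key.parquet"  # placeholder path

# valid on any Python version after the rewrite
normalized = f"s3://{filepath.removeprefix('s3://')}"
print(normalized)  # s3://bucket/prefix/key.parquet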

timdex_dataset_api/utils.py

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ def create_connection(self, path: str = ":memory:") -> DuckDBPyConnection:
         conn.execute("SET enable_progress_bar = false;")
         self.configure_connection(conn)
         logger.debug(
-            f"DuckDB connection created, {round(time.perf_counter()-start_time,2)}s"
+            f"DuckDB connection created, {round(time.perf_counter() - start_time, 2)}s"
         )
         return conn
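As a POC, the whole diff should be reproducible with a single formatter run. A hedged sketch of the verification step (assumes the ruff CLI is installed and on PATH):

import subprocess

# `ruff format .` rewrites files in place; `--check` only reports what would change
subprocess.run(["ruff", "format", "--check", "."], check=False)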
