@@ -1,12 +1,24 @@
-from typing import Any, Optional
+from typing import Any, Optional, Union, cast
 
 from quixstreams.state.base.transaction import (
     PartitionTransaction,
     PartitionTransactionStatus,
     validate_transaction_status,
 )
 from quixstreams.state.metadata import SEPARATOR
-from quixstreams.state.serialization import int_to_int64_bytes, serialize
+from quixstreams.state.recovery import ChangelogProducer
+from quixstreams.state.rocksdb.metadata import (
+    LATEST_TIMESTAMP_KEY,
+    LATEST_TIMESTAMPS_CF_NAME,
+)
+from quixstreams.state.rocksdb.types import RocksDBOptionsType
+from quixstreams.state.rocksdb.windowed.transaction import TimestampsCache
+from quixstreams.state.serialization import (
+    DumpsFunc,
+    LoadsFunc,
+    int_to_int64_bytes,
+    serialize,
+)
 
 from .partition import RocksDBStorePartition
 from .store import RocksDBStore
@@ -18,7 +30,6 @@
 )
 
 DAYS_7 = 7 * 24 * 60 * 60 * 1000
-EXPIRATION_COUNTER = 0
 
 
 class TimestampedPartitionTransaction(PartitionTransaction):
@@ -30,12 +41,26 @@ class TimestampedPartitionTransaction(PartitionTransaction):
     It interacts with both an in-memory update cache and the persistent RocksDB store.
     """
 
-    # Override the type hint from the parent class (`PartitionTransaction`).
-    # This informs type checkers like mypy that in this specific subclass,
-    # `_partition` is a `TimestampedStorePartition` (defined below),
-    # which has methods like `.iter_items()` that the base type might lack.
-    # The string quotes are necessary for the forward reference.
-    _partition: "TimestampedStorePartition"
+    def __init__(
+        self,
+        partition: "TimestampedStorePartition",
+        dumps: DumpsFunc,
+        loads: LoadsFunc,
+        changelog_producer: Optional[ChangelogProducer] = None,
+    ):
+        super().__init__(
+            partition=partition,
+            dumps=dumps,
+            loads=loads,
+            changelog_producer=changelog_producer,
+        )
+        self._partition: TimestampedStorePartition = cast(
+            "TimestampedStorePartition", self._partition
+        )
+        self._latest_timestamps: TimestampsCache = TimestampsCache(
+            key=LATEST_TIMESTAMP_KEY,
+            cf_name=LATEST_TIMESTAMPS_CF_NAME,
+        )
 
     @validate_transaction_status(PartitionTransactionStatus.STARTED)
     def get_last(
@@ -60,13 +85,20 @@ def get_last(
         :param cf_name: The column family name.
         :return: The deserialized value if found, otherwise None.
         """
-        global EXPIRATION_COUNTER
 
         prefix = self._ensure_bytes(prefix)
 
+        latest_timestamp = max(
+            self._get_timestamp(
+                prefix=prefix, cache=self._latest_timestamps, default=0
+            ),
+            timestamp,
+        )
+
         # Negative retention is not allowed
-        lower_bound_timestamp = max(timestamp - retention_ms, 0)
-        lower_bound = self._serialize_key(lower_bound_timestamp, prefix)
+        lower_bound = self._serialize_key(
+            max(latest_timestamp - retention_ms, 0), prefix
+        )
         # +1 because upper bound is exclusive
         upper_bound = self._serialize_key(timestamp + 1, prefix)
 
@@ -101,15 +133,18 @@ def get_last(
                 # iterating backwards from the upper bound.
                 break
 
-        if not EXPIRATION_COUNTER % 1000:
-            self._expire(lower_bound_timestamp, prefix, cf_name=cf_name)
-        EXPIRATION_COUNTER += 1
-
         return self._deserialize_value(value) if value is not None else None
 
     @validate_transaction_status(PartitionTransactionStatus.STARTED)
-    def set(self, timestamp: int, value: Any, prefix: Any, cf_name: str = "default"):
-        """Set a value associated with a prefix and timestamp.
+    def set_for_timestamp(
+        self,
+        timestamp: int,
+        value: Any,
+        prefix: Any,
+        retention_ms: int = DAYS_7,
+        cf_name: str = "default",
+    ):
+        """Set a value for the timestamp.
 
         This method acts as a proxy, passing the provided `timestamp` and `prefix`
         to the parent `set` method. The parent method internally serializes these
@@ -122,8 +157,16 @@ def set(self, timestamp: int, value: Any, prefix: Any, cf_name: str = "default")
         """
         prefix = self._ensure_bytes(prefix)
         super().set(timestamp, value, prefix, cf_name=cf_name)
+        self._expire(
+            timestamp=timestamp,
+            prefix=prefix,
+            retention_ms=retention_ms,
+            cf_name=cf_name,
+        )
 
-    def _expire(self, timestamp: int, prefix: bytes, cf_name: str = "default"):
+    def _expire(
+        self, timestamp: int, prefix: bytes, retention_ms: int, cf_name: str = "default"
+    ):
         """
         Delete all entries for a given prefix with timestamps less than the
         provided timestamp.
@@ -136,11 +179,23 @@ def _expire(self, timestamp: int, prefix: bytes, cf_name: str = "default"):
         :param prefix: The key prefix.
         :param cf_name: Column family name.
         """
-        key = self._serialize_key(timestamp, prefix)
 
-        cached = self._update_cache.get_updates_for_prefix(
+        latest_timestamp = max(
+            self._get_timestamp(
+                prefix=prefix, cache=self._latest_timestamps, default=0
+            ),
+            timestamp,
+        )
+        self._set_timestamp(
+            cache=self._latest_timestamps,
             prefix=prefix,
-            cf_name=cf_name,
+            timestamp_ms=latest_timestamp,
+        )
+
+        key = self._serialize_key(max(timestamp - retention_ms, 0), prefix)
+
+        cached = self._update_cache.get_updates_for_prefix(
+            prefix=prefix, cf_name=cf_name
         )
         # Cast to list to avoid RuntimeError: dictionary changed size during iteration
         for cached_key in list(cached):
@@ -160,8 +215,42 @@ def _ensure_bytes(self, prefix: Any) -> bytes:
             return prefix
         return serialize(prefix, dumps=self._dumps)
 
-    def _serialize_key(self, timestamp: int, prefix: bytes) -> bytes:
-        return prefix + SEPARATOR + int_to_int64_bytes(timestamp)
+    def _serialize_key(self, key: Union[int, bytes], prefix: bytes) -> bytes:
+        match key:
+            case int():
+                return prefix + SEPARATOR + int_to_int64_bytes(key)
+            case bytes():
+                return prefix + SEPARATOR + key
+            case _:
+                raise ValueError(f"Invalid key type: {type(key)}")
+
+    def _get_timestamp(
+        self, cache: TimestampsCache, prefix: bytes, default: Any = None
+    ) -> Any:
+        cached_ts = cache.timestamps.get(prefix)
+        if cached_ts is not None:
+            return cached_ts
+
+        stored_ts = self.get(
+            key=cache.key,
+            prefix=prefix,
+            cf_name=cache.cf_name,
+            default=default,
+        )
+        if stored_ts is not None and not isinstance(stored_ts, int):
+            raise ValueError(f"invalid timestamp {stored_ts}")
+
+        cache.timestamps[prefix] = stored_ts or default
+        return stored_ts
+
+    def _set_timestamp(self, cache: TimestampsCache, prefix: bytes, timestamp_ms: int):
+        cache.timestamps[prefix] = timestamp_ms
+        self.set(
+            key=cache.key,
+            value=timestamp_ms,
+            prefix=prefix,
+            cf_name=cache.cf_name,
+        )
 
 
 class TimestampedStorePartition(RocksDBStorePartition):
@@ -174,6 +263,15 @@ class TimestampedStorePartition(RocksDBStorePartition):
 
     partition_transaction_class = TimestampedPartitionTransaction
 
+    def __init__(
+        self,
+        path: str,
+        options: Optional[RocksDBOptionsType] = None,
+        changelog_producer: Optional[ChangelogProducer] = None,
+    ) -> None:
+        super().__init__(path, options=options, changelog_producer=changelog_producer)
+        self._ensure_column_family(LATEST_TIMESTAMPS_CF_NAME)
+
 
 class TimestampedStore(RocksDBStore):
     """