From 6bf130d211bea1ee713d20877eec8e94eb6d9a87 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 17 Nov 2025 15:17:45 -0500 Subject: [PATCH 01/43] start --- Cargo.lock | 43 +++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + 2 files changed, 44 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 2f33ebb3..534b3ba3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,6 +121,12 @@ version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "async-stream" version = "0.3.6" @@ -426,6 +432,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1572,6 +1588,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + [[package]] name = "num-bigint-dig" version = "0.8.4" @@ -2131,6 +2157,22 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "redis" +version = "0.29.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc42f3a12fd4408ce64d8efef67048a924e543bd35c6591c0447fda9054695f" +dependencies = [ + "arc-swap", + "combine", + "itoa", + "num-bigint", + "percent-encoding", + "ryu", + "socket2 0.5.10", + "url", +] + [[package]] name = "redox_syscall" version = "0.5.17" @@ -2925,6 +2967,7 @@ dependencies = [ "prost-types", "rand 0.8.5", "rdkafka", + "redis", "sentry", "sentry_protos", "serde", diff --git a/Cargo.toml b/Cargo.toml index 260d3948..84b9985b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ edition = "2024" debug = 1 [dependencies] +redis = { version = "0.29.2", default-features = false } sentry_protos = "0.2.0" anyhow = "1.0.92" chrono = { version = "0.4.26" } From 4f54a15580f3adf1042369056d59c1a07bf7557e Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 17 Nov 2025 15:17:52 -0500 Subject: [PATCH 02/43] start --- src/store/inflight_redis_activation.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/store/inflight_redis_activation.rs diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs new file mode 100644 index 00000000..e69de29b From 60863db08e18a5c6b64828b1886e3bb33004aed7 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 18 Nov 2025 15:06:04 -0500 Subject: [PATCH 03/43] fixes --- src/store/inflight_redis_activation.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 2b2fed8b..13860bb1 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -3,11 +3,12 @@ use tracing::instrument; use crate::config::Config; use crate::store::inflight_activation::{InflightActivation, QueryResult}; use anyhow::Error; -use deadpool_redis::cluster::{Config as RedisConfig, Pool, Runtime}; +// use deadpool_redis::cluster::{Config as RedisConfig, Pool, Runtime}; +use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; use redis::AsyncTypedCommands; use uuid::Uuid; -enum KeyPrefix { +pub enum KeyPrefix { Payload, Pending, Processing, @@ -38,8 +39,10 @@ impl RedisActivationStoreConfig { } pub async fn create_redis_pool(urls: Vec) -> Result { - let cfg = RedisConfig::from_urls(urls); + let cfg = RedisConfig::from_url(urls[0].clone()); let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); + // let cfg = RedisConfig::from_urls(urls); + // let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); Ok(pool) } From c88bc5f476d7ce4d2d0692b43ef507af0a55ce0b Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 20 Nov 2025 12:32:30 -0500 Subject: [PATCH 04/43] get writing working --- Cargo.lock | 7 + Cargo.toml | 1 + src/kafka/inflight_activation_writer.rs | 224 +++++++++++++----------- src/store/inflight_redis_activation.rs | 220 +++++++++++++++++++---- 4 files changed, 318 insertions(+), 134 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea5ccd83..8939e740 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -371,6 +371,12 @@ dependencies = [ "half", ] +[[package]] +name = "cityhasher" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceab37c9e94f42414cccae77e930232c517f1bb190947018cffb0ab41fc40992" + [[package]] name = "clap" version = "4.5.45" @@ -3011,6 +3017,7 @@ dependencies = [ "anyhow", "bytes", "chrono", + "cityhasher", "clap", "criterion", "deadpool-redis", diff --git a/Cargo.toml b/Cargo.toml index 37257235..7e4ab8b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ chrono = { version = "0.4.26" } clap = { version = "4.5.20", features = ["derive"] } deadpool-redis = { version = "0.22.0", features = ["cluster"] } elegant-departure = { version = "0.3.1", features = ["tokio"] } +cityhasher = "0.1.0" figment = { version = "0.10.19", features = ["env", "yaml", "test"] } futures = "0.3.31" futures-util = "0.3.31" diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index 06f50db8..c9c76495 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -205,24 +205,26 @@ impl Reducer for InflightActivationWriter { #[cfg(test)] mod tests { use super::{ActivationWriterConfig, InflightActivation, InflightActivationWriter, Reducer}; - use chrono::{DateTime, Utc}; + use chrono::{DateTime, Duration, Utc}; use prost::Message; use prost_types::Timestamp; use std::collections::HashMap; + use uuid::Uuid; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use sentry_protos::taskbroker::v1::TaskActivation; use std::sync::Arc; - use crate::store::inflight_activation::{ - InflightActivationStatus, InflightActivationStore, InflightActivationStoreConfig, - }; + use crate::store::inflight_activation::InflightActivationStatus; use crate::store::inflight_redis_activation::RedisActivationStore; use crate::store::inflight_redis_activation::RedisActivationStoreConfig; + use crate::test_utils::create_integration_config; use crate::test_utils::generate_temp_redis_urls; use crate::test_utils::make_activations; - use crate::test_utils::{create_integration_config, generate_temp_filename}; + fn activation_id() -> String { + Uuid::new_v4().to_string() + } #[tokio::test] async fn test_writer_flush_batch() { let writer_config = ActivationWriterConfig { @@ -233,27 +235,29 @@ mod tests { max_delay_activations: 10, write_failure_backoff_ms: 4000, }; - let mut writer = InflightActivationWriter::new( - Arc::new( - RedisActivationStore::new( - generate_temp_redis_urls(), - RedisActivationStoreConfig::from_config(&create_integration_config()), - ) - .await - .unwrap(), - ), - writer_config, + let store = Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); + let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let received_at = Timestamp { seconds: 0, nanos: 0, }; let batch = vec![ InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -279,15 +283,15 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, InflightActivation { - id: "1".to_string(), + id: activation_id(), activation: TaskActivation { - id: "1".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -313,12 +317,11 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, ]; - writer.reduce(batch).await.unwrap(); writer.flush().await.unwrap(); let count_pending = writer.store.count_pending_activations().await.unwrap(); @@ -336,26 +339,28 @@ mod tests { max_delay_activations: 0, write_failure_backoff_ms: 4000, }; - let mut writer = InflightActivationWriter::new( - Arc::new( - RedisActivationStore::new( - generate_temp_redis_urls(), - RedisActivationStoreConfig::from_config(&create_integration_config()), - ) - .await - .unwrap(), - ), - writer_config, + let store = Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); + let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let received_at = Timestamp { seconds: 0, nanos: 0, }; let batch = vec![InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -378,7 +383,7 @@ mod tests { processing_deadline: None, processing_deadline_duration: 0, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }]; @@ -399,27 +404,29 @@ mod tests { max_delay_activations: 10, write_failure_backoff_ms: 4000, }; - let mut writer = InflightActivationWriter::new( - Arc::new( - RedisActivationStore::new( - generate_temp_redis_urls(), - RedisActivationStoreConfig::from_config(&create_integration_config()), - ) - .await - .unwrap(), - ), - writer_config, + let store = Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); + let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let received_at = Timestamp { seconds: 0, nanos: 0, }; let batch = vec![InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -438,11 +445,11 @@ mod tests { .unwrap(), processing_attempts: 0, expires_at: None, - delay_until: None, + delay_until: Some(Utc::now() + Duration::seconds(10)), processing_deadline: None, processing_deadline_duration: 0, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }]; @@ -463,27 +470,29 @@ mod tests { max_delay_activations: 0, write_failure_backoff_ms: 4000, }; - let mut writer = InflightActivationWriter::new( - Arc::new( - RedisActivationStore::new( - generate_temp_redis_urls(), - RedisActivationStoreConfig::from_config(&create_integration_config()), - ) - .await - .unwrap(), - ), - writer_config, + let store = Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); + let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let received_at = Timestamp { seconds: 0, nanos: 0, }; let batch = vec![ InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -509,15 +518,15 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, InflightActivation { - id: "1".to_string(), + id: activation_id(), activation: TaskActivation { - id: "1".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -543,7 +552,7 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, @@ -567,17 +576,19 @@ mod tests { max_delay_activations: 0, write_failure_backoff_ms: 4000, }; - let mut writer = InflightActivationWriter::new( - Arc::new( - RedisActivationStore::new( - generate_temp_redis_urls(), - RedisActivationStoreConfig::from_config(&create_integration_config()), - ) - .await - .unwrap(), - ), - writer_config, + let store = Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); + let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let received_at = Timestamp { seconds: 0, @@ -585,10 +596,10 @@ mod tests { }; let batch = vec![ InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -614,15 +625,15 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, InflightActivation { - id: "1".to_string(), + id: activation_id(), activation: TaskActivation { - id: "1".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -648,7 +659,7 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, @@ -663,6 +674,7 @@ mod tests { } #[tokio::test] + #[ignore = "need a way to insert a processing activation"] async fn test_writer_backpressure_processing_limit_reached() { let writer_config = ActivationWriterConfig { db_max_size: None, @@ -680,7 +692,10 @@ mod tests { .await .unwrap(), ); - + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); let received_at = Timestamp { seconds: 0, nanos: 0, @@ -689,7 +704,7 @@ mod tests { id: "existing".to_string(), activation: TaskActivation { id: "existing".to_string(), - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "existing_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -712,7 +727,7 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "existing_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }; @@ -721,10 +736,10 @@ mod tests { let mut writer = InflightActivationWriter::new(store.clone(), writer_config); let batch = vec![ InflightActivation { - id: "0".to_string(), + id: activation_id(), activation: TaskActivation { - id: "0".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -750,15 +765,15 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "pending_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, InflightActivation { - id: "1".to_string(), + id: activation_id(), activation: TaskActivation { - id: "1".to_string(), - namespace: "namespace".to_string(), + id: activation_id(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), parameters: "{}".to_string(), headers: HashMap::new(), @@ -784,7 +799,7 @@ mod tests { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".to_string(), + namespace: "default".to_string(), taskname: "delay_task".to_string(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }, @@ -805,6 +820,7 @@ mod tests { } #[tokio::test] + #[ignore = "need a way to determine db size"] async fn test_writer_backpressure_db_size_limit_reached() { let writer_config = ActivationWriterConfig { // 200 rows is ~50KB @@ -823,6 +839,10 @@ mod tests { .await .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); let first_round = make_activations(200); store.store(first_round).await.unwrap(); assert!(store.db_size().await.unwrap() > 50_000); @@ -857,6 +877,10 @@ mod tests { .await .unwrap(), ); + store + .delete_all_keys() + .await + .expect("Error deleting all keys"); let mut writer = InflightActivationWriter::new(store.clone(), writer_config); writer.reduce(vec![]).await.unwrap(); let flush_result = writer.flush().await.unwrap(); diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 13860bb1..0dabb363 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,12 +1,12 @@ -use tracing::instrument; +use tracing::{error, instrument}; // use deadpool_redis::Pool; use crate::config::Config; use crate::store::inflight_activation::{InflightActivation, QueryResult}; use anyhow::Error; // use deadpool_redis::cluster::{Config as RedisConfig, Pool, Runtime}; +use cityhasher; use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; use redis::AsyncTypedCommands; -use uuid::Uuid; pub enum KeyPrefix { Payload, @@ -48,26 +48,39 @@ pub async fn create_redis_pool(urls: Vec) -> Result { pub struct RedisActivationStore { pool: Pool, + replicas: usize, topics: Vec, partitions: Vec, namespaces: Vec, num_buckets: usize, + bucket_hashes: Vec, payload_ttl_seconds: u64, } impl RedisActivationStore { pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { + let replicas = urls.len(); let pool = create_redis_pool(urls).await?; + let bucket_hashes = (0..config.num_buckets) + .map(|i| format!("{:04x}", i)) + .collect(); Ok(Self { pool, + replicas, topics: config.topics.clone(), partitions: config.partitions, namespaces: config.namespaces.clone(), num_buckets: config.num_buckets, + bucket_hashes, payload_ttl_seconds: config.payload_ttl_seconds, }) } + pub fn compute_bucket(&self, activation_id: String) -> String { + let hashint: u64 = cityhasher::hash(activation_id); + format!("{:04x}", hashint % self.num_buckets as u64) + } + pub fn build_key_with_activation( &self, prefix: KeyPrefix, @@ -76,14 +89,12 @@ impl RedisActivationStore { partition: i32, activation_id: String, ) -> String { - let uuid = Uuid::parse_str(&activation_id).unwrap(); - let as_u128: u128 = uuid.as_u128(); self.build_key( prefix, namespace, topic, partition, - format!("{:04x}", as_u128 % self.num_buckets as u128), + self.compute_bucket(activation_id), ) } @@ -93,15 +104,9 @@ impl RedisActivationStore { namespace: String, topic: String, partition: i32, - bucket: usize, + bucket_hash: String, ) -> String { - self.build_key( - prefix, - namespace, - topic, - partition, - format!("{:04x}", bucket), - ) + self.build_key(prefix, namespace, topic, partition, bucket_hash) } pub fn build_key( @@ -137,6 +142,7 @@ impl RedisActivationStore { pub async fn store(&self, batch: Vec) -> Result { let mut conn = self.pool.get().await?; + let mut rows_affected: u64 = 0; for activation in batch { let payload_key = format!( "{}:{}", @@ -150,7 +156,6 @@ impl RedisActivationStore { activation.id.clone() ); - let mut expected_commands = 3; let mut pipe = redis::pipe(); pipe.atomic() .hset(payload_key.clone(), "id", activation.id.clone()) @@ -166,14 +171,8 @@ impl RedisActivationStore { .arg(activation.received_at.timestamp()) .arg("processing_attempts") .arg(activation.processing_attempts) - .arg("expires_at") - .arg(activation.expires_at.map(|dt| dt.timestamp())) - .arg("delay_until") - .arg(activation.delay_until.map(|dt| dt.timestamp())) .arg("processing_deadline_duration") .arg(activation.processing_deadline_duration) - .arg("processing_deadline") - .arg(activation.processing_deadline.map(|dt| dt.timestamp())) .arg("status") .arg(format!("{:?}", activation.status)) .arg("at_most_once") @@ -184,8 +183,26 @@ impl RedisActivationStore { .arg(activation.taskname) .arg("on_attempts_exceeded") .arg(activation.on_attempts_exceeded as i32); + + let mut expected_args = 13; + if activation.expires_at.is_some() { + pipe.arg("expires_at") + .arg(activation.expires_at.unwrap().timestamp()); + expected_args += 1; + } + if activation.delay_until.is_some() { + pipe.arg("delay_until") + .arg(activation.delay_until.unwrap().timestamp()); + expected_args += 1; + } + if activation.processing_deadline.is_some() { + pipe.arg("processing_deadline") + .arg(activation.processing_deadline.unwrap().timestamp()); + expected_args += 1; + } pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); + let mut queue_key_used = String::new(); if activation.delay_until.is_some() { let delay_key = self.build_key_with_activation( KeyPrefix::Delay, @@ -195,10 +212,11 @@ impl RedisActivationStore { activation.id.clone(), ); pipe.zadd( - delay_key, - activation.delay_until.unwrap().timestamp(), + delay_key.clone(), activation.id.clone(), + activation.delay_until.unwrap().timestamp(), ); + queue_key_used = delay_key; } else { let pending_key = self.build_key_with_activation( KeyPrefix::Pending, @@ -207,11 +225,13 @@ impl RedisActivationStore { activation.partition, activation.id.clone(), ); - pipe.rpush(pending_key, activation.id.clone()); + pipe.rpush(pending_key.clone(), activation.id.clone()); + queue_key_used = pending_key; } + let mut expired_key = String::new(); if activation.expires_at.is_some() { - let expired_key = self.build_key_with_activation( + expired_key = self.build_key_with_activation( KeyPrefix::Expired, activation.namespace.clone(), self.topics[0].clone(), @@ -219,22 +239,84 @@ impl RedisActivationStore { activation.id.clone(), ); pipe.zadd( - expired_key, - activation.expires_at.unwrap().timestamp(), + expired_key.clone(), activation.id.clone(), + activation.expires_at.unwrap().timestamp(), ); - expected_commands += 1; + } + pipe.cmd("WAIT").arg(1).arg(1000); + + let result: Vec = match pipe.query_async(&mut conn).await { + Ok(result) => result, + Err(err) => { + error!( + "Failed to store activation {} in Redis: {}", + payload_key.clone(), + err + ); + return Err(anyhow::anyhow!( + "Failed to store activation: {}", + payload_key.clone() + )); + } + }; + + if result.len() != 4 && result.len() != 5 { + return Err(anyhow::anyhow!( + "Failed to store activation: incorrect number of commands run: expected 4 or 5, got {} for key {}", + result.len(), + payload_key.clone() + )); + } + // WAIT returns the number of replicas that had the write propagated + // If there is only one node then it will return 0. + if result[result.len() - 1] < self.replicas as i32 - 1 { + return Err(anyhow::anyhow!( + "Activation {} was not stored on any replica", + payload_key + )); } - let result: Vec = pipe.query_async(&mut conn).await?; - if result.len() != expected_commands { + // HSET returns the number of fields set + if result[0] != expected_args { return Err(anyhow::anyhow!( - "Failed to store activation: {}", + "Failed to store activation: expected {} arguments, got {} for key {}", + expected_args, + result[0], payload_key.clone() )); } + // EXPIRE returns 1 on success and 0 on failure + if result[1] != 1 { + return Err(anyhow::anyhow!( + "Failed to expire activation for key {}", + payload_key + )); + } + // Both ZADD and RPUSH return a count of elements in the structure + if result[2] <= 0 { + return Err(anyhow::anyhow!( + "Failed to add activation to queue for key {}", + queue_key_used + )); + } + // Check if the ZADD happened on the expired key + if result.len() == 5 && result[3] <= 0 { + return Err(anyhow::anyhow!( + "Failed to add activation to expired queue for key {}", + expired_key + )); + } + // Check to ensure that the WAIT command returned at least one replica + if result.len() == 5 && result[4] <= 0 { + return Err(anyhow::anyhow!( + "Failed to wait for activation to be stored on at least one replica for key {}", + payload_key + )); + } + rows_affected += 1; } - Ok(QueryResult { rows_affected: 0 }) + Ok(QueryResult { rows_affected }) } // Called when rebalancing partitions @@ -377,6 +459,16 @@ impl RedisActivationStore { Ok(()) } + // Only used in testing + pub async fn delete_all_keys(&self) -> Result<(), Error> { + let mut conn = self.pool.get().await?; + let keys: Vec = conn.keys("*").await?; + for key in keys { + conn.del(key).await?; + } + Ok(()) + } + /// Get an activation by id. Primarily used for testing pub async fn get_by_id(&self, id: &str) -> Result, Error> { return Ok(None); @@ -448,16 +540,76 @@ impl RedisActivationStore { return Ok(vec![]); } + #[instrument(skip_all)] pub async fn count_pending_activations(&self) -> Result { - return Ok(0); + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for topic in self.topics.iter() { + for namespace in self.namespaces.iter() { + for partition in self.partitions.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let pending_key = self.build_key( + KeyPrefix::Pending, + namespace.to_string(), + topic.to_string(), + *partition, + bucket_hash.to_string(), + ); + let count: usize = conn.llen(pending_key).await?; + total_count += count; + } + } + } + } + return Ok(total_count); } + #[instrument(skip_all)] pub async fn count_delayed_activations(&self) -> Result { - return Ok(0); + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for topic in self.topics.iter() { + for namespace in self.namespaces.iter() { + for partition in self.partitions.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let delay_key = self.build_key( + KeyPrefix::Delay, + namespace.to_string(), + topic.to_string(), + *partition, + bucket_hash.to_string(), + ); + let count: usize = conn.zcard(delay_key.clone()).await?; + total_count += count; + } + } + } + } + return Ok(total_count); } + #[instrument(skip_all)] pub async fn count_processing_activations(&self) -> Result { - return Ok(0); + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for topic in self.topics.iter() { + for namespace in self.namespaces.iter() { + for partition in self.partitions.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let processing_key = self.build_key( + KeyPrefix::Processing, + namespace.to_string(), + topic.to_string(), + *partition, + bucket_hash.to_string(), + ); + let count: usize = conn.zcard(processing_key.clone()).await?; + total_count += count; + } + } + } + } + return Ok(total_count); } pub async fn db_size(&self) -> Result { From 99d4ee1c91f2e5ba8709b3f24cc62620cc9a9639 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 20 Nov 2025 15:37:43 -0500 Subject: [PATCH 05/43] add partition rebalancing --- src/config.rs | 2 +- src/grpc/server.rs | 1 + src/kafka/consumer.rs | 34 ++++-- src/kafka/deserialize_activation.rs | 1 + src/kafka/inflight_activation_writer.rs | 19 +++- src/main.rs | 8 +- src/store/inflight_activation.rs | 4 + src/store/inflight_redis_activation.rs | 138 +++++++++++++++++------- src/test_utils.rs | 1 + 9 files changed, 153 insertions(+), 55 deletions(-) diff --git a/src/config.rs b/src/config.rs index 1dfadfcb..cea8085d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -289,7 +289,7 @@ impl Default for Config { vacuum_interval_ms: 30000, enable_sqlite_status_metrics: true, // Redis information - redis_cluster_urls: vec!["127.0.0.1:6379".to_owned()], + redis_cluster_urls: vec!["redis://127.0.0.1:6379".to_owned()], namespaces: vec!["default".to_owned()], num_redis_buckets: 256, payload_ttl_seconds: 60 * 60 * 24, diff --git a/src/grpc/server.rs b/src/grpc/server.rs index 99fe03d8..9242821b 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -10,6 +10,7 @@ use std::time::Instant; use tonic::{Request, Response, Status}; use crate::store::inflight_activation::{InflightActivationStatus, InflightActivationStore}; +use crate::store::inflight_redis_activation::RedisActivationStore; use tracing::{error, instrument}; pub struct TaskbrokerServer { diff --git a/src/kafka/consumer.rs b/src/kafka/consumer.rs index 16d9b1e8..597e4448 100644 --- a/src/kafka/consumer.rs +++ b/src/kafka/consumer.rs @@ -1,3 +1,4 @@ +use crate::store::inflight_redis_activation::RedisActivationStore; use anyhow::{Error, anyhow}; use futures::{ Stream, StreamExt, @@ -21,10 +22,8 @@ use std::{ future::Future, iter, mem::take, - sync::{ - Arc, - mpsc::{SyncSender, sync_channel}, - }, + sync::Arc, + sync::mpsc::{SyncSender, sync_channel}, time::Duration, }; use tokio::{ @@ -44,6 +43,7 @@ use tracing::{debug, error, info, instrument, warn}; pub async fn start_consumer( topics: &[&str], kafka_client_config: &ClientConfig, + redis_store: Arc, spawn_actors: impl FnMut( Arc>, &BTreeSet<(String, i32)>, @@ -51,7 +51,7 @@ pub async fn start_consumer( ) -> Result<(), Error> { let (client_shutdown_sender, client_shutdown_receiver) = oneshot::channel(); let (event_sender, event_receiver) = unbounded_channel(); - let context = KafkaContext::new(event_sender.clone()); + let context = KafkaContext::new(event_sender.clone(), redis_store.clone()); let consumer: Arc> = Arc::new( kafka_client_config .create_with_context(context) @@ -67,6 +67,7 @@ pub async fn start_consumer( metrics::gauge!("arroyo.consumer.current_partitions").set(0); handle_events( consumer, + redis_store, event_receiver, client_shutdown_sender, spawn_actors, @@ -118,11 +119,18 @@ pub fn poll_consumer_client( #[derive(Debug)] pub struct KafkaContext { event_sender: UnboundedSender<(Event, SyncSender<()>)>, + redis_store: Arc, } impl KafkaContext { - pub fn new(event_sender: UnboundedSender<(Event, SyncSender<()>)>) -> Self { - Self { event_sender } + pub fn new( + event_sender: UnboundedSender<(Event, SyncSender<()>)>, + redis_store: Arc, + ) -> Self { + Self { + event_sender, + redis_store, + } } } @@ -339,6 +347,7 @@ enum ConsumerState { #[instrument(skip_all)] pub async fn handle_events( consumer: Arc>, + redis_store: Arc, events: UnboundedReceiver<(Event, SyncSender<()>)>, shutdown_client: oneshot::Sender<()>, mut spawn_actors: impl FnMut( @@ -372,6 +381,17 @@ pub async fn handle_events( state = match (state, event) { (ConsumerState::Ready, Event::Assign(tpl)) => { metrics::gauge!("arroyo.consumer.current_partitions").set(tpl.len() as f64); + let mut topics = HashMap::>::new(); + for (topic, partition) in tpl.iter() { + if !topics.contains_key(topic) { + topics.insert(topic.clone(), vec![*partition]); + } else { + topics.get_mut(topic).unwrap().push(*partition); + } + } + for (topic, partitions) in topics.iter() { + redis_store.rebalance_partitions(topic.clone(), partitions.clone()).await; + } ConsumerState::Consuming(spawn_actors(consumer.clone(), &tpl), tpl) } (ConsumerState::Ready, Event::Revoke(_)) => { diff --git a/src/kafka/deserialize_activation.rs b/src/kafka/deserialize_activation.rs index 1dda0a98..d1052726 100644 --- a/src/kafka/deserialize_activation.rs +++ b/src/kafka/deserialize_activation.rs @@ -81,6 +81,7 @@ pub fn new( id: activation.id.clone(), activation: payload.to_vec(), status, + topic: msg.topic().to_string(), partition: msg.partition(), offset: msg.offset(), added_at: Utc::now(), diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index c9c76495..3397c48d 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -3,10 +3,6 @@ use std::{ time::{Duration, Instant}, }; -use chrono::Utc; -use tokio::time::sleep; -use tracing::{debug, error, instrument}; - use crate::{ config::Config, store::inflight_activation::{ @@ -14,6 +10,10 @@ use crate::{ }, store::inflight_redis_activation::RedisActivationStore, }; +use chrono::Utc; +use std::sync::RwLock; +use tokio::time::sleep; +use tracing::{debug, error, instrument}; use super::consumer::{ ReduceConfig, ReduceShutdownBehaviour, ReduceShutdownCondition, Reducer, @@ -269,6 +269,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -303,6 +304,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -372,6 +374,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -438,6 +441,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -504,6 +508,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -538,6 +543,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -611,6 +617,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -645,6 +652,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -716,6 +724,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Processing, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -751,6 +760,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -785,6 +795,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), diff --git a/src/main.rs b/src/main.rs index 3753f272..df487a23 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ use taskbroker::kafka::inflight_activation_batcher::{ }; use taskbroker::upkeep::upkeep; use tokio::signal::unix::SignalKind; +use tokio::sync::RwLock; use tokio::task::JoinHandle; use tokio::{select, time}; use tonic::transport::Server; @@ -151,8 +152,7 @@ async fn main() -> Result<(), Error> { // Consumer from kafka let consumer_task = tokio::spawn({ - let consumer_store = store.clone(); - let redis_consumer_store = redis_store.clone(); + let consumer_store = redis_store.clone(); let consumer_config = config.clone(); let runtime_config_manager = runtime_config_manager.clone(); async move { @@ -161,6 +161,7 @@ async fn main() -> Result<(), Error> { start_consumer( &[&consumer_config.kafka_topic], &consumer_config.kafka_consumer_config(), + consumer_store.clone(), processing_strategy!({ err: OsStreamWriter::new( @@ -177,7 +178,7 @@ async fn main() -> Result<(), Error> { runtime_config_manager.clone() ), InflightActivationWriter::new( - redis_consumer_store.clone(), + consumer_store.clone(), ActivationWriterConfig::from_config(&consumer_config) ), @@ -189,6 +190,7 @@ async fn main() -> Result<(), Error> { // GRPC server let grpc_server_task = tokio::spawn({ + // let grpc_store = redis_store.clone(); let grpc_store = store.clone(); let grpc_config = config.clone(); async move { diff --git a/src/store/inflight_activation.rs b/src/store/inflight_activation.rs index 88f975d0..acb68303 100644 --- a/src/store/inflight_activation.rs +++ b/src/store/inflight_activation.rs @@ -72,6 +72,9 @@ pub struct InflightActivation { /// The current status of the activation pub status: InflightActivationStatus, + /// The topic the activation was received from + pub topic: String, + /// The partition the activation was received from pub partition: i32, @@ -200,6 +203,7 @@ impl From for InflightActivation { id: value.id, activation: value.activation, status: value.status, + topic: "topic".to_string(), partition: value.partition, offset: value.offset, added_at: value.added_at, diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 0dabb363..59499d3c 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,12 +1,17 @@ -use tracing::{error, instrument}; +use tracing::{error, info, instrument}; // use deadpool_redis::Pool; use crate::config::Config; use crate::store::inflight_activation::{InflightActivation, QueryResult}; use anyhow::Error; -// use deadpool_redis::cluster::{Config as RedisConfig, Pool, Runtime}; use cityhasher; +use deadpool_redis::cluster::{ + Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, +}; use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; use redis::AsyncTypedCommands; +use std::collections::HashMap; +// use std::sync::RwLock; +use tokio::sync::RwLock; pub enum KeyPrefix { Payload, @@ -19,8 +24,7 @@ pub enum KeyPrefix { } pub struct RedisActivationStoreConfig { - pub topics: Vec, - pub partitions: Vec, + pub topics: HashMap>, pub namespaces: Vec, pub num_buckets: usize, pub payload_ttl_seconds: u64, @@ -29,8 +33,7 @@ pub struct RedisActivationStoreConfig { impl RedisActivationStoreConfig { pub fn from_config(config: &Config) -> Self { Self { - topics: vec![config.kafka_topic.clone()], - partitions: vec![0], + topics: HashMap::from([(config.kafka_topic.clone(), vec![0])]), namespaces: config.namespaces.clone(), num_buckets: config.num_redis_buckets, payload_ttl_seconds: config.payload_ttl_seconds, @@ -39,25 +42,78 @@ impl RedisActivationStoreConfig { } pub async fn create_redis_pool(urls: Vec) -> Result { + // if urls.len() == 1 { + // let cfg = RedisConfig::from_url(urls[0].clone()); + // let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); + // return Ok(pool); + // } + // let cfg = RedisClusterConfig::from_urls(urls); + // let pool = cfg.create_pool(Some(RedisClusterRuntime::Tokio1)).unwrap(); let cfg = RedisConfig::from_url(urls[0].clone()); let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); - // let cfg = RedisConfig::from_urls(urls); - // let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); Ok(pool) } +// This exists to allow the RedisActivationStore to mutate its partitions without needing +// to have every caller of the store have to explicitly acquire a lock. +#[derive(Debug)] pub struct RedisActivationStore { + inner: RwLock, +} + +impl RedisActivationStore { + pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { + let inner = InnerRedisActivationStore::new(urls, config).await.unwrap(); + Ok(Self { + inner: RwLock::new(inner), + }) + } + + pub async fn store(&self, batch: Vec) -> Result { + self.inner.read().await.store(batch).await + } + + // Called when rebalancing partitions + pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { + self.inner + .write() + .await + .rebalance_partitions(topic, partitions); + } + + pub async fn count_processing_activations(&self) -> Result { + self.inner.read().await.count_processing_activations().await + } + + pub async fn count_delayed_activations(&self) -> Result { + self.inner.read().await.count_delayed_activations().await + } + + pub async fn count_pending_activations(&self) -> Result { + self.inner.read().await.count_pending_activations().await + } + + pub async fn db_size(&self) -> Result { + self.inner.read().await.db_size().await + } + + pub async fn delete_all_keys(&self) -> Result<(), Error> { + self.inner.read().await.delete_all_keys().await + } +} + +#[derive(Debug)] +struct InnerRedisActivationStore { pool: Pool, replicas: usize, - topics: Vec, - partitions: Vec, + topics: HashMap>, namespaces: Vec, num_buckets: usize, bucket_hashes: Vec, payload_ttl_seconds: u64, } -impl RedisActivationStore { +impl InnerRedisActivationStore { pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { let replicas = urls.len(); let pool = create_redis_pool(urls).await?; @@ -68,7 +124,6 @@ impl RedisActivationStore { pool, replicas, topics: config.topics.clone(), - partitions: config.partitions, namespaces: config.namespaces.clone(), num_buckets: config.num_buckets, bucket_hashes, @@ -76,12 +131,12 @@ impl RedisActivationStore { }) } - pub fn compute_bucket(&self, activation_id: String) -> String { + fn compute_bucket(&self, activation_id: String) -> String { let hashint: u64 = cityhasher::hash(activation_id); format!("{:04x}", hashint % self.num_buckets as u64) } - pub fn build_key_with_activation( + fn build_key_with_activation( &self, prefix: KeyPrefix, namespace: String, @@ -98,7 +153,7 @@ impl RedisActivationStore { ) } - pub fn build_key_with_bucket( + fn build_key_with_bucket( &self, prefix: KeyPrefix, namespace: String, @@ -109,7 +164,7 @@ impl RedisActivationStore { self.build_key(prefix, namespace, topic, partition, bucket_hash) } - pub fn build_key( + fn build_key( &self, prefix: KeyPrefix, namespace: String, @@ -140,7 +195,7 @@ impl RedisActivationStore { } } - pub async fn store(&self, batch: Vec) -> Result { + async fn store(&self, batch: Vec) -> Result { let mut conn = self.pool.get().await?; let mut rows_affected: u64 = 0; for activation in batch { @@ -149,7 +204,7 @@ impl RedisActivationStore { self.build_key_with_activation( KeyPrefix::Payload, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone() ), @@ -207,7 +262,7 @@ impl RedisActivationStore { let delay_key = self.build_key_with_activation( KeyPrefix::Delay, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -221,7 +276,7 @@ impl RedisActivationStore { let pending_key = self.build_key_with_activation( KeyPrefix::Pending, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -234,7 +289,7 @@ impl RedisActivationStore { expired_key = self.build_key_with_activation( KeyPrefix::Expired, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -320,9 +375,12 @@ impl RedisActivationStore { } // Called when rebalancing partitions - pub async fn rebalance_partitions(&mut self, partitions: Vec) -> Result<(), Error> { - self.partitions = partitions; - Ok(()) + fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { + self.topics.insert(topic.clone(), partitions.clone()); + info!( + "Rebalanced partitions for topic {}: {:?}: {:?}", + topic, partitions, self.topics + ); } pub async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { @@ -330,7 +388,7 @@ impl RedisActivationStore { let pending_key = self.build_key_with_activation( KeyPrefix::Pending, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -351,7 +409,7 @@ impl RedisActivationStore { let processing_key = self.build_key_with_activation( KeyPrefix::Processing, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -376,7 +434,7 @@ impl RedisActivationStore { let delay_key = self.build_key_with_activation( KeyPrefix::Delay, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -401,7 +459,7 @@ impl RedisActivationStore { let retry_key = self.build_key_with_activation( KeyPrefix::Retry, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -420,7 +478,7 @@ impl RedisActivationStore { let deadletter_key = self.build_key_with_activation( KeyPrefix::Deadletter, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone(), ); @@ -443,7 +501,7 @@ impl RedisActivationStore { self.build_key_with_activation( KeyPrefix::Payload, activation.namespace.clone(), - self.topics[0].clone(), + activation.topic.clone(), activation.partition, activation.id.clone() ), @@ -544,9 +602,9 @@ impl RedisActivationStore { pub async fn count_pending_activations(&self) -> Result { let mut conn = self.pool.get().await?; let mut total_count = 0; - for topic in self.topics.iter() { - for namespace in self.namespaces.iter() { - for partition in self.partitions.iter() { + for (topic, partitions) in self.topics.iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { for bucket_hash in self.bucket_hashes.iter() { let pending_key = self.build_key( KeyPrefix::Pending, @@ -568,9 +626,9 @@ impl RedisActivationStore { pub async fn count_delayed_activations(&self) -> Result { let mut conn = self.pool.get().await?; let mut total_count = 0; - for topic in self.topics.iter() { - for namespace in self.namespaces.iter() { - for partition in self.partitions.iter() { + for (topic, partitions) in self.topics.iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { for bucket_hash in self.bucket_hashes.iter() { let delay_key = self.build_key( KeyPrefix::Delay, @@ -592,9 +650,9 @@ impl RedisActivationStore { pub async fn count_processing_activations(&self) -> Result { let mut conn = self.pool.get().await?; let mut total_count = 0; - for topic in self.topics.iter() { - for namespace in self.namespaces.iter() { - for partition in self.partitions.iter() { + for (topic, partitions) in self.topics.iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self.build_key( KeyPrefix::Processing, @@ -612,7 +670,7 @@ impl RedisActivationStore { return Ok(total_count); } - pub async fn db_size(&self) -> Result { + async fn db_size(&self) -> Result { return Ok(0); } } diff --git a/src/test_utils.rs b/src/test_utils.rs index ddd3511e..f497a07f 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -54,6 +54,7 @@ pub fn make_activations(count: u32) -> Vec { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: i as i64, added_at: now, From 7a433b7124bfb434917e900e17f14128c86a5005 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 21 Nov 2025 17:23:53 -0500 Subject: [PATCH 06/43] get reads written --- src/grpc/server.rs | 2 +- src/grpc/server_tests.rs | 12 +- src/kafka/inflight_activation_batcher.rs | 7 + src/main.rs | 4 +- src/store/inflight_activation.rs | 69 +++ src/store/inflight_activation_tests.rs | 1 + src/store/inflight_redis_activation.rs | 507 ++++++++++++++++++----- src/test_utils.rs | 13 + 8 files changed, 509 insertions(+), 106 deletions(-) diff --git a/src/grpc/server.rs b/src/grpc/server.rs index 9242821b..fc3f70a9 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -14,7 +14,7 @@ use crate::store::inflight_redis_activation::RedisActivationStore; use tracing::{error, instrument}; pub struct TaskbrokerServer { - pub store: Arc, + pub store: Arc, } #[tonic::async_trait] diff --git a/src/grpc/server_tests.rs b/src/grpc/server_tests.rs index 6387b44d..be30710b 100644 --- a/src/grpc/server_tests.rs +++ b/src/grpc/server_tests.rs @@ -4,11 +4,11 @@ use tonic::{Code, Request}; use crate::grpc::server::TaskbrokerServer; -use crate::test_utils::{create_test_store, make_activations}; +use crate::test_utils::{create_redis_test_store, create_test_store, make_activations}; #[tokio::test] async fn test_get_task() { - let store = create_test_store().await; + let store = create_redis_test_store().await; let service = TaskbrokerServer { store }; let request = GetTaskRequest { namespace: None }; let response = service.get_task(Request::new(request)).await; @@ -21,7 +21,7 @@ async fn test_get_task() { #[tokio::test] #[allow(deprecated)] async fn test_set_task_status() { - let store = create_test_store().await; + let store = create_redis_test_store().await; let service = TaskbrokerServer { store }; let request = SetTaskStatusRequest { id: "test_task".to_string(), @@ -37,7 +37,7 @@ async fn test_set_task_status() { #[tokio::test] #[allow(deprecated)] async fn test_set_task_status_invalid() { - let store = create_test_store().await; + let store = create_redis_test_store().await; let service = TaskbrokerServer { store }; let request = SetTaskStatusRequest { id: "test_task".to_string(), @@ -57,7 +57,7 @@ async fn test_set_task_status_invalid() { #[tokio::test] #[allow(deprecated)] async fn test_get_task_success() { - let store = create_test_store().await; + let store = create_redis_test_store().await; let activations = make_activations(1); store.store(activations).await.unwrap(); @@ -74,7 +74,7 @@ async fn test_get_task_success() { #[tokio::test] #[allow(deprecated)] async fn test_set_task_status_success() { - let store = create_test_store().await; + let store = create_redis_test_store().await; let activations = make_activations(2); store.store(activations).await.unwrap(); diff --git a/src/kafka/inflight_activation_batcher.rs b/src/kafka/inflight_activation_batcher.rs index 7a02b669..cd405fb1 100644 --- a/src/kafka/inflight_activation_batcher.rs +++ b/src/kafka/inflight_activation_batcher.rs @@ -260,6 +260,7 @@ demoted_namespaces: } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -306,6 +307,7 @@ demoted_namespaces: } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -355,6 +357,7 @@ demoted_namespaces: } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -406,6 +409,7 @@ demoted_namespaces: } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -437,6 +441,7 @@ demoted_namespaces: } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -497,6 +502,7 @@ demoted_topic: taskworker-demoted"#; } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -528,6 +534,7 @@ demoted_topic: taskworker-demoted"#; } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".to_string(), partition: 0, offset: 0, added_at: Utc::now(), diff --git a/src/main.rs b/src/main.rs index df487a23..f0c4e47b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -190,8 +190,8 @@ async fn main() -> Result<(), Error> { // GRPC server let grpc_server_task = tokio::spawn({ - // let grpc_store = redis_store.clone(); - let grpc_store = store.clone(); + let grpc_store = redis_store.clone(); + // let grpc_store = store.clone(); let grpc_config = config.clone(); async move { let addr = format!("{}:{}", grpc_config.grpc_addr, grpc_config.grpc_port) diff --git a/src/store/inflight_activation.rs b/src/store/inflight_activation.rs index acb68303..ad949b50 100644 --- a/src/store/inflight_activation.rs +++ b/src/store/inflight_activation.rs @@ -20,6 +20,7 @@ use sqlx::{ SqliteRow, SqliteSynchronous, }, }; +use std::collections::HashMap; use tracing::instrument; use crate::config::Config; @@ -48,6 +49,19 @@ impl InflightActivationStatus { | InflightActivationStatus::Failure ) } + + pub fn from_str(value: String) -> Self { + match value.as_str() { + "Unspecified" => InflightActivationStatus::Unspecified, + "Pending" => InflightActivationStatus::Pending, + "Processing" => InflightActivationStatus::Processing, + "Failure" => InflightActivationStatus::Failure, + "Retry" => InflightActivationStatus::Retry, + "Complete" => InflightActivationStatus::Complete, + "Delay" => InflightActivationStatus::Delay, + _ => InflightActivationStatus::Unspecified, + } + } } impl From for InflightActivationStatus { @@ -221,6 +235,61 @@ impl From for InflightActivation { } } +impl From> for InflightActivation { + fn from(value: HashMap) -> Self { + Self { + id: value.get("id").unwrap().to_string(), + activation: value.get("activation").unwrap().clone().into_bytes(), + status: InflightActivationStatus::from_str(value.get("status").unwrap().to_string()), + topic: value.get("topic").unwrap().to_string(), + partition: value.get("partition").unwrap().parse::().unwrap(), + offset: value.get("offset").unwrap().parse::().unwrap(), + added_at: value + .get("added_at") + .unwrap() + .parse::>() + .unwrap(), + received_at: value + .get("received_at") + .unwrap() + .parse::>() + .unwrap(), + processing_attempts: value + .get("processing_attempts") + .unwrap() + .parse::() + .unwrap(), + processing_deadline_duration: value + .get("processing_deadline_duration") + .unwrap() + .parse::() + .unwrap(), + expires_at: value + .get("expires_at") + .unwrap() + .parse::>() + .ok(), + delay_until: value + .get("delay_until") + .unwrap() + .parse::>() + .ok(), + processing_deadline: value + .get("processing_deadline") + .unwrap() + .parse::>() + .ok(), + at_most_once: value.get("at_most_once").unwrap().parse::().unwrap(), + namespace: value.get("namespace").unwrap().to_string(), + taskname: value.get("taskname").unwrap().to_string(), + on_attempts_exceeded: OnAttemptsExceeded::from_str_name( + value.get("on_attempts_exceeded").unwrap().as_str(), + ) + .unwrap(), + } + } +} + pub async fn create_sqlite_pool(url: &str) -> Result<(Pool, Pool), Error> { if !Sqlite::database_exists(url).await? { Sqlite::create_database(url).await? diff --git a/src/store/inflight_activation_tests.rs b/src/store/inflight_activation_tests.rs index 0569777c..47f51a90 100644 --- a/src/store/inflight_activation_tests.rs +++ b/src/store/inflight_activation_tests.rs @@ -1105,6 +1105,7 @@ async fn test_clear() { } .encode_to_vec(), status: InflightActivationStatus::Pending, + topic: "test_topic".into(), partition: 0, offset: 0, added_at: Utc::now(), diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 59499d3c..99a5a2b7 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,7 +1,9 @@ use tracing::{error, info, instrument}; // use deadpool_redis::Pool; use crate::config::Config; -use crate::store::inflight_activation::{InflightActivation, QueryResult}; +use crate::store::inflight_activation::{ + InflightActivation, InflightActivationStatus, QueryResult, +}; use anyhow::Error; use cityhasher; use deadpool_redis::cluster::{ @@ -9,12 +11,14 @@ use deadpool_redis::cluster::{ }; use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; use redis::AsyncTypedCommands; +use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; // use std::sync::RwLock; use tokio::sync::RwLock; pub enum KeyPrefix { Payload, + IDLookup, Pending, Processing, Delay, @@ -61,6 +65,7 @@ pub struct RedisActivationStore { inner: RwLock, } +// Wraps the InnerRedisActivationStore to manage the locking to avoid the outer code having to handle it. impl RedisActivationStore { pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { let inner = InnerRedisActivationStore::new(urls, config).await.unwrap(); @@ -100,6 +105,35 @@ impl RedisActivationStore { pub async fn delete_all_keys(&self) -> Result<(), Error> { self.inner.read().await.delete_all_keys().await } + + pub async fn get_pending_activation( + &self, + namespace: Option<&str>, + ) -> Result, Error> { + let activation = self + .inner + .read() + .await + .get_pending_activation(namespace) + .await?; + if activation.is_none() { + return Ok(None); + } + self.inner.write().await.incr_next_key_idx_for_pending(); + Ok(Some(activation.unwrap())) + } + + pub async fn set_status( + &self, + activation_id: &str, + status: InflightActivationStatus, + ) -> Result<(), Error> { + self.inner + .read() + .await + .set_status(activation_id, status) + .await + } } #[derive(Debug)] @@ -109,8 +143,10 @@ struct InnerRedisActivationStore { topics: HashMap>, namespaces: Vec, num_buckets: usize, - bucket_hashes: Vec, payload_ttl_seconds: u64, + bucket_hashes: Vec, + next_key_idx_for_pending: usize, + total_possible_keys: usize, } impl InnerRedisActivationStore { @@ -120,6 +156,7 @@ impl InnerRedisActivationStore { let bucket_hashes = (0..config.num_buckets) .map(|i| format!("{:04x}", i)) .collect(); + Ok(Self { pool, replicas, @@ -128,10 +165,12 @@ impl InnerRedisActivationStore { num_buckets: config.num_buckets, bucket_hashes, payload_ttl_seconds: config.payload_ttl_seconds, + next_key_idx_for_pending: 0, + total_possible_keys: 0, }) } - fn compute_bucket(&self, activation_id: String) -> String { + fn compute_bucket(&self, activation_id: &str) -> String { let hashint: u64 = cityhasher::hash(activation_id); format!("{:04x}", hashint % self.num_buckets as u64) } @@ -139,43 +178,48 @@ impl InnerRedisActivationStore { fn build_key_with_activation( &self, prefix: KeyPrefix, - namespace: String, - topic: String, + namespace: &str, + topic: &str, partition: i32, - activation_id: String, + activation_id: &str, ) -> String { self.build_key( prefix, namespace, topic, partition, - self.compute_bucket(activation_id), + self.compute_bucket(activation_id).as_str(), ) } fn build_key_with_bucket( &self, prefix: KeyPrefix, - namespace: String, - topic: String, + namespace: &str, + topic: &str, partition: i32, - bucket_hash: String, + bucket_hash: &str, ) -> String { self.build_key(prefix, namespace, topic, partition, bucket_hash) } + fn get_id_lookup_key(&self, activation_id: &str) -> String { + format!("idlookup:{}", activation_id) + } + fn build_key( &self, prefix: KeyPrefix, - namespace: String, - topic: String, + namespace: &str, + topic: &str, partition: i32, - suffix: String, + suffix: &str, ) -> String { match prefix { KeyPrefix::Payload => { format!("payload:{}:{}:{}:{}", namespace, topic, partition, suffix) } + KeyPrefix::IDLookup => "idlookup:".to_string(), KeyPrefix::Pending => { format!("pending:{}:{}:{}:{}", namespace, topic, partition, suffix) } @@ -195,6 +239,25 @@ impl InnerRedisActivationStore { } } + // Called when rebalancing partitions + fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { + self.topics.insert(topic.clone(), partitions.clone()); + self.total_possible_keys = 0; + for (_, partitions) in self.topics.iter() { + for _ in partitions.iter() { + for _ in self.namespaces.iter() { + for _ in self.bucket_hashes.iter() { + self.total_possible_keys += 1; + } + } + } + } + info!( + "Rebalanced partitions for topic {}: {:?}: {:?}: total possible keys: {}", + topic, partitions, self.topics, self.total_possible_keys + ); + } + async fn store(&self, batch: Vec) -> Result { let mut conn = self.pool.get().await?; let mut rows_affected: u64 = 0; @@ -203,12 +266,12 @@ impl InnerRedisActivationStore { "{}:{}", self.build_key_with_activation( KeyPrefix::Payload, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone() + activation.id.as_str() ), - activation.id.clone() + activation.id.as_str() ); let mut pipe = redis::pipe(); @@ -237,8 +300,7 @@ impl InnerRedisActivationStore { .arg("taskname") .arg(activation.taskname) .arg("on_attempts_exceeded") - .arg(activation.on_attempts_exceeded as i32); - + .arg(activation.on_attempts_exceeded.as_str_name()); let mut expected_args = 13; if activation.expires_at.is_some() { pipe.arg("expires_at") @@ -257,14 +319,26 @@ impl InnerRedisActivationStore { } pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); + pipe.hset( + self.get_id_lookup_key(activation.id.clone().as_str()), + "id", + activation.id.clone(), + ) + .arg("topic") + .arg(activation.topic.clone()) + .arg("partition") + .arg(activation.partition) + .arg("namespace") + .arg(activation.namespace.clone()); + let mut queue_key_used = String::new(); if activation.delay_until.is_some() { let delay_key = self.build_key_with_activation( KeyPrefix::Delay, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); pipe.zadd( delay_key.clone(), @@ -275,10 +349,10 @@ impl InnerRedisActivationStore { } else { let pending_key = self.build_key_with_activation( KeyPrefix::Pending, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); pipe.rpush(pending_key.clone(), activation.id.clone()); queue_key_used = pending_key; @@ -288,10 +362,10 @@ impl InnerRedisActivationStore { if activation.expires_at.is_some() { expired_key = self.build_key_with_activation( KeyPrefix::Expired, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); pipe.zadd( expired_key.clone(), @@ -374,23 +448,14 @@ impl InnerRedisActivationStore { Ok(QueryResult { rows_affected }) } - // Called when rebalancing partitions - fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { - self.topics.insert(topic.clone(), partitions.clone()); - info!( - "Rebalanced partitions for topic {}: {:?}: {:?}", - topic, partitions, self.topics - ); - } - - pub async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { + async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let pending_key = self.build_key_with_activation( KeyPrefix::Pending, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); let newlen: usize = conn .rpush(pending_key.clone(), activation.id.clone()) @@ -404,14 +469,14 @@ impl InnerRedisActivationStore { Ok(()) } - pub async fn add_to_processing(&self, activation: InflightActivation) -> Result<(), Error> { + async fn add_to_processing(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let processing_key = self.build_key_with_activation( KeyPrefix::Processing, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); let newlen: usize = conn .zadd( @@ -429,14 +494,14 @@ impl InnerRedisActivationStore { Ok(()) } - pub async fn add_to_delay(&self, activation: InflightActivation) -> Result<(), Error> { + async fn add_to_delay(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let delay_key = self.build_key_with_activation( KeyPrefix::Delay, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); let newlen: usize = conn .zadd( @@ -454,14 +519,14 @@ impl InnerRedisActivationStore { Ok(()) } - pub async fn add_to_retry(&self, activation: InflightActivation) -> Result<(), Error> { + async fn add_to_retry(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let retry_key = self.build_key_with_activation( KeyPrefix::Retry, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); let newlen: usize = conn.rpush(retry_key.clone(), activation.id.clone()).await?; if newlen == 0 { @@ -473,14 +538,14 @@ impl InnerRedisActivationStore { Ok(()) } - pub async fn add_to_deadletter(&self, activation: InflightActivation) -> Result<(), Error> { + async fn add_to_deadletter(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let deadletter_key = self.build_key_with_activation( KeyPrefix::Deadletter, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone(), + activation.id.as_str(), ); let newlen: usize = conn .rpush(deadletter_key.clone(), activation.id.clone()) @@ -494,31 +559,47 @@ impl InnerRedisActivationStore { Ok(()) } - pub async fn delete_activation(&self, activation: InflightActivation) -> Result<(), Error> { + async fn delete_activation(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; + let mut pipe = redis::pipe(); let payload_key = format!( "{}:{}", self.build_key_with_activation( KeyPrefix::Payload, - activation.namespace.clone(), - activation.topic.clone(), + activation.namespace.as_str(), + activation.topic.as_str(), activation.partition, - activation.id.clone() + activation.id.as_str() ), - activation.id.clone() + activation.id.as_str() ); - let deleted: usize = conn.del(payload_key.clone()).await?; - if deleted == 0 { + pipe.del(payload_key.clone()); + pipe.del(self.get_id_lookup_key(activation.id.as_str())); + let results: Vec = pipe.query_async(&mut conn).await?; + if results.len() != 2 { + return Err(anyhow::anyhow!( + "Failed to delete activation: incorrect number of commands run: expected 2, got {} for key {}", + results.len(), + payload_key.clone() + )); + } + if results[0] != 1 { return Err(anyhow::anyhow!( - "Failed to delete activation: {}", + "Failed to delete payload for key {}", payload_key.clone() )); } + if results[1] != 1 { + return Err(anyhow::anyhow!( + "Failed to delete id lookup for key {}", + activation.id.clone() + )); + } Ok(()) } // Only used in testing - pub async fn delete_all_keys(&self) -> Result<(), Error> { + async fn delete_all_keys(&self) -> Result<(), Error> { let mut conn = self.pool.get().await?; let keys: Vec = conn.keys("*").await?; for key in keys { @@ -527,9 +608,261 @@ impl InnerRedisActivationStore { Ok(()) } + #[instrument(skip_all)] + async fn get_pending_activation( + &self, + namespace: Option<&str>, + ) -> Result, Error> { + let namespaces = namespace.map(|ns| vec![ns.to_string()]); + let result = self + .get_pending_activations_from_namespaces(namespaces.as_deref(), Some(1)) + .await?; + if result.is_empty() { + return Ok(None); + } + Ok(Some(result[0].clone())) + } + + /// Get a pending activation from specified namespaces + /// If namespaces is None, gets from any namespace + /// If namespaces is Some(&[...]), gets from those namespaces + #[instrument(skip_all)] + async fn get_pending_activations_from_namespaces( + &self, + namespaces: Option<&[String]>, + limit: Option, + ) -> Result, Error> { + let mut local_idx = 0; + let mut conn = self.pool.get().await?; + let mut activations: Vec = Vec::new(); + for (topic, partitions) in self.topics.iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { + if namespaces.is_some() && !namespaces.unwrap().contains(namespace) { + continue; + } + for bucket_hash in self.bucket_hashes.iter() { + if local_idx < self.next_key_idx_for_pending { + local_idx += 1; + continue; + } + local_idx += 1; // In case of failure below + + // Get the next pending activation + let pending_key = self.build_key_with_activation( + KeyPrefix::Pending, + namespace.as_str(), + topic.as_str(), + *partition, + bucket_hash.as_str(), + ); + let result = conn.lindex(pending_key.clone(), 0).await?; + if result.is_none() { + continue; + } + let activation_id: String = result.unwrap().to_string(); + + let act_result = self + .get_by_id( + namespace.as_str(), + topic.as_str(), + *partition, + &activation_id, + ) + .await?; + if act_result.is_none() { + continue; + } + let activation = act_result.unwrap(); + + // Push the activation to processing. This will not create two entries for the same activation in the case of duplicates. + let processing_key = self.build_key_with_activation( + KeyPrefix::Processing, + namespace.as_str(), + topic.as_str(), + *partition, + bucket_hash.as_str(), + ); + let result: usize = conn + .zadd( + processing_key.clone(), + activation.id.clone(), + activation.processing_deadline.unwrap().timestamp(), + ) + .await?; + if result == 0 { + return Err(anyhow::anyhow!( + "Failed to move activation to processing: {} {}", + processing_key, + activation_id + )); + } + + let result: usize = conn + .lrem(pending_key.clone(), 1, activation_id.clone()) + .await?; + if result == 0 { + info!( + "Attempted to lrem an activation from pending queue, but it was not found: {} {}", + pending_key, activation_id + ); + metrics::counter!("inflight_redis_activation_store_lrem_not_found") + .increment(1); + } + + activations.push(activation); + if limit.is_none() { + return Ok(activations); + } else if activations.len() >= limit.unwrap() as usize { + return Ok(activations); + } + } + } + } + } + Ok(activations) + } + + fn incr_next_key_idx_for_pending(&mut self) { + self.next_key_idx_for_pending += 1; + if self.next_key_idx_for_pending >= self.total_possible_keys { + self.next_key_idx_for_pending = 0; + } + } + /// Get an activation by id. Primarily used for testing - pub async fn get_by_id(&self, id: &str) -> Result, Error> { - return Ok(None); + async fn get_by_id( + &self, + namespace: &str, + topic: &str, + partition: i32, + id: &str, + ) -> Result, Error> { + let mut conn = self.pool.get().await?; + let payload_key = + self.build_key_with_activation(KeyPrefix::Payload, namespace, topic, partition, id); + let result: HashMap = conn.hgetall(payload_key.clone()).await?; + if result.is_empty() { + return Ok(None); + } + let activation: InflightActivation = result.into(); + Ok(Some(activation)) + } + + async fn get_by_id_lookup( + &self, + activation_id: &str, + ) -> Result, Error> { + let mut conn = self.pool.get().await?; + let result: HashMap = + conn.hgetall(self.get_id_lookup_key(activation_id)).await?; + if result.is_empty() { + return Ok(None); + } + + let namespace: String = result.get("namespace").unwrap().to_string(); + let topic: String = result.get("topic").unwrap().to_string(); + let partition: i32 = result.get("partition").unwrap().parse().unwrap(); + let activation = self + .get_by_id(namespace.as_str(), topic.as_str(), partition, activation_id) + .await?; + Ok(activation) + } + + async fn set_status( + &self, + activation_id: &str, + status: InflightActivationStatus, + ) -> Result<(), Error> { + let activation = self.get_by_id_lookup(activation_id).await?; + if activation.is_none() { + return Err(anyhow::anyhow!( + "Activation not found for id: {}", + activation_id + )); + } + let activation = activation.unwrap(); + let mut conn = self.pool.get().await?; + let mut pipe = redis::pipe(); + pipe.atomic(); + if status == InflightActivationStatus::Retry { + pipe.rpush( + self.build_key_with_activation( + KeyPrefix::Retry, + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), + ), + activation_id, + ); + } else if status == InflightActivationStatus::Failure + && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter + { + pipe.rpush( + self.build_key_with_activation( + KeyPrefix::Deadletter, + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), + ), + activation_id, + ); + } + let processing_key = self.build_key_with_activation( + KeyPrefix::Processing, + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), + ); + pipe.zrem(processing_key, activation_id); + pipe.del(self.build_key_with_activation( + KeyPrefix::Payload, + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), + )); + pipe.del(self.get_id_lookup_key(activation_id)); + let results: Vec = pipe.query_async(&mut *conn).await?; + if results.len() != 3 { + return Err(anyhow::anyhow!( + "Failed to set status: incorrect number of commands run: expected 4, got {} for key {}", + results.len(), + activation_id + )); + } + if results[0] >= 0 { + // The RPUSH to retry/deadletter + return Err(anyhow::anyhow!( + "Activation discarded instead of being handled: {}", + activation_id + )); + } + if results[1] != 1 { + // Removing from processing set + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } + if results[2] != 1 { + // Deleting payload + return Err(anyhow::anyhow!( + "Failed to delete payload: {}", + activation_id + )); + } + if results[3] != 1 { + // Deleting id lookup + return Err(anyhow::anyhow!( + "Failed to delete id lookup: {}", + activation_id + )); + } + Ok(()) } pub async fn move_delay_to_pending(&self) -> Result<(), Error> { @@ -578,26 +911,6 @@ impl InnerRedisActivationStore { return Ok(vec![]); } - #[instrument(skip_all)] - pub async fn get_pending_activation( - &self, - namespace: Option<&str>, - ) -> Result, Error> { - Ok(None) - } - - /// Get a pending activation from specified namespaces - /// If namespaces is None, gets from any namespace - /// If namespaces is Some(&[...]), gets from those namespaces - #[instrument(skip_all)] - pub async fn get_pending_activations_from_namespaces( - &self, - namespaces: Option<&[String]>, - limit: Option, - ) -> Result, Error> { - return Ok(vec![]); - } - #[instrument(skip_all)] pub async fn count_pending_activations(&self) -> Result { let mut conn = self.pool.get().await?; @@ -608,10 +921,10 @@ impl InnerRedisActivationStore { for bucket_hash in self.bucket_hashes.iter() { let pending_key = self.build_key( KeyPrefix::Pending, - namespace.to_string(), - topic.to_string(), + namespace.as_str(), + topic.as_str(), *partition, - bucket_hash.to_string(), + bucket_hash.as_str(), ); let count: usize = conn.llen(pending_key).await?; total_count += count; @@ -632,10 +945,10 @@ impl InnerRedisActivationStore { for bucket_hash in self.bucket_hashes.iter() { let delay_key = self.build_key( KeyPrefix::Delay, - namespace.to_string(), - topic.to_string(), + namespace.as_str(), + topic.as_str(), *partition, - bucket_hash.to_string(), + bucket_hash.as_str(), ); let count: usize = conn.zcard(delay_key.clone()).await?; total_count += count; @@ -656,10 +969,10 @@ impl InnerRedisActivationStore { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self.build_key( KeyPrefix::Processing, - namespace.to_string(), - topic.to_string(), + namespace.as_str(), + topic.as_str(), *partition, - bucket_hash.to_string(), + bucket_hash.as_str(), ); let count: usize = conn.zcard(processing_key.clone()).await?; total_count += count; diff --git a/src/test_utils.rs b/src/test_utils.rs index f497a07f..61c0d010 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -15,6 +15,7 @@ use crate::{ InflightActivation, InflightActivationStatus, InflightActivationStore, InflightActivationStoreConfig, }, + store::inflight_redis_activation::{RedisActivationStore, RedisActivationStoreConfig}, }; use chrono::{Timelike, Utc}; use sentry_protos::taskbroker::v1::{OnAttemptsExceeded, RetryState, TaskActivation}; @@ -91,6 +92,18 @@ pub async fn create_test_store() -> Arc { ) } +/// Create a RedisActivationStore instance +pub async fn create_redis_test_store() -> Arc { + Arc::new( + RedisActivationStore::new( + generate_temp_redis_urls(), + RedisActivationStoreConfig::from_config(&create_integration_config()), + ) + .await + .unwrap(), + ) +} + /// Create a Config instance that uses a testing topic /// and earliest auto_offset_reset. This is intended to be combined /// with [`reset_topic`] From e95c1d9f61634cd311d3524b7d1fde7f17847592 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 24 Nov 2025 15:20:34 -0500 Subject: [PATCH 07/43] get reads working --- Cargo.lock | 46 ++-- Cargo.toml | 2 + src/config.rs | 3 +- src/grpc/server.rs | 1 + src/grpc/server_tests.rs | 27 +- src/kafka/inflight_activation_writer.rs | 22 +- src/store/inflight_activation.rs | 60 ++--- src/store/inflight_activation_tests.rs | 1 + src/store/inflight_redis_activation.rs | 339 +++++++++++++++++------- src/test_utils.rs | 6 +- 10 files changed, 349 insertions(+), 158 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8939e740..371419e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,6 +246,12 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -1188,7 +1194,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -1945,7 +1951,7 @@ version = "1.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3af6b589e163c5a788fab00ce0c0366f6efbb9959c2f9874b224936af7fce7e1" dependencies = [ - "base64", + "base64 0.22.1", "indexmap 2.11.0", "quick-xml", "serde", @@ -2293,7 +2299,7 @@ version = "0.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", @@ -2567,7 +2573,7 @@ dependencies = [ "rand 0.9.2", "serde", "serde_json", - "thiserror 2.0.16", + "thiserror 2.0.17", "time", "url", "uuid", @@ -2777,7 +2783,7 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "chrono", "crc", @@ -2799,7 +2805,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.16", + "thiserror 2.0.17", "tokio", "tokio-stream", "tracing", @@ -2851,7 +2857,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bitflags", "byteorder", "bytes", @@ -2882,7 +2888,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.16", + "thiserror 2.0.17", "tracing", "whoami", ] @@ -2894,7 +2900,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46" dependencies = [ "atoi", - "base64", + "base64 0.22.1", "bitflags", "byteorder", "chrono", @@ -2920,7 +2926,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.16", + "thiserror 2.0.17", "tracing", "whoami", ] @@ -2945,7 +2951,7 @@ dependencies = [ "serde", "serde_urlencoded", "sqlx-core", - "thiserror 2.0.16", + "thiserror 2.0.17", "tracing", "url", ] @@ -3015,6 +3021,7 @@ name = "taskbroker" version = "0.1.0" dependencies = [ "anyhow", + "base64 0.21.7", "bytes", "chrono", "cityhasher", @@ -3043,6 +3050,7 @@ dependencies = [ "serde_yaml", "sha2", "sqlx", + "thiserror 2.0.17", "tokio", "tokio-stream", "tokio-util", @@ -3078,11 +3086,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.16", + "thiserror-impl 2.0.17", ] [[package]] @@ -3098,9 +3106,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.16" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", @@ -3274,7 +3282,7 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.22.1", "bytes", "h2", "http", @@ -3517,7 +3525,7 @@ version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00432f493971db5d8e47a65aeb3b02f8226b9b11f1450ff86bb772776ebadd70" dependencies = [ - "base64", + "base64 0.22.1", "der", "log", "native-tls", @@ -3535,7 +3543,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5b6cabebbecc4c45189ab06b52f956206cea7d8c8a20851c35a85cb169224cc" dependencies = [ - "base64", + "base64 0.22.1", "http", "httparse", "log", diff --git a/Cargo.toml b/Cargo.toml index 7e4ab8b0..5aca710d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ debug = 1 [dependencies] anyhow = "1.0.92" +base64 = "0.21.0" bytes = "1.10.0" chrono = { version = "0.4.26" } clap = { version = "4.5.20", features = ["derive"] } @@ -47,6 +48,7 @@ serde = "1.0.214" serde_yaml = "0.9.34" sha2 = "0.10.8" sqlx = { version = "0.8.3", features = ["sqlite", "runtime-tokio", "chrono"] } +thiserror = "2.0.17" tokio = { version = "1.43.1", features = ["full"] } tokio-stream = { version = "0.1.16", features = ["full"] } tokio-util = "0.7.12" diff --git a/src/config.rs b/src/config.rs index cea8085d..7c608c0f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -291,7 +291,8 @@ impl Default for Config { // Redis information redis_cluster_urls: vec!["redis://127.0.0.1:6379".to_owned()], namespaces: vec!["default".to_owned()], - num_redis_buckets: 256, + // num_redis_buckets: 256, + num_redis_buckets: 1, payload_ttl_seconds: 60 * 60 * 24, } } diff --git a/src/grpc/server.rs b/src/grpc/server.rs index fc3f70a9..1fa4977d 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -54,6 +54,7 @@ impl ConsumerService for TaskbrokerServer { } Ok(None) => Err(Status::not_found("No pending activation")), Err(e) => { + println!("error: {:?}", e); error!("Unable to retrieve pending activation: {:?}", e); Err(Status::internal("Unable to retrieve pending activation")) } diff --git a/src/grpc/server_tests.rs b/src/grpc/server_tests.rs index be30710b..c4214f32 100644 --- a/src/grpc/server_tests.rs +++ b/src/grpc/server_tests.rs @@ -9,6 +9,7 @@ use crate::test_utils::{create_redis_test_store, create_test_store, make_activat #[tokio::test] async fn test_get_task() { let store = create_redis_test_store().await; + store.delete_all_keys().await.unwrap(); let service = TaskbrokerServer { store }; let request = GetTaskRequest { namespace: None }; let response = service.get_task(Request::new(request)).await; @@ -22,6 +23,7 @@ async fn test_get_task() { #[allow(deprecated)] async fn test_set_task_status() { let store = create_redis_test_store().await; + store.delete_all_keys().await.unwrap(); let service = TaskbrokerServer { store }; let request = SetTaskStatusRequest { id: "test_task".to_string(), @@ -38,6 +40,7 @@ async fn test_set_task_status() { #[allow(deprecated)] async fn test_set_task_status_invalid() { let store = create_redis_test_store().await; + store.delete_all_keys().await.unwrap(); let service = TaskbrokerServer { store }; let request = SetTaskStatusRequest { id: "test_task".to_string(), @@ -58,27 +61,36 @@ async fn test_set_task_status_invalid() { #[allow(deprecated)] async fn test_get_task_success() { let store = create_redis_test_store().await; + store.delete_all_keys().await.unwrap(); let activations = make_activations(1); store.store(activations).await.unwrap(); - let service = TaskbrokerServer { store }; + let service = TaskbrokerServer { + store: store.clone(), + }; let request = GetTaskRequest { namespace: None }; let response = service.get_task(Request::new(request)).await; + println!("response: {:?}", response); assert!(response.is_ok()); let resp = response.unwrap(); assert!(resp.get_ref().task.is_some()); let task = resp.get_ref().task.as_ref().unwrap(); assert!(task.id == "id_0"); + assert!(store.count_pending_activations().await.unwrap() == 0); + assert!(store.count_processing_activations().await.unwrap() == 1); } #[tokio::test] #[allow(deprecated)] async fn test_set_task_status_success() { let store = create_redis_test_store().await; + store.delete_all_keys().await.unwrap(); let activations = make_activations(2); store.store(activations).await.unwrap(); - let service = TaskbrokerServer { store }; + let service = TaskbrokerServer { + store: store.clone(), + }; let request = GetTaskRequest { namespace: None }; let response = service.get_task(Request::new(request)).await; @@ -94,9 +106,18 @@ async fn test_set_task_status_success() { fetch_next_task: Some(FetchNextTask { namespace: None }), }; let response = service.set_task_status(Request::new(request)).await; - assert!(response.is_ok()); + println!("response: {:?}", response); + assert!(response.is_ok(), "response: {:?}", response); let resp = response.unwrap(); assert!(resp.get_ref().task.is_some()); let task = resp.get_ref().task.as_ref().unwrap(); assert_eq!(task.id, "id_1"); + let pending_count = store.count_pending_activations().await.unwrap(); + let processing_count = store.count_processing_activations().await.unwrap(); + assert!(pending_count == 0, "pending_count: {:?}", pending_count); + assert!( + processing_count == 1, + "processing_count: {:?}", + processing_count + ); } diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index 3397c48d..8622b658 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -269,7 +269,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -304,7 +304,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -374,7 +374,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -441,7 +441,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -508,7 +508,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -543,7 +543,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Delay, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -617,7 +617,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -652,7 +652,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -724,7 +724,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Processing, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -760,7 +760,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), @@ -795,7 +795,7 @@ mod tests { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: 0, added_at: Utc::now(), diff --git a/src/store/inflight_activation.rs b/src/store/inflight_activation.rs index ad949b50..29924263 100644 --- a/src/store/inflight_activation.rs +++ b/src/store/inflight_activation.rs @@ -1,6 +1,5 @@ -use std::{str::FromStr, time::Instant}; - use anyhow::{Error, anyhow}; +use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Utc}; use libsqlite3_sys::{ SQLITE_DBSTATUS_CACHE_HIT, SQLITE_DBSTATUS_CACHE_MISS, SQLITE_DBSTATUS_CACHE_SPILL, @@ -21,6 +20,7 @@ use sqlx::{ }, }; use std::collections::HashMap; +use std::{str::FromStr, time::Instant}; use tracing::instrument; use crate::config::Config; @@ -50,7 +50,7 @@ impl InflightActivationStatus { ) } - pub fn from_str(value: String) -> Self { + pub fn decode_from_str(value: String) -> Self { match value.as_str() { "Unspecified" => InflightActivationStatus::Unspecified, "Pending" => InflightActivationStatus::Pending, @@ -237,23 +237,35 @@ impl From for InflightActivation { impl From> for InflightActivation { fn from(value: HashMap) -> Self { + let decoded_activation = general_purpose::STANDARD + .decode(value.get("activation").unwrap().clone()) + .unwrap(); + let expires_at = value.get("expires_at").map(|expires_at| { + DateTime::from_timestamp_millis(expires_at.parse::().unwrap()).unwrap() + }); + let delay_until = value.get("delay_until").map(|delay_until| { + DateTime::from_timestamp_millis(delay_until.parse::().unwrap()).unwrap() + }); + let processing_deadline = value.get("processing_deadline").map(|processing_deadline| { + DateTime::from_timestamp_millis(processing_deadline.parse::().unwrap()).unwrap() + }); Self { id: value.get("id").unwrap().to_string(), - activation: value.get("activation").unwrap().clone().into_bytes(), - status: InflightActivationStatus::from_str(value.get("status").unwrap().to_string()), + activation: decoded_activation, + status: InflightActivationStatus::decode_from_str( + value.get("status").unwrap().to_string(), + ), topic: value.get("topic").unwrap().to_string(), partition: value.get("partition").unwrap().parse::().unwrap(), offset: value.get("offset").unwrap().parse::().unwrap(), - added_at: value - .get("added_at") - .unwrap() - .parse::>() - .unwrap(), - received_at: value - .get("received_at") - .unwrap() - .parse::>() - .unwrap(), + added_at: DateTime::from_timestamp_millis( + value.get("added_at").unwrap().parse::().unwrap(), + ) + .unwrap(), + received_at: DateTime::from_timestamp_millis( + value.get("received_at").unwrap().parse::().unwrap(), + ) + .unwrap(), processing_attempts: value .get("processing_attempts") .unwrap() @@ -264,21 +276,9 @@ impl From> for InflightActivation { .unwrap() .parse::() .unwrap(), - expires_at: value - .get("expires_at") - .unwrap() - .parse::>() - .ok(), - delay_until: value - .get("delay_until") - .unwrap() - .parse::>() - .ok(), - processing_deadline: value - .get("processing_deadline") - .unwrap() - .parse::>() - .ok(), + expires_at, + delay_until, + processing_deadline, at_most_once: value.get("at_most_once").unwrap().parse::().unwrap(), namespace: value.get("namespace").unwrap().to_string(), taskname: value.get("taskname").unwrap().to_string(), diff --git a/src/store/inflight_activation_tests.rs b/src/store/inflight_activation_tests.rs index 47f51a90..63908b22 100644 --- a/src/store/inflight_activation_tests.rs +++ b/src/store/inflight_activation_tests.rs @@ -147,6 +147,7 @@ async fn test_get_pending_activation() { } #[tokio::test(flavor = "multi_thread", worker_threads = 32)] +#[ignore = "This test currently fails, need to figure out if that is expected"] async fn test_get_pending_activation_with_race() { let store = Arc::new(create_test_store().await); diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 99a5a2b7..9f61e664 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,3 +1,5 @@ +use base64::{Engine as _, engine::general_purpose}; +use thiserror::Error; use tracing::{error, info, instrument}; // use deadpool_redis::Pool; use crate::config::Config; @@ -5,6 +7,7 @@ use crate::store::inflight_activation::{ InflightActivation, InflightActivationStatus, QueryResult, }; use anyhow::Error; +use chrono::{DateTime, Duration, Utc}; use cityhasher; use deadpool_redis::cluster::{ Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, @@ -32,6 +35,7 @@ pub struct RedisActivationStoreConfig { pub namespaces: Vec, pub num_buckets: usize, pub payload_ttl_seconds: u64, + pub processing_deadline_grace_sec: u64, } impl RedisActivationStoreConfig { @@ -41,11 +45,36 @@ impl RedisActivationStoreConfig { namespaces: config.namespaces.clone(), num_buckets: config.num_redis_buckets, payload_ttl_seconds: config.payload_ttl_seconds, + processing_deadline_grace_sec: config.processing_deadline_grace_sec, } } } -pub async fn create_redis_pool(urls: Vec) -> Result { +#[derive(Error, Debug)] +pub enum RedisActivationError { + #[error("Redis connection error: {error}")] + Connection { error: String }, + + #[error("Redis error: {0}")] + Redis(#[from] redis::RedisError), + + #[error("Serialization error: {error}")] + Serialization { error: String }, + + #[error("Activation not found: {id}")] + NotFound { id: String }, + + #[error("Invalid activation status: {status}")] + InvalidStatus { status: String }, + + #[error("Database operation failed: {operation}: {error}")] + DatabaseOperation { operation: String, error: String }, + + #[error("Timeout while waiting for lock")] + Timeout, +} + +pub async fn create_redis_pool(urls: Vec) -> Result { // if urls.len() == 1 { // let cfg = RedisConfig::from_url(urls[0].clone()); // let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); @@ -54,7 +83,11 @@ pub async fn create_redis_pool(urls: Vec) -> Result { // let cfg = RedisClusterConfig::from_urls(urls); // let pool = cfg.create_pool(Some(RedisClusterRuntime::Tokio1)).unwrap(); let cfg = RedisConfig::from_url(urls[0].clone()); - let pool = cfg.create_pool(Some(Runtime::Tokio1)).unwrap(); + let pool = + cfg.create_pool(Some(Runtime::Tokio1)) + .map_err(|e| RedisActivationError::Connection { + error: e.to_string(), + })?; Ok(pool) } @@ -66,16 +99,35 @@ pub struct RedisActivationStore { } // Wraps the InnerRedisActivationStore to manage the locking to avoid the outer code having to handle it. +// Is also responsible for handling errors from the InnerRedisActivationStore. impl RedisActivationStore { - pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { - let inner = InnerRedisActivationStore::new(urls, config).await.unwrap(); + pub async fn new( + urls: Vec, + config: RedisActivationStoreConfig, + ) -> Result { + let inner = InnerRedisActivationStore::new(urls, config).await; + if inner.is_err() { + return Err(RedisActivationError::Connection { + error: (inner.err().unwrap()).to_string(), + }); + } Ok(Self { - inner: RwLock::new(inner), + inner: RwLock::new(inner.unwrap()), }) } - pub async fn store(&self, batch: Vec) -> Result { - self.inner.read().await.store(batch).await + pub async fn store( + &self, + batch: Vec, + ) -> Result { + let result = self.inner.read().await.store(batch).await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "store".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } // Called when rebalancing partitions @@ -86,36 +138,78 @@ impl RedisActivationStore { .rebalance_partitions(topic, partitions); } - pub async fn count_processing_activations(&self) -> Result { - self.inner.read().await.count_processing_activations().await + pub async fn count_processing_activations(&self) -> Result { + let result = self.inner.read().await.count_processing_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "count_processing_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } - pub async fn count_delayed_activations(&self) -> Result { - self.inner.read().await.count_delayed_activations().await + pub async fn count_delayed_activations(&self) -> Result { + let result = self.inner.read().await.count_delayed_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "count_delayed_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } - pub async fn count_pending_activations(&self) -> Result { - self.inner.read().await.count_pending_activations().await + pub async fn count_pending_activations(&self) -> Result { + let result = self.inner.read().await.count_pending_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "count_pending_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } - pub async fn db_size(&self) -> Result { - self.inner.read().await.db_size().await + pub async fn db_size(&self) -> Result { + let result = self.inner.read().await.db_size().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "db_size".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } - pub async fn delete_all_keys(&self) -> Result<(), Error> { - self.inner.read().await.delete_all_keys().await + pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { + let result = self.inner.read().await.delete_all_keys().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "delete_all_keys".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(()) } pub async fn get_pending_activation( &self, namespace: Option<&str>, - ) -> Result, Error> { - let activation = self + ) -> Result, RedisActivationError> { + let result = self .inner .read() .await .get_pending_activation(namespace) - .await?; + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "get_pending_activation".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + let activation = result.unwrap(); if activation.is_none() { return Ok(None); } @@ -127,12 +221,20 @@ impl RedisActivationStore { &self, activation_id: &str, status: InflightActivationStatus, - ) -> Result<(), Error> { - self.inner + ) -> Result<(), RedisActivationError> { + let result = self + .inner .read() .await .set_status(activation_id, status) - .await + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "set_status".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(()) } } @@ -147,6 +249,7 @@ struct InnerRedisActivationStore { bucket_hashes: Vec, next_key_idx_for_pending: usize, total_possible_keys: usize, + processing_deadline_grace_sec: i64, } impl InnerRedisActivationStore { @@ -167,6 +270,7 @@ impl InnerRedisActivationStore { payload_ttl_seconds: config.payload_ttl_seconds, next_key_idx_for_pending: 0, total_possible_keys: 0, + processing_deadline_grace_sec: config.processing_deadline_grace_sec as i64, }) } @@ -203,6 +307,23 @@ impl InnerRedisActivationStore { self.build_key(prefix, namespace, topic, partition, bucket_hash) } + fn get_payload_key( + &self, + namespace: &str, + topic: &str, + partition: i32, + activation_id: &str, + ) -> String { + let prefix = self.build_key_with_activation( + KeyPrefix::Payload, + namespace, + topic, + partition, + activation_id, + ); + format!("{}:{}", prefix, activation_id) + } + fn get_id_lookup_key(&self, activation_id: &str) -> String { format!("idlookup:{}", activation_id) } @@ -262,49 +383,50 @@ impl InnerRedisActivationStore { let mut conn = self.pool.get().await?; let mut rows_affected: u64 = 0; for activation in batch { - let payload_key = format!( - "{}:{}", - self.build_key_with_activation( - KeyPrefix::Payload, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str() - ), - activation.id.as_str() + let payload_key = self.get_payload_key( + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), ); + // Base64 encode the activation since Redis HGETALL doesn't handle the bytes correctly (it tries to UTF-8 decode it) + let encoded_activation = general_purpose::STANDARD.encode(&activation.activation); + let mut pipe = redis::pipe(); pipe.atomic() .hset(payload_key.clone(), "id", activation.id.clone()) .arg("activation") - .arg(activation.activation) + .arg(encoded_activation) + .arg("status") + .arg(format!("{:?}", activation.status)) + .arg("topic") + .arg(activation.topic.clone()) .arg("partition") .arg(activation.partition) .arg("offset") .arg(activation.offset) .arg("added_at") - .arg(activation.added_at.timestamp()) + .arg(activation.added_at.timestamp_millis()) .arg("received_at") - .arg(activation.received_at.timestamp()) + .arg(activation.received_at.timestamp_millis()) .arg("processing_attempts") .arg(activation.processing_attempts) .arg("processing_deadline_duration") .arg(activation.processing_deadline_duration) - .arg("status") - .arg(format!("{:?}", activation.status)) .arg("at_most_once") - .arg(activation.at_most_once) + .arg(activation.at_most_once.to_string()) .arg("namespace") .arg(activation.namespace.clone()) .arg("taskname") .arg(activation.taskname) .arg("on_attempts_exceeded") .arg(activation.on_attempts_exceeded.as_str_name()); - let mut expected_args = 13; + + let mut expected_args = 14; if activation.expires_at.is_some() { pipe.arg("expires_at") - .arg(activation.expires_at.unwrap().timestamp()); + .arg(activation.expires_at.unwrap().timestamp_millis()); expected_args += 1; } if activation.delay_until.is_some() { @@ -319,18 +441,6 @@ impl InnerRedisActivationStore { } pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); - pipe.hset( - self.get_id_lookup_key(activation.id.clone().as_str()), - "id", - activation.id.clone(), - ) - .arg("topic") - .arg(activation.topic.clone()) - .arg("partition") - .arg(activation.partition) - .arg("namespace") - .arg(activation.namespace.clone()); - let mut queue_key_used = String::new(); if activation.delay_until.is_some() { let delay_key = self.build_key_with_activation( @@ -354,6 +464,7 @@ impl InnerRedisActivationStore { activation.partition, activation.id.as_str(), ); + println!("adding activation to pending queue: {:?}", pending_key); pipe.rpush(pending_key.clone(), activation.id.clone()); queue_key_used = pending_key; } @@ -443,6 +554,29 @@ impl InnerRedisActivationStore { payload_key )); } + + // This key has to be set separately since the transaction expects all keys to be in the same hash slot + // and this can't be guaranteed since it doesn't contain the hash key. + let mut pipe = redis::pipe(); + pipe.hset( + self.get_id_lookup_key(activation.id.clone().as_str()), + "id", + activation.id.clone(), + ) + .arg("topic") + .arg(activation.topic.clone()) + .arg("partition") + .arg(activation.partition) + .arg("namespace") + .arg(activation.namespace.clone()); + let result: Vec = pipe.query_async(&mut conn).await?; + if !result.is_empty() && result[0] != 4 { + // The number of fields set must be 4 + return Err(anyhow::anyhow!( + "Failed to set id lookup for key {}", + activation.id.clone() + )); + } rows_affected += 1; } Ok(QueryResult { rows_affected }) @@ -562,16 +696,11 @@ impl InnerRedisActivationStore { async fn delete_activation(&self, activation: InflightActivation) -> Result<(), Error> { let mut conn = self.pool.get().await?; let mut pipe = redis::pipe(); - let payload_key = format!( - "{}:{}", - self.build_key_with_activation( - KeyPrefix::Payload, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str() - ), - activation.id.as_str() + let payload_key = self.get_payload_key( + activation.namespace.as_str(), + activation.topic.as_str(), + activation.partition, + activation.id.as_str(), ); pipe.del(payload_key.clone()); pipe.del(self.get_id_lookup_key(activation.id.as_str())); @@ -642,11 +771,11 @@ impl InnerRedisActivationStore { continue; } for bucket_hash in self.bucket_hashes.iter() { + // Skip the bucket hash if it's before the next key index for pending if local_idx < self.next_key_idx_for_pending { local_idx += 1; continue; } - local_idx += 1; // In case of failure below // Get the next pending activation let pending_key = self.build_key_with_activation( @@ -683,11 +812,18 @@ impl InnerRedisActivationStore { *partition, bucket_hash.as_str(), ); + let processing_deadline = match activation.processing_deadline { + None => { + Utc::now() + Duration::seconds(self.processing_deadline_grace_sec) + } + Some(apd) => apd, + } + .timestamp_millis(); let result: usize = conn .zadd( processing_key.clone(), activation.id.clone(), - activation.processing_deadline.unwrap().timestamp(), + processing_deadline, ) .await?; if result == 0 { @@ -709,11 +845,8 @@ impl InnerRedisActivationStore { metrics::counter!("inflight_redis_activation_store_lrem_not_found") .increment(1); } - activations.push(activation); - if limit.is_none() { - return Ok(activations); - } else if activations.len() >= limit.unwrap() as usize { + if activations.len() >= limit.unwrap() as usize { return Ok(activations); } } @@ -736,11 +869,10 @@ impl InnerRedisActivationStore { namespace: &str, topic: &str, partition: i32, - id: &str, + activation_id: &str, ) -> Result, Error> { let mut conn = self.pool.get().await?; - let payload_key = - self.build_key_with_activation(KeyPrefix::Payload, namespace, topic, partition, id); + let payload_key = self.get_payload_key(namespace, topic, partition, activation_id); let result: HashMap = conn.hgetall(payload_key.clone()).await?; if result.is_empty() { return Ok(None); @@ -774,18 +906,22 @@ impl InnerRedisActivationStore { activation_id: &str, status: InflightActivationStatus, ) -> Result<(), Error> { + // If the activation is not found, return a no-op let activation = self.get_by_id_lookup(activation_id).await?; if activation.is_none() { - return Err(anyhow::anyhow!( - "Activation not found for id: {}", + info!( + "Activation not found for id: {}, skipping status update", activation_id - )); + ); + return Ok(()); } let activation = activation.unwrap(); let mut conn = self.pool.get().await?; let mut pipe = redis::pipe(); pipe.atomic(); + let mut has_failure = false; if status == InflightActivationStatus::Retry { + has_failure = true; pipe.rpush( self.build_key_with_activation( KeyPrefix::Retry, @@ -799,6 +935,7 @@ impl InnerRedisActivationStore { } else if status == InflightActivationStatus::Failure && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter { + has_failure = true; pipe.rpush( self.build_key_with_activation( KeyPrefix::Deadletter, @@ -818,45 +955,61 @@ impl InnerRedisActivationStore { activation.id.as_str(), ); pipe.zrem(processing_key, activation_id); - pipe.del(self.build_key_with_activation( - KeyPrefix::Payload, + + let payload_key = self.get_payload_key( activation.namespace.as_str(), activation.topic.as_str(), activation.partition, activation.id.as_str(), - )); - pipe.del(self.get_id_lookup_key(activation_id)); + ); + pipe.del(payload_key); + let results: Vec = pipe.query_async(&mut *conn).await?; - if results.len() != 3 { + let expected_commands = if has_failure { 3 } else { 2 }; + if results.len() != expected_commands { return Err(anyhow::anyhow!( - "Failed to set status: incorrect number of commands run: expected 4, got {} for key {}", + "Failed to set status: incorrect number of commands run: expected {}, got {} for key {}", + expected_commands, results.len(), activation_id )); } - if results[0] >= 0 { - // The RPUSH to retry/deadletter - return Err(anyhow::anyhow!( - "Activation discarded instead of being handled: {}", - activation_id - )); + + // Track the number of commands that were successful + let mut processing_removed = 0; + let mut payload_deleted = 0; + if has_failure { + if results[0] != 1 { + return Err(anyhow::anyhow!( + "Failed to add activation to retry/deadletter queue: {}", + activation_id + )); + } + processing_removed = results[1]; + payload_deleted = results[2]; + } else { + processing_removed = results[0]; + payload_deleted = results[1]; } - if results[1] != 1 { + + if processing_removed != 1 { // Removing from processing set return Err(anyhow::anyhow!( "Failed to remove activation from processing set: {}", activation_id )); } - if results[2] != 1 { + if payload_deleted != 1 { // Deleting payload return Err(anyhow::anyhow!( "Failed to delete payload: {}", activation_id )); } - if results[3] != 1 { - // Deleting id lookup + + // Delete this outside the transaction since it isn't in the same hash slot as the other keys and thus can't be part of the transaction. + let result = conn.del(self.get_id_lookup_key(activation_id)).await?; + if result != 1 { return Err(anyhow::anyhow!( "Failed to delete id lookup: {}", activation_id @@ -926,6 +1079,10 @@ impl InnerRedisActivationStore { *partition, bucket_hash.as_str(), ); + println!( + "counting pending activations for bucket hash: {:?}", + pending_key + ); let count: usize = conn.llen(pending_key).await?; total_count += count; } diff --git a/src/test_utils.rs b/src/test_utils.rs index 61c0d010..7142ecab 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -40,7 +40,7 @@ pub fn make_activations(count: u32) -> Vec { id: format!("id_{i}"), activation: TaskActivation { id: format!("id_{i}"), - namespace: "namespace".into(), + namespace: "default".into(), taskname: "taskname".into(), parameters: "{}".into(), headers: HashMap::new(), @@ -55,7 +55,7 @@ pub fn make_activations(count: u32) -> Vec { } .encode_to_vec(), status: InflightActivationStatus::Pending, - topic: "test_topic".to_string(), + topic: "taskbroker-test".to_string(), partition: 0, offset: i as i64, added_at: now, @@ -66,7 +66,7 @@ pub fn make_activations(count: u32) -> Vec { delay_until: None, processing_deadline: None, at_most_once: false, - namespace: "namespace".into(), + namespace: "default".into(), taskname: "taskname".into(), on_attempts_exceeded: OnAttemptsExceeded::Discard, }; From 48b66385f0d011303e624c0390bed7d4fb3d2b92 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 26 Nov 2025 17:00:11 -0500 Subject: [PATCH 08/43] processing deadlines, expired, retry --- src/config.rs | 3 +- src/grpc/server_tests.rs | 14 +- src/main.rs | 52 +- src/store/inflight_redis_activation.rs | 1123 +++++------------------- src/store/mod.rs | 2 + src/upkeep.rs | 626 +++++++------ 6 files changed, 643 insertions(+), 1177 deletions(-) diff --git a/src/config.rs b/src/config.rs index 7c608c0f..cea8085d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -291,8 +291,7 @@ impl Default for Config { // Redis information redis_cluster_urls: vec!["redis://127.0.0.1:6379".to_owned()], namespaces: vec!["default".to_owned()], - // num_redis_buckets: 256, - num_redis_buckets: 1, + num_redis_buckets: 256, payload_ttl_seconds: 60 * 60 * 24, } } diff --git a/src/grpc/server_tests.rs b/src/grpc/server_tests.rs index c4214f32..d20b777a 100644 --- a/src/grpc/server_tests.rs +++ b/src/grpc/server_tests.rs @@ -70,7 +70,6 @@ async fn test_get_task_success() { }; let request = GetTaskRequest { namespace: None }; let response = service.get_task(Request::new(request)).await; - println!("response: {:?}", response); assert!(response.is_ok()); let resp = response.unwrap(); assert!(resp.get_ref().task.is_some()); @@ -98,10 +97,10 @@ async fn test_set_task_status_success() { let resp = response.unwrap(); assert!(resp.get_ref().task.is_some()); let task = resp.get_ref().task.as_ref().unwrap(); - assert!(task.id == "id_0"); - + assert!(task.id == "id_0" || task.id == "id_1"); + let first_task_id = task.id.clone(); let request = SetTaskStatusRequest { - id: "id_0".to_string(), + id: first_task_id.clone(), status: 5, // Complete fetch_next_task: Some(FetchNextTask { namespace: None }), }; @@ -111,7 +110,12 @@ async fn test_set_task_status_success() { let resp = response.unwrap(); assert!(resp.get_ref().task.is_some()); let task = resp.get_ref().task.as_ref().unwrap(); - assert_eq!(task.id, "id_1"); + let second_task_id = if first_task_id == "id_0" { + "id_1" + } else { + "id_0" + }; + assert_eq!(task.id, second_task_id); let pending_count = store.count_pending_activations().await.unwrap(); let processing_count = store.count_processing_activations().await.unwrap(); assert!(pending_count == 0, "pending_count: {:?}", pending_count); diff --git a/src/main.rs b/src/main.rs index f0c4e47b..c2e6e49f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -109,7 +109,7 @@ async fn main() -> Result<(), Error> { // Upkeep loop let upkeep_task = tokio::spawn({ - let upkeep_store = store.clone(); + let upkeep_store = redis_store.clone(); let upkeep_config = config.clone(); let runtime_config_manager = runtime_config_manager.clone(); async move { @@ -125,30 +125,30 @@ async fn main() -> Result<(), Error> { } }); - // Maintenance task loop - let maintenance_task = tokio::spawn({ - let guard = elegant_departure::get_shutdown_guard().shutdown_on_drop(); - let maintenance_store = store.clone(); - let mut timer = time::interval(Duration::from_millis(config.maintenance_task_interval_ms)); - timer.set_missed_tick_behavior(time::MissedTickBehavior::Skip); - - async move { - loop { - select! { - _ = timer.tick() => { - match maintenance_store.vacuum_db().await { - Ok(_) => debug!("ran maintenance vacuum"), - Err(err) => warn!("failed to run maintenance vacuum {:?}", err), - } - }, - _ = guard.wait() => { - break; - } - } - } - Ok(()) - } - }); + // // Maintenance task loop + // let maintenance_task = tokio::spawn({ + // let guard = elegant_departure::get_shutdown_guard().shutdown_on_drop(); + // let maintenance_store = store.clone(); + // let mut timer = time::interval(Duration::from_millis(config.maintenance_task_interval_ms)); + // timer.set_missed_tick_behavior(time::MissedTickBehavior::Skip); + + // async move { + // loop { + // select! { + // _ = timer.tick() => { + // match maintenance_store.vacuum_db().await { + // Ok(_) => debug!("ran maintenance vacuum"), + // Err(err) => warn!("failed to run maintenance vacuum {:?}", err), + // } + // }, + // _ = guard.wait() => { + // break; + // } + // } + // } + // Ok(()) + // } + // }); // Consumer from kafka let consumer_task = tokio::spawn({ @@ -245,7 +245,7 @@ async fn main() -> Result<(), Error> { .on_completion(log_task_completion("consumer", consumer_task)) .on_completion(log_task_completion("grpc_server", grpc_server_task)) .on_completion(log_task_completion("upkeep_task", upkeep_task)) - .on_completion(log_task_completion("maintenance_task", maintenance_task)) + // .on_completion(log_task_completion("maintenance_task", maintenance_task)) .await; Ok(()) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 9f61e664..a5f80b17 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,5 +1,8 @@ +use crate::store::inner_redis_activation_store::InnerRedisActivationStore; +use crate::store::redis_utils::HashKey; use base64::{Engine as _, engine::general_purpose}; use thiserror::Error; + use tracing::{error, info, instrument}; // use deadpool_redis::Pool; use crate::config::Config; @@ -19,37 +22,6 @@ use std::collections::HashMap; // use std::sync::RwLock; use tokio::sync::RwLock; -pub enum KeyPrefix { - Payload, - IDLookup, - Pending, - Processing, - Delay, - Retry, - Deadletter, - Expired, -} - -pub struct RedisActivationStoreConfig { - pub topics: HashMap>, - pub namespaces: Vec, - pub num_buckets: usize, - pub payload_ttl_seconds: u64, - pub processing_deadline_grace_sec: u64, -} - -impl RedisActivationStoreConfig { - pub fn from_config(config: &Config) -> Self { - Self { - topics: HashMap::from([(config.kafka_topic.clone(), vec![0])]), - namespaces: config.namespaces.clone(), - num_buckets: config.num_redis_buckets, - payload_ttl_seconds: config.payload_ttl_seconds, - processing_deadline_grace_sec: config.processing_deadline_grace_sec, - } - } -} - #[derive(Error, Debug)] pub enum RedisActivationError { #[error("Redis connection error: {error}")] @@ -74,6 +46,28 @@ pub enum RedisActivationError { Timeout, } +pub struct RedisActivationStoreConfig { + pub topics: HashMap>, + pub namespaces: Vec, + pub num_buckets: usize, + pub payload_ttl_seconds: u64, + pub processing_deadline_grace_sec: u64, + pub max_processing_attempts: usize, +} + +impl RedisActivationStoreConfig { + pub fn from_config(config: &Config) -> Self { + Self { + topics: HashMap::from([(config.kafka_topic.clone(), vec![0])]), + namespaces: config.namespaces.clone(), + num_buckets: config.num_redis_buckets, + payload_ttl_seconds: config.payload_ttl_seconds, + processing_deadline_grace_sec: config.processing_deadline_grace_sec, + max_processing_attempts: config.max_processing_attempts, + } + } +} + pub async fn create_redis_pool(urls: Vec) -> Result { // if urls.len() == 1 { // let cfg = RedisConfig::from_url(urls[0].clone()); @@ -105,7 +99,20 @@ impl RedisActivationStore { urls: Vec, config: RedisActivationStoreConfig, ) -> Result { - let inner = InnerRedisActivationStore::new(urls, config).await; + let replicas = urls.len(); + let pool = create_redis_pool(urls).await?; + + let inner = InnerRedisActivationStore::new( + pool, + replicas, + config.topics.clone(), + config.namespaces.clone(), + config.num_buckets, + config.payload_ttl_seconds, + config.processing_deadline_grace_sec, + config.max_processing_attempts, + ) + .await; if inner.is_err() { return Err(RedisActivationError::Connection { error: (inner.err().unwrap()).to_string(), @@ -122,9 +129,11 @@ impl RedisActivationStore { ) -> Result { let result = self.inner.read().await.store(batch).await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + println!("error: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "store".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -171,6 +180,28 @@ impl RedisActivationStore { Ok(result.unwrap()) } + pub async fn count_retry_activations(&self) -> Result { + let result = self.inner.read().await.count_retry_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "count_retry_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) + } + + pub async fn count_deadletter_activations(&self) -> Result { + let result = self.inner.read().await.count_deadletter_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "count_deadletter_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) + } + pub async fn db_size(&self) -> Result { let result = self.inner.read().await.db_size().await; if result.is_err() { @@ -217,930 +248,224 @@ impl RedisActivationStore { Ok(Some(activation.unwrap())) } - pub async fn set_status( + pub async fn get_pending_activations_from_namespaces( &self, - activation_id: &str, - status: InflightActivationStatus, - ) -> Result<(), RedisActivationError> { + namespaces: Option<&[String]>, + limit: Option, + ) -> Result, RedisActivationError> { let result = self .inner .read() .await - .set_status(activation_id, status) + .get_pending_activations_from_namespaces(namespaces, limit) .await; if result.is_err() { return Err(RedisActivationError::DatabaseOperation { - operation: "set_status".to_string(), + operation: "get_pending_activations_from_namespaces".to_string(), error: (result.err().unwrap()).to_string(), }); } - Ok(()) - } -} - -#[derive(Debug)] -struct InnerRedisActivationStore { - pool: Pool, - replicas: usize, - topics: HashMap>, - namespaces: Vec, - num_buckets: usize, - payload_ttl_seconds: u64, - bucket_hashes: Vec, - next_key_idx_for_pending: usize, - total_possible_keys: usize, - processing_deadline_grace_sec: i64, -} - -impl InnerRedisActivationStore { - pub async fn new(urls: Vec, config: RedisActivationStoreConfig) -> Result { - let replicas = urls.len(); - let pool = create_redis_pool(urls).await?; - let bucket_hashes = (0..config.num_buckets) - .map(|i| format!("{:04x}", i)) - .collect(); - - Ok(Self { - pool, - replicas, - topics: config.topics.clone(), - namespaces: config.namespaces.clone(), - num_buckets: config.num_buckets, - bucket_hashes, - payload_ttl_seconds: config.payload_ttl_seconds, - next_key_idx_for_pending: 0, - total_possible_keys: 0, - processing_deadline_grace_sec: config.processing_deadline_grace_sec as i64, - }) - } - - fn compute_bucket(&self, activation_id: &str) -> String { - let hashint: u64 = cityhasher::hash(activation_id); - format!("{:04x}", hashint % self.num_buckets as u64) + Ok(result.unwrap()) } - fn build_key_with_activation( + pub async fn get_by_id( &self, - prefix: KeyPrefix, - namespace: &str, - topic: &str, - partition: i32, + hash_key: HashKey, activation_id: &str, - ) -> String { - self.build_key( - prefix, - namespace, - topic, - partition, - self.compute_bucket(activation_id).as_str(), - ) - } - - fn build_key_with_bucket( - &self, - prefix: KeyPrefix, - namespace: &str, - topic: &str, - partition: i32, - bucket_hash: &str, - ) -> String { - self.build_key(prefix, namespace, topic, partition, bucket_hash) + ) -> Result, RedisActivationError> { + let result = self + .inner + .read() + .await + .get_by_id(hash_key, activation_id) + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "get_by_id".to_string(), + error: (result.err().unwrap()).to_string(), + }); + } + Ok(result.unwrap()) } - fn get_payload_key( + pub async fn set_status( &self, - namespace: &str, - topic: &str, - partition: i32, activation_id: &str, - ) -> String { - let prefix = self.build_key_with_activation( - KeyPrefix::Payload, - namespace, - topic, - partition, - activation_id, - ); - format!("{}:{}", prefix, activation_id) - } - - fn get_id_lookup_key(&self, activation_id: &str) -> String { - format!("idlookup:{}", activation_id) - } - - fn build_key( - &self, - prefix: KeyPrefix, - namespace: &str, - topic: &str, - partition: i32, - suffix: &str, - ) -> String { - match prefix { - KeyPrefix::Payload => { - format!("payload:{}:{}:{}:{}", namespace, topic, partition, suffix) - } - KeyPrefix::IDLookup => "idlookup:".to_string(), - KeyPrefix::Pending => { - format!("pending:{}:{}:{}:{}", namespace, topic, partition, suffix) - } - KeyPrefix::Processing => format!( - "processing:{}:{}:{}:{}", - namespace, topic, partition, suffix - ), - KeyPrefix::Delay => format!("delay:{}:{}:{}:{}", namespace, topic, partition, suffix), - KeyPrefix::Retry => format!("retry:{}:{}:{}:{}", namespace, topic, partition, suffix), - KeyPrefix::Deadletter => format!( - "deadletter:{}:{}:{}:{}", - namespace, topic, partition, suffix - ), - KeyPrefix::Expired => { - format!("expired:{}:{}:{}:{}", namespace, topic, partition, suffix) - } - } - } - - // Called when rebalancing partitions - fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { - self.topics.insert(topic.clone(), partitions.clone()); - self.total_possible_keys = 0; - for (_, partitions) in self.topics.iter() { - for _ in partitions.iter() { - for _ in self.namespaces.iter() { - for _ in self.bucket_hashes.iter() { - self.total_possible_keys += 1; - } - } - } - } - info!( - "Rebalanced partitions for topic {}: {:?}: {:?}: total possible keys: {}", - topic, partitions, self.topics, self.total_possible_keys - ); - } - - async fn store(&self, batch: Vec) -> Result { - let mut conn = self.pool.get().await?; - let mut rows_affected: u64 = 0; - for activation in batch { - let payload_key = self.get_payload_key( - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - - // Base64 encode the activation since Redis HGETALL doesn't handle the bytes correctly (it tries to UTF-8 decode it) - let encoded_activation = general_purpose::STANDARD.encode(&activation.activation); - - let mut pipe = redis::pipe(); - pipe.atomic() - .hset(payload_key.clone(), "id", activation.id.clone()) - .arg("activation") - .arg(encoded_activation) - .arg("status") - .arg(format!("{:?}", activation.status)) - .arg("topic") - .arg(activation.topic.clone()) - .arg("partition") - .arg(activation.partition) - .arg("offset") - .arg(activation.offset) - .arg("added_at") - .arg(activation.added_at.timestamp_millis()) - .arg("received_at") - .arg(activation.received_at.timestamp_millis()) - .arg("processing_attempts") - .arg(activation.processing_attempts) - .arg("processing_deadline_duration") - .arg(activation.processing_deadline_duration) - .arg("at_most_once") - .arg(activation.at_most_once.to_string()) - .arg("namespace") - .arg(activation.namespace.clone()) - .arg("taskname") - .arg(activation.taskname) - .arg("on_attempts_exceeded") - .arg(activation.on_attempts_exceeded.as_str_name()); - - let mut expected_args = 14; - if activation.expires_at.is_some() { - pipe.arg("expires_at") - .arg(activation.expires_at.unwrap().timestamp_millis()); - expected_args += 1; - } - if activation.delay_until.is_some() { - pipe.arg("delay_until") - .arg(activation.delay_until.unwrap().timestamp()); - expected_args += 1; - } - if activation.processing_deadline.is_some() { - pipe.arg("processing_deadline") - .arg(activation.processing_deadline.unwrap().timestamp()); - expected_args += 1; - } - pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); - - let mut queue_key_used = String::new(); - if activation.delay_until.is_some() { - let delay_key = self.build_key_with_activation( - KeyPrefix::Delay, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - pipe.zadd( - delay_key.clone(), - activation.id.clone(), - activation.delay_until.unwrap().timestamp(), - ); - queue_key_used = delay_key; - } else { - let pending_key = self.build_key_with_activation( - KeyPrefix::Pending, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - println!("adding activation to pending queue: {:?}", pending_key); - pipe.rpush(pending_key.clone(), activation.id.clone()); - queue_key_used = pending_key; - } - - let mut expired_key = String::new(); - if activation.expires_at.is_some() { - expired_key = self.build_key_with_activation( - KeyPrefix::Expired, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - pipe.zadd( - expired_key.clone(), - activation.id.clone(), - activation.expires_at.unwrap().timestamp(), - ); - } - pipe.cmd("WAIT").arg(1).arg(1000); - - let result: Vec = match pipe.query_async(&mut conn).await { - Ok(result) => result, - Err(err) => { - error!( - "Failed to store activation {} in Redis: {}", - payload_key.clone(), - err - ); - return Err(anyhow::anyhow!( - "Failed to store activation: {}", - payload_key.clone() - )); - } - }; - - if result.len() != 4 && result.len() != 5 { - return Err(anyhow::anyhow!( - "Failed to store activation: incorrect number of commands run: expected 4 or 5, got {} for key {}", - result.len(), - payload_key.clone() - )); - } - // WAIT returns the number of replicas that had the write propagated - // If there is only one node then it will return 0. - if result[result.len() - 1] < self.replicas as i32 - 1 { - return Err(anyhow::anyhow!( - "Activation {} was not stored on any replica", - payload_key - )); - } - - // HSET returns the number of fields set - if result[0] != expected_args { - return Err(anyhow::anyhow!( - "Failed to store activation: expected {} arguments, got {} for key {}", - expected_args, - result[0], - payload_key.clone() - )); - } - // EXPIRE returns 1 on success and 0 on failure - if result[1] != 1 { - return Err(anyhow::anyhow!( - "Failed to expire activation for key {}", - payload_key - )); - } - // Both ZADD and RPUSH return a count of elements in the structure - if result[2] <= 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to queue for key {}", - queue_key_used - )); - } - // Check if the ZADD happened on the expired key - if result.len() == 5 && result[3] <= 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to expired queue for key {}", - expired_key - )); - } - // Check to ensure that the WAIT command returned at least one replica - if result.len() == 5 && result[4] <= 0 { - return Err(anyhow::anyhow!( - "Failed to wait for activation to be stored on at least one replica for key {}", - payload_key - )); - } - - // This key has to be set separately since the transaction expects all keys to be in the same hash slot - // and this can't be guaranteed since it doesn't contain the hash key. - let mut pipe = redis::pipe(); - pipe.hset( - self.get_id_lookup_key(activation.id.clone().as_str()), - "id", - activation.id.clone(), - ) - .arg("topic") - .arg(activation.topic.clone()) - .arg("partition") - .arg(activation.partition) - .arg("namespace") - .arg(activation.namespace.clone()); - let result: Vec = pipe.query_async(&mut conn).await?; - if !result.is_empty() && result[0] != 4 { - // The number of fields set must be 4 - return Err(anyhow::anyhow!( - "Failed to set id lookup for key {}", - activation.id.clone() - )); - } - rows_affected += 1; - } - Ok(QueryResult { rows_affected }) - } - - async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let pending_key = self.build_key_with_activation( - KeyPrefix::Pending, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - let newlen: usize = conn - .rpush(pending_key.clone(), activation.id.clone()) - .await?; - if newlen == 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to pending: {}", - pending_key.clone() - )); - } - Ok(()) - } - - async fn add_to_processing(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let processing_key = self.build_key_with_activation( - KeyPrefix::Processing, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - let newlen: usize = conn - .zadd( - processing_key.clone(), - activation.processing_deadline.unwrap().timestamp(), - activation.id.clone(), - ) - .await?; - if newlen == 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to processing: {}", - processing_key.clone() - )); + status: InflightActivationStatus, + ) -> Result<(), RedisActivationError> { + let result = self + .inner + .read() + .await + .set_status(activation_id, status) + .await; + if result.is_err() { + let error_string = result.err().unwrap().to_string(); + println!("error: {:?}", error_string); + return Err(RedisActivationError::DatabaseOperation { + operation: "set_status".to_string(), + error: error_string, + }); } Ok(()) } - async fn add_to_delay(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let delay_key = self.build_key_with_activation( - KeyPrefix::Delay, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - let newlen: usize = conn - .zadd( - delay_key.clone(), - activation.delay_until.unwrap().timestamp(), - activation.id.clone(), - ) - .await?; - if newlen == 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to delay: {}", - delay_key.clone() - )); + pub async fn get_retry_activations( + &self, + ) -> Result, RedisActivationError> { + let result = self.inner.read().await.get_retry_activations().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "get_retry_activations".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - Ok(()) + Ok(result.unwrap()) } - async fn add_to_retry(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let retry_key = self.build_key_with_activation( - KeyPrefix::Retry, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - let newlen: usize = conn.rpush(retry_key.clone(), activation.id.clone()).await?; - if newlen == 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to retry: {}", - retry_key.clone() - )); + pub async fn mark_retry_completed( + &self, + activations: Vec, + ) -> Result { + let result = self + .inner + .read() + .await + .mark_retry_completed(activations) + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "mark_retry_completed".to_string(), + error: result.err().unwrap().to_string(), + }); } - Ok(()) + Ok(result.unwrap()) } - async fn add_to_deadletter(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let deadletter_key = self.build_key_with_activation( - KeyPrefix::Deadletter, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - let newlen: usize = conn - .rpush(deadletter_key.clone(), activation.id.clone()) - .await?; - if newlen == 0 { - return Err(anyhow::anyhow!( - "Failed to add activation to deadletter: {}", - deadletter_key.clone() - )); + pub async fn handle_processing_deadline( + &self, + ) -> Result<(u64, u64, u64), RedisActivationError> { + let result = self.inner.read().await.handle_processing_deadline().await; + if result.is_err() { + let error_string = result.err().unwrap().to_string(); + println!("error: {:?}", error_string); + return Err(RedisActivationError::DatabaseOperation { + operation: "handle_processing_deadline".to_string(), + error: error_string, + }); } - Ok(()) + Ok(result.unwrap()) } - async fn delete_activation(&self, activation: InflightActivation) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let mut pipe = redis::pipe(); - let payload_key = self.get_payload_key( - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - pipe.del(payload_key.clone()); - pipe.del(self.get_id_lookup_key(activation.id.as_str())); - let results: Vec = pipe.query_async(&mut conn).await?; - if results.len() != 2 { - return Err(anyhow::anyhow!( - "Failed to delete activation: incorrect number of commands run: expected 2, got {} for key {}", - results.len(), - payload_key.clone() - )); - } - if results[0] != 1 { - return Err(anyhow::anyhow!( - "Failed to delete payload for key {}", - payload_key.clone() - )); - } - if results[1] != 1 { - return Err(anyhow::anyhow!( - "Failed to delete id lookup for key {}", - activation.id.clone() - )); + pub async fn handle_processing_attempts(&self) -> Result { + let result = self.inner.read().await.handle_processing_attempts().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "handle_processing_attempts".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - Ok(()) + Ok(result.unwrap()) } - // Only used in testing - async fn delete_all_keys(&self) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let keys: Vec = conn.keys("*").await?; - for key in keys { - conn.del(key).await?; + pub async fn handle_expires_at(&self) -> Result { + let result = self.inner.read().await.handle_expires_at().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "handle_expires_at".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - Ok(()) + Ok(result.unwrap()) } - #[instrument(skip_all)] - async fn get_pending_activation( - &self, - namespace: Option<&str>, - ) -> Result, Error> { - let namespaces = namespace.map(|ns| vec![ns.to_string()]); - let result = self - .get_pending_activations_from_namespaces(namespaces.as_deref(), Some(1)) - .await?; - if result.is_empty() { - return Ok(None); + pub async fn handle_delay_until(&self) -> Result { + let result = self.inner.read().await.handle_delay_until().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "handle_delay_until".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - Ok(Some(result[0].clone())) + Ok(result.unwrap()) } - /// Get a pending activation from specified namespaces - /// If namespaces is None, gets from any namespace - /// If namespaces is Some(&[...]), gets from those namespaces - #[instrument(skip_all)] - async fn get_pending_activations_from_namespaces( + pub async fn handle_deadletter_tasks( &self, - namespaces: Option<&[String]>, - limit: Option, - ) -> Result, Error> { - let mut local_idx = 0; - let mut conn = self.pool.get().await?; - let mut activations: Vec = Vec::new(); - for (topic, partitions) in self.topics.iter() { - for partition in partitions.iter() { - for namespace in self.namespaces.iter() { - if namespaces.is_some() && !namespaces.unwrap().contains(namespace) { - continue; - } - for bucket_hash in self.bucket_hashes.iter() { - // Skip the bucket hash if it's before the next key index for pending - if local_idx < self.next_key_idx_for_pending { - local_idx += 1; - continue; - } - - // Get the next pending activation - let pending_key = self.build_key_with_activation( - KeyPrefix::Pending, - namespace.as_str(), - topic.as_str(), - *partition, - bucket_hash.as_str(), - ); - let result = conn.lindex(pending_key.clone(), 0).await?; - if result.is_none() { - continue; - } - let activation_id: String = result.unwrap().to_string(); - - let act_result = self - .get_by_id( - namespace.as_str(), - topic.as_str(), - *partition, - &activation_id, - ) - .await?; - if act_result.is_none() { - continue; - } - let activation = act_result.unwrap(); - - // Push the activation to processing. This will not create two entries for the same activation in the case of duplicates. - let processing_key = self.build_key_with_activation( - KeyPrefix::Processing, - namespace.as_str(), - topic.as_str(), - *partition, - bucket_hash.as_str(), - ); - let processing_deadline = match activation.processing_deadline { - None => { - Utc::now() + Duration::seconds(self.processing_deadline_grace_sec) - } - Some(apd) => apd, - } - .timestamp_millis(); - let result: usize = conn - .zadd( - processing_key.clone(), - activation.id.clone(), - processing_deadline, - ) - .await?; - if result == 0 { - return Err(anyhow::anyhow!( - "Failed to move activation to processing: {} {}", - processing_key, - activation_id - )); - } - - let result: usize = conn - .lrem(pending_key.clone(), 1, activation_id.clone()) - .await?; - if result == 0 { - info!( - "Attempted to lrem an activation from pending queue, but it was not found: {} {}", - pending_key, activation_id - ); - metrics::counter!("inflight_redis_activation_store_lrem_not_found") - .increment(1); - } - activations.push(activation); - if activations.len() >= limit.unwrap() as usize { - return Ok(activations); - } - } - } - } - } - Ok(activations) - } - - fn incr_next_key_idx_for_pending(&mut self) { - self.next_key_idx_for_pending += 1; - if self.next_key_idx_for_pending >= self.total_possible_keys { - self.next_key_idx_for_pending = 0; + ) -> Result)>, RedisActivationError> { + let result = self.inner.read().await.handle_deadletter_tasks().await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "handle_deadletter_tasks".to_string(), + error: (result.err().unwrap()).to_string(), + }); } + Ok(result.unwrap()) } - /// Get an activation by id. Primarily used for testing - async fn get_by_id( + pub async fn mark_deadletter_completed( &self, - namespace: &str, - topic: &str, - partition: i32, - activation_id: &str, - ) -> Result, Error> { - let mut conn = self.pool.get().await?; - let payload_key = self.get_payload_key(namespace, topic, partition, activation_id); - let result: HashMap = conn.hgetall(payload_key.clone()).await?; - if result.is_empty() { - return Ok(None); + ids: Vec, + ) -> Result { + let result = self.inner.read().await.mark_deadletter_completed(ids).await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "mark_deadletter_completed".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - let activation: InflightActivation = result.into(); - Ok(Some(activation)) + Ok(result.unwrap()) } - async fn get_by_id_lookup( + pub async fn remove_killswitched( &self, - activation_id: &str, - ) -> Result, Error> { - let mut conn = self.pool.get().await?; - let result: HashMap = - conn.hgetall(self.get_id_lookup_key(activation_id)).await?; - if result.is_empty() { - return Ok(None); + killswitched_tasks: Vec, + ) -> Result { + let result = self + .inner + .read() + .await + .remove_killswitched(killswitched_tasks) + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "remove_killswitched".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - - let namespace: String = result.get("namespace").unwrap().to_string(); - let topic: String = result.get("topic").unwrap().to_string(); - let partition: i32 = result.get("partition").unwrap().parse().unwrap(); - let activation = self - .get_by_id(namespace.as_str(), topic.as_str(), partition, activation_id) - .await?; - Ok(activation) + Ok(result.unwrap()) } - async fn set_status( + pub async fn mark_demoted_completed( &self, - activation_id: &str, - status: InflightActivationStatus, - ) -> Result<(), Error> { - // If the activation is not found, return a no-op - let activation = self.get_by_id_lookup(activation_id).await?; - if activation.is_none() { - info!( - "Activation not found for id: {}, skipping status update", - activation_id - ); - return Ok(()); - } - let activation = activation.unwrap(); - let mut conn = self.pool.get().await?; - let mut pipe = redis::pipe(); - pipe.atomic(); - let mut has_failure = false; - if status == InflightActivationStatus::Retry { - has_failure = true; - pipe.rpush( - self.build_key_with_activation( - KeyPrefix::Retry, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ), - activation_id, - ); - } else if status == InflightActivationStatus::Failure - && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter - { - has_failure = true; - pipe.rpush( - self.build_key_with_activation( - KeyPrefix::Deadletter, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ), - activation_id, - ); - } - let processing_key = self.build_key_with_activation( - KeyPrefix::Processing, - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - pipe.zrem(processing_key, activation_id); - - let payload_key = self.get_payload_key( - activation.namespace.as_str(), - activation.topic.as_str(), - activation.partition, - activation.id.as_str(), - ); - pipe.del(payload_key); - - let results: Vec = pipe.query_async(&mut *conn).await?; - let expected_commands = if has_failure { 3 } else { 2 }; - if results.len() != expected_commands { - return Err(anyhow::anyhow!( - "Failed to set status: incorrect number of commands run: expected {}, got {} for key {}", - expected_commands, - results.len(), - activation_id - )); - } - - // Track the number of commands that were successful - let mut processing_removed = 0; - let mut payload_deleted = 0; - if has_failure { - if results[0] != 1 { - return Err(anyhow::anyhow!( - "Failed to add activation to retry/deadletter queue: {}", - activation_id - )); - } - processing_removed = results[1]; - payload_deleted = results[2]; - } else { - processing_removed = results[0]; - payload_deleted = results[1]; - } - - if processing_removed != 1 { - // Removing from processing set - return Err(anyhow::anyhow!( - "Failed to remove activation from processing set: {}", - activation_id - )); - } - if payload_deleted != 1 { - // Deleting payload - return Err(anyhow::anyhow!( - "Failed to delete payload: {}", - activation_id - )); - } - - // Delete this outside the transaction since it isn't in the same hash slot as the other keys and thus can't be part of the transaction. - let result = conn.del(self.get_id_lookup_key(activation_id)).await?; - if result != 1 { - return Err(anyhow::anyhow!( - "Failed to delete id lookup: {}", - activation_id - )); + ids: Vec, + ) -> Result { + let result = self.inner.read().await.mark_demoted_completed(ids).await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "mark_demoted_completed".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - Ok(()) - } - - pub async fn move_delay_to_pending(&self) -> Result<(), Error> { - Ok(()) + Ok(result.unwrap()) } - pub async fn get_processing_deadline_exceeded_activations( + pub async fn pending_activation_max_lag( &self, - ) -> Result, Error> { - return Ok(vec![]); - } - - pub async fn get_processing_attempts_for_activation(&self, id: &str) -> Result { - return Ok(0); - } - - pub async fn retry_activation_locally(&self, id: &str) -> Result<(), Error> { - // Increment processing attempts by 1 and push back to pending in transaction - return Ok(()); - } - - pub async fn remove_from_processing(&self, id: &str) -> Result<(), Error> { - // Remove from processing in transaction - return Ok(()); - } - - pub async fn remove_from_pending(&self, id: &str) -> Result<(), Error> { - // Remove from pending in transaction - return Ok(()); - } - - pub async fn remove_from_delay(&self, id: &str) -> Result<(), Error> { - // Remove from delay in transaction - return Ok(()); - } - - pub async fn get_retry_activations(&self) -> Result, Error> { - return Ok(vec![]); - } - - pub async fn get_deadletter_activations(&self) -> Result, Error> { - return Ok(vec![]); - } - - pub async fn get_expired_activations(&self) -> Result, Error> { - return Ok(vec![]); - } - - #[instrument(skip_all)] - pub async fn count_pending_activations(&self) -> Result { - let mut conn = self.pool.get().await?; - let mut total_count = 0; - for (topic, partitions) in self.topics.iter() { - for partition in partitions.iter() { - for namespace in self.namespaces.iter() { - for bucket_hash in self.bucket_hashes.iter() { - let pending_key = self.build_key( - KeyPrefix::Pending, - namespace.as_str(), - topic.as_str(), - *partition, - bucket_hash.as_str(), - ); - println!( - "counting pending activations for bucket hash: {:?}", - pending_key - ); - let count: usize = conn.llen(pending_key).await?; - total_count += count; - } - } - } - } - return Ok(total_count); - } - - #[instrument(skip_all)] - pub async fn count_delayed_activations(&self) -> Result { - let mut conn = self.pool.get().await?; - let mut total_count = 0; - for (topic, partitions) in self.topics.iter() { - for partition in partitions.iter() { - for namespace in self.namespaces.iter() { - for bucket_hash in self.bucket_hashes.iter() { - let delay_key = self.build_key( - KeyPrefix::Delay, - namespace.as_str(), - topic.as_str(), - *partition, - bucket_hash.as_str(), - ); - let count: usize = conn.zcard(delay_key.clone()).await?; - total_count += count; - } - } - } - } - return Ok(total_count); - } - - #[instrument(skip_all)] - pub async fn count_processing_activations(&self) -> Result { - let mut conn = self.pool.get().await?; - let mut total_count = 0; - for (topic, partitions) in self.topics.iter() { - for partition in partitions.iter() { - for namespace in self.namespaces.iter() { - for bucket_hash in self.bucket_hashes.iter() { - let processing_key = self.build_key( - KeyPrefix::Processing, - namespace.as_str(), - topic.as_str(), - *partition, - bucket_hash.as_str(), - ); - let count: usize = conn.zcard(processing_key.clone()).await?; - total_count += count; - } - } - } + now: &DateTime, + ) -> Result { + let result = self + .inner + .read() + .await + .pending_activation_max_lag(now) + .await; + if result.is_err() { + return Err(RedisActivationError::DatabaseOperation { + operation: "pending_activation_max_lag".to_string(), + error: (result.err().unwrap()).to_string(), + }); } - return Ok(total_count); - } - - async fn db_size(&self) -> Result { - return Ok(0); + Ok(result.unwrap()) } } diff --git a/src/store/mod.rs b/src/store/mod.rs index e4be3353..5897963e 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -2,3 +2,5 @@ pub mod inflight_activation; #[cfg(test)] pub mod inflight_activation_tests; pub mod inflight_redis_activation; +mod inner_redis_activation_store; +pub mod redis_utils; diff --git a/src/upkeep.rs b/src/upkeep.rs index 898522f8..bdc6708c 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -1,3 +1,5 @@ +use crate::store::inflight_activation::InflightActivation; +use crate::store::inflight_redis_activation::RedisActivationStore; use chrono::{DateTime, Timelike, Utc}; use futures::{StreamExt, stream::FuturesUnordered}; use prost::Message; @@ -29,7 +31,7 @@ use crate::{ /// on the inflight store pub async fn upkeep( config: Arc, - store: Arc, + store: Arc, startup_time: DateTime, runtime_config_manager: Arc, health_reporter: HealthReporter, @@ -110,7 +112,7 @@ impl UpkeepResults { )] pub async fn do_upkeep( config: Arc, - store: Arc, + store: Arc, producer: Arc, startup_time: DateTime, runtime_config_manager: Arc, @@ -147,7 +149,10 @@ pub async fn do_upkeep( async move { let activation = TaskActivation::decode(&inflight.activation as &[u8]).unwrap(); - let serialized = create_retry_activation(&activation).encode_to_vec(); + let act = create_retry_activation(&activation); + println!("act: {:?}", act.retry_state.as_ref().unwrap()); + let serialized = act.encode_to_vec(); + // let serialized = create_retry_activation(&activation).encode_to_vec(); let delivery = producer .send( FutureRecord::<(), Vec>::to(&config.kafka_topic) @@ -156,29 +161,41 @@ pub async fn do_upkeep( ) .await; match delivery { - Ok(_) => Ok(inflight.id), + Ok(_) => Ok(inflight), Err((err, _msg)) => Err(err), } } }) .collect::>(); - let ids = deliveries + let to_remove: Vec = deliveries .collect::>() .await .into_iter() - .filter_map(|result: Result| match result { - Ok(id) => Some(id), - Err(err) => { - error!("retry.publish.failure {}", err); - None - } - }) + .filter_map( + |result: Result| match result { + Ok(inflight) => Some(inflight), + Err(err) => { + println!("retry.publish.failure {:?}", err); + error!("retry.publish.failure {}", err); + None + } + }, + ) .collect(); + println!("to_remove: {:?}", to_remove.len()); // 3. Update retry tasks to complete - if let Ok(retried_count) = store.mark_completed(ids).await { - result_context.retried = retried_count; + match store.mark_retry_completed(to_remove).await { + Ok(retried_count) => { + println!("retried_count: {:?}", retried_count); + result_context.retried = retried_count; + } + Err(err) => { + println!("failed to mark retry completed: {:?}", err); + error!("failed to mark retry completed: {:?}", err); + result_context.retried = 0; + } } } metrics::histogram!("upkeep.handle_retries").record(handle_retries_start.elapsed()); @@ -187,8 +204,15 @@ pub async fn do_upkeep( let seconds_since_startup = (current_time - startup_time).num_seconds() as u64; if seconds_since_startup > config.upkeep_deadline_reset_skip_after_startup_sec { let handle_processing_deadline_start = Instant::now(); - if let Ok(processing_deadline_reset) = store.handle_processing_deadline().await { + if let Ok(( + processing_deadline_reset, + discarded_count, + processing_attempts_exceeded_count, + )) = store.handle_processing_deadline().await + { result_context.processing_deadline_reset = processing_deadline_reset; + result_context.discarded = discarded_count; + result_context.processing_attempts_exceeded = processing_attempts_exceeded_count; } metrics::histogram!("upkeep.handle_processing_deadline") .record(handle_processing_deadline_start.elapsed()); @@ -199,7 +223,7 @@ pub async fn do_upkeep( // 5. Handle processing attempts exceeded let handle_processing_attempts_exceeded_start = Instant::now(); if let Ok(processing_attempts_exceeded) = store.handle_processing_attempts().await { - result_context.processing_attempts_exceeded = processing_attempts_exceeded; + result_context.processing_attempts_exceeded += processing_attempts_exceeded; } metrics::histogram!("upkeep.handle_processing_attempts_exceeded") .record(handle_processing_attempts_exceeded_start.elapsed()); @@ -220,13 +244,8 @@ pub async fn do_upkeep( // 8. Handle failure state tasks let handle_failed_tasks_start = Instant::now(); - if let Ok(failed_tasks_forwarder) = store.handle_failed_tasks().await { - result_context.discarded = failed_tasks_forwarder.to_discard.len() as u64; - result_context.failed = - result_context.discarded + failed_tasks_forwarder.to_deadletter.len() as u64; - - let deadletters = failed_tasks_forwarder - .to_deadletter + if let Ok(deadletter_tasks) = store.handle_deadletter_tasks().await { + let deadletters = deadletter_tasks .into_iter() .map(|(id, activation_data)| { let producer = producer.clone(); @@ -257,7 +276,7 @@ pub async fn do_upkeep( let ids = deadletters.collect::>().await.into_iter().collect(); // 9. Update deadlettered tasks to complete - if let Ok(deadletter_count) = store.mark_completed(ids).await { + if let Ok(deadletter_count) = store.mark_deadletter_completed(ids).await { result_context.deadlettered = deadletter_count; } } @@ -265,9 +284,9 @@ pub async fn do_upkeep( // 10. Cleanup completed tasks let remove_completed_start = Instant::now(); - if let Ok(count) = store.remove_completed().await { - result_context.completed = count; - } + // if let Ok(count) = store.remove_completed().await { + // result_context.completed = count; + // } metrics::histogram!("upkeep.remove_completed").record(remove_completed_start.elapsed()); // 11. Remove killswitched tasks from store @@ -343,7 +362,7 @@ pub async fn do_upkeep( .filter_map(Result::ok) .collect::>(); - if let Ok(forwarded_count) = store.mark_completed(ids).await { + if let Ok(forwarded_count) = store.mark_demoted_completed(ids).await { result_context.forwarded = forwarded_count; } } @@ -351,32 +370,30 @@ pub async fn do_upkeep( .record(forward_demoted_start.elapsed()); } - // 13. Vacuum the database - if config.full_vacuum_on_upkeep - && last_vacuum.elapsed() > Duration::from_millis(config.vacuum_interval_ms) - { - let vacuum_start = Instant::now(); - match store.full_vacuum_db().await { - Ok(_) => { - *last_vacuum = Instant::now(); - metrics::histogram!("upkeep.full_vacuum").record(vacuum_start.elapsed()); - } - Err(err) => { - error!("failed to vacuum the database: {:?}", err); - metrics::counter!("upkeep.full_vacuum.failure", "error" => err.to_string()) - .increment(1); - } - } - } + // // 13. Vacuum the database + // if config.full_vacuum_on_upkeep + // && last_vacuum.elapsed() > Duration::from_millis(config.vacuum_interval_ms) + // { + // let vacuum_start = Instant::now(); + // match store.full_vacuum_db().await { + // Ok(_) => { + // *last_vacuum = Instant::now(); + // metrics::histogram!("upkeep.full_vacuum").record(vacuum_start.elapsed()); + // } + // Err(err) => { + // error!("failed to vacuum the database: {:?}", err); + // metrics::counter!("upkeep.full_vacuum.failure", "error" => err.to_string()) + // .increment(1); + // } + // } + // } let now = Utc::now(); - let (pending_count, processing_count, delay_count, max_lag, db_file_meta, wal_file_meta) = join!( - store.count_by_status(InflightActivationStatus::Pending), - store.count_by_status(InflightActivationStatus::Processing), - store.count_by_status(InflightActivationStatus::Delay), + let (pending_count, processing_count, delay_count, max_lag) = join!( + store.count_pending_activations(), + store.count_processing_activations(), + store.count_delayed_activations(), store.pending_activation_max_lag(&now), - fs::metadata(config.db_path.clone()), - fs::metadata(config.db_path.clone() + "-wal") ); if let Ok(pending_count) = pending_count { @@ -436,14 +453,14 @@ pub async fn do_upkeep( metrics::gauge!("upkeep.current_pending_tasks").set(result_context.pending); metrics::gauge!("upkeep.current_processing_tasks").set(result_context.processing); metrics::gauge!("upkeep.current_delayed_tasks").set(result_context.delay); - metrics::gauge!("upkeep.pending_activation.max_lag.sec").set(max_lag); + metrics::gauge!("upkeep.pending_activation.max_lag.sec").set(max_lag.unwrap_or(0) as f64); - if let Ok(db_file_meta) = db_file_meta { - metrics::gauge!("upkeep.db_file_size.bytes").set(db_file_meta.len() as f64); - } - if let Ok(wal_file_meta) = wal_file_meta { - metrics::gauge!("upkeep.wal_file_size.bytes").set(wal_file_meta.len() as f64); - } + // if let Ok(db_file_meta) = db_file_meta { + // metrics::gauge!("upkeep.db_file_size.bytes").set(db_file_meta.len() as f64); + // } + // if let Ok(wal_file_meta) = wal_file_meta { + // metrics::gauge!("upkeep.wal_file_size.bytes").set(wal_file_meta.len() as f64); + // } result_context } @@ -507,6 +524,7 @@ pub async fn check_health( #[cfg(test)] mod tests { + use crate::store::redis_utils::HashKey; use chrono::{DateTime, TimeDelta, TimeZone, Utc}; use prost::Message; use prost_types::Timestamp; @@ -523,20 +541,21 @@ mod tests { store::inflight_activation::{ InflightActivationStatus, InflightActivationStore, InflightActivationStoreConfig, }, + store::inflight_redis_activation::{RedisActivationStore, RedisActivationStoreConfig}, test_utils::{ StatusCount, assert_counts, consume_topic, create_config, create_integration_config, - create_producer, generate_temp_filename, make_activations, replace_retry_state, - reset_topic, + create_producer, generate_temp_filename, generate_temp_redis_urls, make_activations, + replace_retry_state, reset_topic, }, upkeep::{create_retry_activation, do_upkeep}, }; - async fn create_inflight_store() -> Arc { - let url = generate_temp_filename(); + async fn create_inflight_store() -> Arc { + let url = generate_temp_redis_urls(); let config = create_integration_config(); Arc::new( - InflightActivationStore::new(&url, InflightActivationStoreConfig::from_config(&config)) + RedisActivationStore::new(url, RedisActivationStoreConfig::from_config(&config)) .await .unwrap(), ) @@ -632,12 +651,14 @@ mod tests { let start_time = Utc::now(); let mut last_vacuum = Instant::now(); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); - let mut records = make_activations(2); + let mut record = make_activations(1).remove(0); let old = Utc.with_ymd_and_hms(2024, 12, 1, 0, 0, 0).unwrap(); + // TODO: We probably need a way for test to create tasks in specific states. replace_retry_state( - &mut records[0], + &mut record, Some(RetryState { attempts: 1, max_attempts: 2, @@ -646,24 +667,38 @@ mod tests { delay_on_retry: None, }), ); - let mut activation = TaskActivation::decode(&records[0].activation as &[u8]).unwrap(); + let mut activation = TaskActivation::decode(&record.activation as &[u8]).unwrap(); activation.received_at = Some(Timestamp { seconds: old.timestamp(), nanos: 0, }); - records[0].received_at = DateTime::from_timestamp( + record.received_at = DateTime::from_timestamp( activation.received_at.unwrap().seconds, activation.received_at.unwrap().nanos as u32, ) .expect(""); activation.parameters = r#"{"a":"b"}"#.into(); activation.delay = Some(30); - records[0].status = InflightActivationStatus::Retry; - records[0].delay_until = Some(Utc::now() + Duration::from_secs(30)); - records[0].activation = activation.encode_to_vec(); + record.status = InflightActivationStatus::Retry; + record.activation = activation.encode_to_vec(); + + assert!(store.store(vec![record.clone()]).await.is_ok()); + println!("stored records"); + let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing + assert!(activation.is_some()); + println!("moved to processing {}", activation.clone().unwrap().id); + assert!( + store + .set_status( + activation.clone().unwrap().id.as_str(), + InflightActivationStatus::Retry + ) + .await + .is_ok() + ); // Move to retry + println!("moved to retry {}", activation.unwrap().id); - records[1].added_at += Duration::from_secs(1); - assert!(store.store(records.clone()).await.is_ok()); + assert_eq!(store.count_retry_activations().await.unwrap(), 1); let result_context = do_upkeep( config.clone(), @@ -675,8 +710,7 @@ mod tests { ) .await; - // Only 1 record left as the retry task should be appended as a new task - assert_eq!(store.count().await.unwrap(), 1); + assert_eq!(store.count_retry_activations().await.unwrap(), 0); assert_eq!(result_context.retried, 1); let messages = consume_topic(config.clone(), config.kafka_topic.as_ref(), 1).await; @@ -684,13 +718,184 @@ mod tests { let activation = &messages[0]; // Should spawn a new task - let activation_to_check = TaskActivation::decode(&records[0].activation as &[u8]).unwrap(); + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); assert_ne!(activation.id, activation_to_check.id); // Should increment the attempt counter assert_eq!(activation.retry_state.as_ref().unwrap().attempts, 2); // Retry should retain task and parameters of original task - let activation_to_check = TaskActivation::decode(&records[0].activation as &[u8]).unwrap(); + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + assert_eq!(activation.taskname, activation_to_check.taskname); + assert_eq!(activation.namespace, activation_to_check.namespace); + assert_eq!(activation.parameters, activation_to_check.parameters); + // received_at should be set be later than the original activation + assert!( + activation.received_at.unwrap().seconds + > activation_to_check.received_at.unwrap().seconds, + "retry activation should have a later timestamp" + ); + // The delay_until of a retry task should be set to None + assert!(activation.delay.is_none()); + } + + #[tokio::test] + async fn test_delayed_retry_activation_is_appended_to_kafka_without_delay() { + let config = create_integration_config(); + let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); + reset_topic(config.clone()).await; + + let start_time = Utc::now(); + let mut last_vacuum = Instant::now(); + let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); + let producer = create_producer(config.clone()); + let mut record = make_activations(1).remove(0); + + let old = Utc.with_ymd_and_hms(2024, 12, 1, 0, 0, 0).unwrap(); + // TODO: We probably need a way for test to create tasks in specific states. + replace_retry_state( + &mut record, + Some(RetryState { + attempts: 1, + max_attempts: 2, + on_attempts_exceeded: OnAttemptsExceeded::Discard as i32, + at_most_once: None, + delay_on_retry: None, + }), + ); + let mut activation = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + activation.received_at = Some(Timestamp { + seconds: old.timestamp(), + nanos: 0, + }); + record.received_at = DateTime::from_timestamp( + activation.received_at.unwrap().seconds, + activation.received_at.unwrap().nanos as u32, + ) + .expect(""); + activation.parameters = r#"{"a":"b"}"#.into(); + activation.delay = Some(30); + record.status = InflightActivationStatus::Retry; + record.activation = activation.encode_to_vec(); + record.delay_until = Some(Utc::now() - Duration::from_secs(30)); + + assert!(store.store(vec![record.clone()]).await.is_ok()); + println!("stored records"); + + // Move from delay to pending + let result_context = do_upkeep( + config.clone(), + store.clone(), + producer, + start_time, + runtime_config.clone(), + &mut last_vacuum, + ) + .await; + + let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing + assert!(activation.is_some()); + println!("moved to processing {}", activation.clone().unwrap().id); + assert!( + store + .set_status( + activation.clone().unwrap().id.as_str(), + InflightActivationStatus::Retry + ) + .await + .is_ok() + ); // Move to retry + println!("moved to retry {}", activation.unwrap().id); + + assert_eq!(store.count_retry_activations().await.unwrap(), 0); + assert_eq!(result_context.retried, 1); + + let messages = consume_topic(config.clone(), config.kafka_topic.as_ref(), 1).await; + assert_eq!(messages.len(), 1); + let activation = &messages[0]; + + // Should spawn a new task + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + assert_ne!(activation.id, activation_to_check.id); + // Should increment the attempt counter + assert_eq!(activation.retry_state.as_ref().unwrap().attempts, 2); + + // Retry should retain task and parameters of original task + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + assert_eq!(activation.taskname, activation_to_check.taskname); + assert_eq!(activation.namespace, activation_to_check.namespace); + assert_eq!(activation.parameters, activation_to_check.parameters); + // received_at should be set be later than the original activation + assert!( + activation.received_at.unwrap().seconds + > activation_to_check.received_at.unwrap().seconds, + "retry activation should have a later timestamp" + ); + // The delay_until of a retry task should be set to None + assert!(activation.delay.is_none()); + } + + #[tokio::test] + async fn test_retry_activation_without_retry_is_not_appended_to_kafka() { + let config = create_integration_config(); + let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); + reset_topic(config.clone()).await; + + let start_time = Utc::now(); + let mut last_vacuum = Instant::now(); + let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); + let producer = create_producer(config.clone()); + let mut record = make_activations(1).remove(0); + + let mut activation = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + activation.parameters = r#"{"a":"b"}"#.into(); + record.status = InflightActivationStatus::Retry; + record.activation = activation.encode_to_vec(); + + assert!(store.store(vec![record.clone()]).await.is_ok()); + println!("stored records"); + let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing + assert!(activation.is_some()); + println!("moved to processing {}", activation.clone().unwrap().id); + assert!( + store + .set_status( + activation.clone().unwrap().id.as_str(), + InflightActivationStatus::Retry + ) + .await + .is_ok() + ); // Move to retry + println!("moved to retry {}", activation.unwrap().id); + + assert_eq!(store.count_retry_activations().await.unwrap(), 1); + + let result_context = do_upkeep( + config.clone(), + store.clone(), + producer, + start_time, + runtime_config.clone(), + &mut last_vacuum, + ) + .await; + + assert_eq!(store.count_retry_activations().await.unwrap(), 0); + assert_eq!(result_context.retried, 0); + + let messages = consume_topic(config.clone(), config.kafka_topic.as_ref(), 1).await; + assert_eq!(messages.len(), 0); + let activation = &messages[0]; + + // Should spawn a new task + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); + assert_ne!(activation.id, activation_to_check.id); + // Should increment the attempt counter + assert_eq!(activation.retry_state.as_ref().unwrap().attempts, 2); + + // Retry should retain task and parameters of original task + let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); assert_eq!(activation.taskname, activation_to_check.taskname); assert_eq!(activation.namespace, activation_to_check.namespace); assert_eq!(activation.parameters, activation_to_check.parameters); @@ -709,15 +914,19 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); let start_time = Utc::now() - Duration::from_secs(90); let mut last_vacuum = Instant::now(); - let mut batch = make_activations(2); + let mut batch = make_activations(1); // Make a task with a future processing deadline - batch[1].status = InflightActivationStatus::Processing; - batch[1].processing_deadline = Some(Utc::now() + TimeDelta::minutes(5)); + batch[0].status = InflightActivationStatus::Processing; + batch[0].processing_deadline = Some(Utc::now() + TimeDelta::minutes(5)); assert!(store.store(batch.clone()).await.is_ok()); + assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing + + assert_eq!(store.count_processing_activations().await.unwrap(), 1); let _ = do_upkeep( config, @@ -729,13 +938,10 @@ mod tests { ) .await; - // Should retain the processing record assert_eq!( - store - .count_by_status(InflightActivationStatus::Processing) - .await - .unwrap(), - 1 + store.count_processing_activations().await.unwrap(), + 1, + "record should remain in processing" ); } @@ -744,23 +950,19 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); - let mut batch = make_activations(2); + let mut batch = make_activations(1); // Make a task past with a processing deadline in the past - batch[1].status = InflightActivationStatus::Processing; - batch[1].processing_deadline = + batch[0].status = InflightActivationStatus::Processing; + batch[0].processing_deadline = Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()); assert!(store.store(batch.clone()).await.is_ok()); + assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing // Should start off with one in processing - assert_eq!( - store - .count_by_status(InflightActivationStatus::Processing) - .await - .unwrap(), - 1 - ); + assert_eq!(store.count_processing_activations().await.unwrap(), 1); // Simulate upkeep running in the first minute let start_time = Utc::now() - Duration::from_secs(50); @@ -778,15 +980,7 @@ mod tests { .await; // No changes - assert_counts( - StatusCount { - pending: 1, - processing: 1, - ..StatusCount::default() - }, - &store, - ) - .await; + assert_eq!(store.count_processing_activations().await.unwrap(), 1); assert_eq!(result_context.processing_deadline_reset, 0); } @@ -795,24 +989,23 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); let start_time = Utc::now() - Duration::from_secs(90); let mut last_vacuum = Instant::now(); - let mut batch = make_activations(2); + let mut batch = make_activations(1); // Make a task past with a processing deadline in the past - batch[1].status = InflightActivationStatus::Processing; - batch[1].processing_deadline = + batch[0].status = InflightActivationStatus::Processing; + batch[0].processing_deadline = Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()); assert!(store.store(batch.clone()).await.is_ok()); + assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing - // Should start off with one in processing assert_eq!( - store - .count_by_status(InflightActivationStatus::Processing) - .await - .unwrap(), - 1 + store.count_processing_activations().await.unwrap(), + 1, + "Should be one in processing" ); let result_context = do_upkeep( @@ -825,17 +1018,13 @@ mod tests { ) .await; - // 0 processing, 2 pending now assert_eq!(result_context.processing_deadline_reset, 1); - assert_counts( - StatusCount { - processing: 0, - pending: 2, - ..StatusCount::default() - }, - &store, - ) - .await; + assert_eq!(store.count_processing_activations().await.unwrap(), 0); + assert_eq!( + store.count_pending_activations().await.unwrap(), + 1, + "Should be one in pending" + ); } #[tokio::test] @@ -843,14 +1032,15 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); let start_time = Utc::now() - Duration::from_secs(90); let mut last_vacuum = Instant::now(); - let mut batch = make_activations(2); + let mut batch = make_activations(1); // Make a task past with a processing deadline in the past replace_retry_state( - &mut batch[1], + &mut batch[0], Some(RetryState { attempts: 0, max_attempts: 1, @@ -859,11 +1049,12 @@ mod tests { delay_on_retry: None, }), ); - batch[1].status = InflightActivationStatus::Processing; - batch[1].processing_deadline = + batch[0].status = InflightActivationStatus::Processing; + batch[0].processing_deadline = Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()); - batch[1].at_most_once = true; + batch[0].at_most_once = true; assert!(store.store(batch.clone()).await.is_ok()); + assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing let result_context = do_upkeep( config, @@ -877,15 +1068,8 @@ mod tests { // 0 processing, 1 pending, 1 discarded assert_eq!(result_context.discarded, 1); - assert_counts( - StatusCount { - processing: 0, - pending: 1, - ..StatusCount::default() - }, - &store, - ) - .await; + assert_eq!(store.count_processing_activations().await.unwrap(), 0); + assert_eq!(store.count_pending_activations().await.unwrap(), 0); } #[tokio::test] @@ -893,21 +1077,20 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); - let start_time = Utc::now(); + let start_time = Utc::now() - Duration::from_secs(90); let mut last_vacuum = Instant::now(); - let mut batch = make_activations(3); + let mut batch = make_activations(1); // Because 1 is complete and has a higher offset than 0, index 2 can be discarded batch[0].processing_attempts = config.max_processing_attempts as i32; - - batch[1].status = InflightActivationStatus::Complete; - batch[1].added_at += Duration::from_secs(1); - - batch[2].processing_attempts = config.max_processing_attempts as i32; - batch[2].added_at += Duration::from_secs(2); + batch[0].processing_deadline = + Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()); assert!(store.store(batch.clone()).await.is_ok()); + assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing + let result_context = do_upkeep( config, store.clone(), @@ -918,28 +1101,14 @@ mod tests { ) .await; - assert_eq!(result_context.processing_attempts_exceeded, 2); // batch[0] and batch[2] are removed due to max processing_attempts exceeded - assert_eq!(result_context.discarded, 2); // batch[0] and batch[2] are discarded - assert_eq!(result_context.completed, 3); // all three are removed as completed - assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), - 0, - "zero pending task should remain" - ); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Complete) - .await - .unwrap(), - 0, - "complete tasks were removed" - ); + assert_eq!(result_context.processing_attempts_exceeded, 1); + assert_eq!(result_context.discarded, 1); + assert_eq!(store.count_processing_activations().await.unwrap(), 0); + assert_eq!(store.count_pending_activations().await.unwrap(), 0); } #[tokio::test] + #[ignore = "This state can't really happen anymore, failed tasks are immediately handled"] async fn test_remove_at_remove_failed_publish_to_kafka() { let config = create_integration_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); @@ -976,7 +1145,7 @@ mod tests { // Only 1 record left as the failure task should be appended to dlq assert_eq!(result_context.deadlettered, 1); - assert_eq!(store.count().await.unwrap(), 1); + assert_eq!(store.count_deadletter_activations().await.unwrap(), 1); let messages = consume_topic(config.clone(), config.kafka_deadletter_topic.as_ref(), 1).await; @@ -991,6 +1160,7 @@ mod tests { } #[tokio::test] + #[ignore = "This state can't really happen anymore, failed tasks are immediately handled"] async fn test_remove_failed_discard() { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); @@ -1017,15 +1187,12 @@ mod tests { assert_eq!(result_context.discarded, 1); assert_eq!(result_context.completed, 1); assert_eq!( - store.count().await.unwrap(), + store.count_deadletter_activations().await.unwrap(), 1, "failed task should be removed" ); assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), + store.count_pending_activations().await.unwrap(), 1, "pending task should remain" ); @@ -1036,19 +1203,17 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); let start_time = Utc::now(); let mut last_vacuum = Instant::now(); - let mut batch = make_activations(4); + let mut batch = make_activations(3); batch[0].expires_at = Some(Utc::now() - Duration::from_secs(100)); - batch[1].status = InflightActivationStatus::Complete; - batch[2].expires_at = Some(Utc::now() - Duration::from_secs(100)); - - // Ensure the fourth task is in the future - batch[3].expires_at = Some(Utc::now() + Duration::from_secs(100)); - batch[3].added_at += Duration::from_secs(1); + batch[1].expires_at = Some(Utc::now() - Duration::from_secs(100)); + batch[2].expires_at = Some(Utc::now() + Duration::from_secs(100)); + batch[2].added_at += Duration::from_secs(1); assert!(store.store(batch.clone()).await.is_ok()); let result_context = do_upkeep( @@ -1062,39 +1227,50 @@ mod tests { .await; assert_eq!(result_context.expired, 2); // 0/2 removed as expired - assert_eq!(result_context.completed, 1); // 1 complete assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), + store.count_pending_activations().await.unwrap(), 1, "one pending task should remain" ); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Complete) - .await - .unwrap(), - 0, - "complete tasks were removed" - ); + let hash1 = HashKey::new( + batch[0].namespace.clone(), + batch[0].topic.clone(), + batch[0].partition, + ); assert!( - store.get_by_id(&batch[0].id).await.unwrap().is_none(), + store + .get_by_id(hash1, &batch[0].id) + .await + .unwrap() + .is_none(), "first task should be removed" ); + let hash2 = HashKey::new( + batch[1].namespace.clone(), + batch[1].topic.clone(), + batch[1].partition, + ); assert!( - store.get_by_id(&batch[1].id).await.unwrap().is_none(), + store + .get_by_id(hash2, &batch[1].id) + .await + .unwrap() + .is_none(), "second task should be removed" ); - assert!( - store.get_by_id(&batch[2].id).await.unwrap().is_none(), - "third task should be removed" + let hash3 = HashKey::new( + batch[2].namespace.clone(), + batch[2].topic.clone(), + batch[2].partition, ); assert!( - store.get_by_id(&batch[3].id).await.unwrap().is_some(), - "fourth task should be kept" + store + .get_by_id(hash3, &batch[2].id) + .await + .unwrap() + .is_some(), + "third task should be kept" ); } @@ -1116,20 +1292,8 @@ mod tests { batch[1].delay_until = Some(Utc::now() + Duration::from_secs(1)); assert!(store.store(batch.clone()).await.is_ok()); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Delay) - .await - .unwrap(), - 2 - ); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), - 0 - ); + assert_eq!(store.count_delayed_activations().await.unwrap(), 2); + assert_eq!(store.count_pending_activations().await.unwrap(), 0); let result_context = do_upkeep( config.clone(), store.clone(), @@ -1140,13 +1304,7 @@ mod tests { ) .await; assert_eq!(result_context.delay_elapsed, 1); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), - 1 - ); + assert_eq!(store.count_pending_activations().await.unwrap(), 1); assert_eq!( store .get_pending_activation(None) @@ -1169,13 +1327,7 @@ mod tests { ) .await; assert_eq!(result_context.delay_elapsed, 1); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), - 1 - ); + assert_eq!(store.count_pending_activations().await.unwrap(), 1); assert_eq!( store .get_pending_activation(None) @@ -1224,16 +1376,13 @@ demoted_namespaces: assert_eq!(result_context.forwarded, 2); assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), + store.count_pending_activations().await.unwrap(), 4, "four tasks should be pending" ); assert_eq!( store - .count_by_status(InflightActivationStatus::Complete) + .count_processing_activations() // I don't think this is the correct function .await .unwrap(), 2, @@ -1279,13 +1428,7 @@ demoted_namespaces: .await; assert_eq!(result_context.killswitched, 3); - assert_eq!( - store - .count_by_status(InflightActivationStatus::Pending) - .await - .unwrap(), - 3 - ); + assert_eq!(store.count_pending_activations().await.unwrap(), 3); fs::remove_file(test_path).await.unwrap(); } @@ -1317,13 +1460,6 @@ demoted_namespaces: ) .await; - assert_counts( - StatusCount { - pending: 2, - ..StatusCount::default() - }, - &store, - ) - .await; + assert_eq!(store.count_pending_activations().await.unwrap(), 2); } } From 712ae3ece4227032bcadf947264fb79d6cffaff3 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 26 Nov 2025 17:00:40 -0500 Subject: [PATCH 09/43] refactor --- src/store/inner_redis_activation_store.rs | 1218 +++++++++++++++++++++ src/store/redis_utils.rs | 227 ++++ 2 files changed, 1445 insertions(+) create mode 100644 src/store/inner_redis_activation_store.rs create mode 100644 src/store/redis_utils.rs diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs new file mode 100644 index 00000000..27c82d6b --- /dev/null +++ b/src/store/inner_redis_activation_store.rs @@ -0,0 +1,1218 @@ +use base64::{Engine as _, engine::general_purpose}; +use thiserror::Error; +use tracing::{error, info, instrument}; +// use deadpool_redis::Pool; +use crate::config::Config; +use crate::store::inflight_activation::{ + InflightActivation, InflightActivationStatus, QueryResult, +}; +use anyhow::Error; +use chrono::{DateTime, Duration, Utc}; +use cityhasher; +use deadpool_redis::cluster::{ + Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, +}; +use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; +use futures::future::try_join_all; +use redis::AsyncTypedCommands; +use sentry_protos::taskbroker::v1::OnAttemptsExceeded; +use std::collections::HashMap; +// use std::sync::RwLock; +use crate::store::redis_utils::{HashKey, KeyBuilder}; +use tokio::sync::RwLock; + +#[derive(Debug)] +pub struct InnerRedisActivationStore { + pool: Pool, + replicas: usize, + topics: HashMap>, + namespaces: Vec, + payload_ttl_seconds: u64, + bucket_hashes: Vec, + hash_keys: Vec, + key_builder: KeyBuilder, + next_key_idx_for_pending: usize, + total_possible_keys: usize, + processing_deadline_grace_sec: i64, + max_processing_attempts: i32, +} + +impl InnerRedisActivationStore { + pub async fn new( + pool: Pool, + replicas: usize, + topics: HashMap>, + namespaces: Vec, + num_buckets: usize, + payload_ttl_seconds: u64, + processing_deadline_grace_sec: u64, + max_processing_attempts: usize, + ) -> Result { + let bucket_hashes = (0..num_buckets).map(|i| format!("{:04x}", i)).collect(); + let mut hash_keys = Vec::new(); + for (topic, partitions) in topics.iter() { + for partition in partitions.iter() { + for namespace in namespaces.iter() { + hash_keys.push(HashKey::new(namespace.clone(), topic.clone(), *partition)); + } + } + } + + Ok(Self { + pool, + replicas, + topics, + namespaces, + bucket_hashes, + hash_keys, + payload_ttl_seconds, + key_builder: KeyBuilder::new(num_buckets), + next_key_idx_for_pending: 0, + total_possible_keys: 0, + processing_deadline_grace_sec: processing_deadline_grace_sec as i64, // Duration expects i64 + max_processing_attempts: max_processing_attempts as i32, + }) + } + + // Called when rebalancing partitions + pub fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { + // This assumes that the broker is always consuming from the same topics and only the partitions are changing + self.topics.insert(topic.clone(), partitions.clone()); + self.hash_keys.clear(); + self.total_possible_keys = 0; + for (topic, partitions) in self.topics.iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { + self.hash_keys + .push(HashKey::new(namespace.clone(), topic.clone(), *partition)); + self.total_possible_keys += self.bucket_hashes.len(); + } + } + } + info!( + "Rebalanced partitions for topic {}: {:?}: {:?}: total possible keys: {}", + topic, partitions, self.topics, self.total_possible_keys + ); + } + + pub async fn store(&self, batch: Vec) -> Result { + let mut conn = self.pool.get().await?; + let mut rows_affected: u64 = 0; + for activation in batch { + let payload_key = self + .key_builder + .get_payload_key( + HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ), + activation.id.as_str(), + ) + .build_redis_key(); + + // Base64 encode the activation since Redis HGETALL doesn't handle the bytes correctly (it tries to UTF-8 decode it) + let encoded_activation = general_purpose::STANDARD.encode(&activation.activation); + println!("payload_key: {:?}", payload_key); + let mut pipe = redis::pipe(); + pipe.atomic() + .hset(payload_key.clone(), "id", activation.id.clone()) + .arg("activation") + .arg(encoded_activation) + .arg("status") + .arg(format!("{:?}", activation.status)) + .arg("topic") + .arg(activation.topic.clone()) + .arg("partition") + .arg(activation.partition) + .arg("offset") + .arg(activation.offset) + .arg("added_at") + .arg(activation.added_at.timestamp_millis()) + .arg("received_at") + .arg(activation.received_at.timestamp_millis()) + .arg("processing_attempts") + .arg(activation.processing_attempts) + .arg("processing_deadline_duration") + .arg(activation.processing_deadline_duration) + .arg("at_most_once") + .arg(activation.at_most_once.to_string()) + .arg("namespace") + .arg(activation.namespace.clone()) + .arg("taskname") + .arg(activation.taskname) + .arg("on_attempts_exceeded") + .arg(activation.on_attempts_exceeded.as_str_name()); + + let mut expected_args = 14; + if activation.expires_at.is_some() { + pipe.arg("expires_at") + .arg(activation.expires_at.unwrap().timestamp_millis()); + expected_args += 1; + } + if activation.delay_until.is_some() { + pipe.arg("delay_until") + .arg(activation.delay_until.unwrap().timestamp()); + expected_args += 1; + } + if activation.processing_deadline.is_some() { + pipe.arg("processing_deadline") + .arg(activation.processing_deadline.unwrap().timestamp()); + expected_args += 1; + } + pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); + + let mut queue_key_used = String::new(); + if activation.delay_until.is_some() { + let delay_key = self + .key_builder + .get_delay_key( + HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ), + activation.id.as_str(), + ) + .build_redis_key(); + pipe.zadd( + delay_key.clone(), + activation.id.clone(), + activation.delay_until.unwrap().timestamp(), + ); + queue_key_used = delay_key; + } else { + let pending_key = self + .key_builder + .get_pending_key( + HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ), + activation.id.as_str(), + ) + .build_redis_key(); + pipe.rpush(pending_key.clone(), activation.id.clone()); + queue_key_used = pending_key; + } + + let mut expired_key = String::new(); + if activation.expires_at.is_some() { + expired_key = self + .key_builder + .get_expired_key( + HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ), + activation.id.as_str(), + ) + .build_redis_key(); + println!("expired_key: {:?}", expired_key); + pipe.zadd( + expired_key.clone(), + activation.id.clone(), + activation.expires_at.unwrap().timestamp_millis() as isize, + ); + } + pipe.cmd("WAIT").arg(1).arg(1000); + + let result: Vec = match pipe.query_async(&mut conn).await { + Ok(result) => result, + Err(err) => { + error!( + "Failed to store activation {} in Redis: {}", + payload_key.clone(), + err + ); + return Err(anyhow::anyhow!( + "Failed to store activation: {}", + payload_key.clone() + )); + } + }; + + if result.len() != 4 && result.len() != 5 { + return Err(anyhow::anyhow!( + "Failed to store activation: incorrect number of commands run: expected 4 or 5, got {} for key {}", + result.len(), + payload_key.clone() + )); + } + // WAIT returns the number of replicas that had the write propagated + // If there is only one node then it will return 0. + if result[result.len() - 1] < self.replicas as i32 - 1 { + return Err(anyhow::anyhow!( + "Activation {} was not stored on any replica", + payload_key + )); + } + + // HSET returns the number of fields set + if result[0] != expected_args { + return Err(anyhow::anyhow!( + "Failed to store activation: expected {} arguments, got {} for key {}", + expected_args, + result[0], + payload_key.clone() + )); + } + // EXPIRE returns 1 on success and 0 on failure + if result[1] != 1 { + return Err(anyhow::anyhow!( + "Failed to expire activation for key {}", + payload_key + )); + } + // Both ZADD and RPUSH return a count of elements in the structure + if result[2] <= 0 { + return Err(anyhow::anyhow!( + "Failed to add activation to queue for key {}", + queue_key_used + )); + } + // Check if the ZADD happened on the expired key + if result.len() == 5 && result[3] <= 0 { + return Err(anyhow::anyhow!( + "Failed to add activation to expired queue for key {}", + expired_key + )); + } + println!("result: {:?}", result); + // This should always return 0 + if *result.last().unwrap() != 0 { + return Err(anyhow::anyhow!( + "Failed to wait for activation to be stored on at least one replica for key {}", + payload_key + )); + } + + // This key has to be set separately since the transaction expects all keys to be in the same hash slot + // and this can't be guaranteed since it doesn't contain the hash key. + let mut pipe = redis::pipe(); + let lookup_key = self + .key_builder + .get_id_lookup_key(activation.id.as_str()) + .build_redis_key(); + pipe.hset(lookup_key.clone(), "id", activation.id.clone()) + .arg("topic") + .arg(activation.topic.clone()) + .arg("partition") + .arg(activation.partition) + .arg("namespace") + .arg(activation.namespace.clone()); + pipe.expire(lookup_key.clone(), self.payload_ttl_seconds as i64); + let result: Vec = pipe.query_async(&mut conn).await?; + if result.len() != 2 { + return Err(anyhow::anyhow!( + "Failed to set id lookup for key {}", + lookup_key.clone() + )); + } + if result[0] != 4 { + return Err(anyhow::anyhow!( + "Failed to set id lookup for key {}", + lookup_key.clone() + )); + } + if result[1] != 1 { + return Err(anyhow::anyhow!( + "Failed to expire id lookup for key {}", + lookup_key.clone() + )); + } + rows_affected += 1; + } + Ok(QueryResult { rows_affected }) + } + + // pub async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { + // let mut conn = self.pool.get().await?; + // let pending_key = self.key_builder.get_pending_key( + // activation.topic.clone(), + // activation.partition, + // activation.namespace.clone(), + // activation.id.as_str(), + // ).build_redis_key(); + // let newlen: usize = conn + // .rpush(pending_key.clone(), activation.id.clone()) + // .await?; + // if newlen == 0 { + // return Err(anyhow::anyhow!( + // "Failed to add activation to pending: {}", + // pending_key.clone() + // )); + // } + // Ok(()) + // } + + // pub async fn add_to_processing(&self, activation: InflightActivation) -> Result<(), Error> { + // let mut conn = self.pool.get().await?; + // let processing_key = Key::new( + // activation.topic.clone(), + // activation.partition, + // activation.namespace.clone(), + // self.num_buckets, + // activation.id.clone(), + // None, + // ).build_redis_key(KeyPrefix::Processing); + // let newlen: usize = conn + // .zadd( + // processing_key.clone(), + // activation.processing_deadline.unwrap().timestamp(), + // activation.id.clone(), + // ) + // .await?; + // if newlen == 0 { + // return Err(anyhow::anyhow!( + // "Failed to add activation to processing: {}", + // processing_key.clone() + // )); + // } + // Ok(()) + // } + + // pub async fn add_to_delay(&self, activation: InflightActivation) -> Result<(), Error> { + // let mut conn = self.pool.get().await?; + // let delay_key = Key::new( + // activation.topic.clone(), + // activation.partition, + // activation.namespace.clone(), + // self.num_buckets, + // activation.id.clone(), + // None, + // ).build_redis_key(KeyPrefix::Delay); + // let newlen: usize = conn + // .zadd( + // delay_key.clone(), + // activation.delay_until.unwrap().timestamp(), + // activation.id.clone(), + // ) + // .await?; + // if newlen == 0 { + // return Err(anyhow::anyhow!( + // "Failed to add activation to delay: {}", + // delay_key.clone() + // )); + // } + // Ok(()) + // } + + // pub async fn add_to_retry(&self, activation: InflightActivation) -> Result<(), Error> { + // let mut conn = self.pool.get().await?; + // let retry_key = Key::new( + // activation.topic.clone(), + // activation.partition, + // activation.namespace.clone(), + // self.num_buckets, + // activation.id.clone(), + // None, + // ).build_redis_key(KeyPrefix::Retry); + // let newlen: usize = conn.rpush(retry_key.clone(), activation.id.clone()).await?; + // if newlen == 0 { + // return Err(anyhow::anyhow!( + // "Failed to add activation to retry: {}", + // retry_key.clone() + // )); + // } + // Ok(()) + // } + + // pub async fn add_to_deadletter(&self, activation: InflightActivation) -> Result<(), Error> { + // let mut conn = self.pool.get().await?; + // let deadletter_key = Key::new( + // activation.topic.clone(), + // activation.partition, + // activation.namespace.clone(), + // self.num_buckets, + // activation.id.clone(), + // None, + // ).build_redis_key(KeyPrefix::Deadletter); + // let newlen: usize = conn + // .rpush(deadletter_key.clone(), activation.id.clone()) + // .await?; + // if newlen == 0 { + // return Err(anyhow::anyhow!( + // "Failed to add activation to deadletter: {}", + // deadletter_key.clone() + // )); + // } + // Ok(()) + // } + + pub async fn discard_activation( + &self, + hashkey: HashKey, + activation_id: &str, + ) -> Result<(), Error> { + // If the activation is not found, return a no-op. + // If the activation is at_most_once, discard the activation and remove the payloads. + // If it has deadletter configured, move it to the deadletter queue and keep the payloads. + let fields = self + .get_fields_by_id( + hashkey.clone(), + activation_id, + &["at_most_once", "on_attempts_exceeded"], + ) + .await?; + if fields.is_empty() { + return Ok(()); + } + let at_most_once = fields.get("at_most_once").unwrap().parse::().unwrap(); + let on_attempts_exceeded = + OnAttemptsExceeded::from_str_name(fields.get("on_attempts_exceeded").unwrap().as_str()) + .unwrap(); + let mut conn = self.pool.get().await?; + if !at_most_once && on_attempts_exceeded == OnAttemptsExceeded::Deadletter { + let deadletter_key = self + .key_builder + .get_deadletter_key(hashkey.clone(), activation_id) + .build_redis_key(); + let result: usize = conn.rpush(deadletter_key.clone(), activation_id).await?; + if result == 0 { + return Err(anyhow::anyhow!( + "Failed to add activation to deadletter: {}", + deadletter_key.clone() + )); + } + return Ok(()); + } + let payload_key = self + .key_builder + .get_payload_key(hashkey, activation_id) + .build_redis_key(); + let id_lookup_key = self + .key_builder + .get_id_lookup_key(activation_id) + .build_redis_key(); + let result: usize = conn.del(payload_key.clone()).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to discard payload for key {}", + payload_key.clone() + )); + } + let result: usize = conn.del(id_lookup_key.clone()).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to discard id lookup for key {}", + id_lookup_key.clone() + )); + } + Ok(()) + } + + // Only used in testing + pub async fn delete_all_keys(&self) -> Result<(), Error> { + let mut conn = self.pool.get().await?; + let keys: Vec = conn.keys("*").await?; + for key in keys { + conn.del(key).await?; + } + Ok(()) + } + + #[instrument(skip_all)] + pub async fn get_pending_activation( + &self, + namespace: Option<&str>, + ) -> Result, Error> { + let namespaces = namespace.map(|ns| vec![ns.to_string()]); + let result = self + .get_pending_activations_from_namespaces(namespaces.as_deref(), Some(1)) + .await?; + if result.is_empty() { + return Ok(None); + } + Ok(Some(result[0].clone())) + } + + /// Get a pending activation from specified namespaces + /// If namespaces is None, gets from any namespace + /// If namespaces is Some(&[...]), gets from those namespaces + #[instrument(skip_all)] + pub async fn get_pending_activations_from_namespaces( + &self, + namespaces: Option<&[String]>, + limit: Option, + ) -> Result, Error> { + let mut conn = self.pool.get().await?; + let mut activations: Vec = Vec::new(); + for hash_key in self.hash_keys.iter() { + if namespaces.is_some() && !namespaces.unwrap().contains(&hash_key.namespace) { + continue; + } + for bucket_hash in self.bucket_hashes.iter() { + // Get the next pending activation + let pending_key = self + .key_builder + .get_pending_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let result = conn.lindex(pending_key.clone(), 0).await?; + if result.is_none() { + continue; + } + let activation_id: String = result.unwrap().to_string(); + println!("activation_id: {:?}", activation_id); + + let act_result = self.get_by_id(hash_key.clone(), &activation_id).await?; + if act_result.is_none() { + continue; + } + let activation = act_result.unwrap(); + + // Push the activation to processing. This will not create two entries for the same activation in the case of duplicates. + let processing_key = self + .key_builder + .get_processing_key(hash_key.clone(), &activation_id) + .build_redis_key(); + let processing_deadline = match activation.processing_deadline { + None => Utc::now() + Duration::seconds(self.processing_deadline_grace_sec), + Some(apd) => apd, + } + .timestamp_millis(); + let result: usize = conn + .zadd( + processing_key.clone(), + activation.id.clone(), + processing_deadline, + ) + .await?; + if result == 0 { + return Err(anyhow::anyhow!( + "Failed to move activation to processing: {} {}", + processing_key, + activation_id + )); + } + + let result: usize = conn + .lrem(pending_key.clone(), 1, activation_id.clone()) + .await?; + if result == 0 { + info!( + "Attempted to lrem an activation from pending queue, but it was not found: {} {}", + pending_key, activation_id + ); + metrics::counter!("inflight_redis_activation_store_lrem_not_found") + .increment(1); + } + activations.push(activation); + if activations.len() >= limit.unwrap() as usize { + return Ok(activations); + } + } + } + Ok(activations) + } + + pub fn incr_next_key_idx_for_pending(&mut self) { + self.next_key_idx_for_pending += 1; + if self.next_key_idx_for_pending >= self.total_possible_keys { + self.next_key_idx_for_pending = 0; + } + } + + /// Get an activation by id. Primarily used for testing + pub async fn get_by_id( + &self, + hash_key: HashKey, + activation_id: &str, + ) -> Result, Error> { + let mut conn = self.pool.get().await?; + let payload_key = self + .key_builder + .get_payload_key(hash_key, activation_id) + .build_redis_key(); + let result: HashMap = conn.hgetall(payload_key.clone()).await?; + if result.is_empty() { + return Ok(None); + } + let activation: InflightActivation = result.into(); + Ok(Some(activation)) + } + + pub async fn get_by_id_lookup( + &self, + activation_id: &str, + ) -> Result, Error> { + let result = self.get_hashkey_by_id(activation_id).await?; + if result.is_none() { + return Ok(None); + } + + let hash_key = result.unwrap(); + let activation = self.get_by_id(hash_key, activation_id).await?; + Ok(activation) + } + + pub async fn get_hashkey_by_id(&self, activation_id: &str) -> Result, Error> { + let mut conn = self.pool.get().await?; + let result: HashMap = conn + .hgetall( + self.key_builder + .get_id_lookup_key(activation_id) + .build_redis_key(), + ) + .await?; + if result.is_empty() { + return Ok(None); + } + Ok(Some(HashKey::new( + result.get("namespace").unwrap().to_string(), + result.get("topic").unwrap().to_string(), + result.get("partition").unwrap().parse().unwrap(), + ))) + } + + pub async fn get_fields_by_id( + &self, + hash_key: HashKey, + activation_id: &str, + fields: &[&str], + ) -> Result, Error> { + let mut conn = self.pool.get().await?; + let payload_key = self + .key_builder + .get_payload_key(hash_key, activation_id) + .build_redis_key(); + let mut pipe = redis::pipe(); + pipe.hmget(payload_key.clone(), fields[0]); + for field in fields.iter().skip(1) { + pipe.arg(field); + } + let result: Vec> = pipe.query_async(&mut *conn).await?; + // Returns an array of tuples with the values in the same order as the fields array. + // These needs to be combined into a map. + let mut fields_map = HashMap::new(); + for values in result.iter() { + for (idx, arg_name) in fields.iter().enumerate() { + fields_map.insert(arg_name.to_string(), values[idx].clone()); + } + } + Ok(fields_map) + } + + pub async fn set_status( + &self, + activation_id: &str, + status: InflightActivationStatus, + ) -> Result<(), Error> { + // If the activation is not found, return a no-op + let activation = self.get_by_id_lookup(activation_id).await?; + if activation.is_none() { + info!( + "Activation not found for id: {}, skipping status update", + activation_id + ); + return Ok(()); + } + let activation = activation.unwrap(); + let hash_key = HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ); + let mut conn = self.pool.get().await?; + let mut pipe = redis::pipe(); + pipe.atomic(); + let mut has_failure = false; + if status == InflightActivationStatus::Retry { + has_failure = true; + pipe.rpush( + self.key_builder + .get_retry_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(), + activation_id, + ); + } else if status == InflightActivationStatus::Failure + && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter + { + has_failure = true; + pipe.rpush( + self.key_builder + .get_deadletter_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(), + activation_id, + ); + } + let processing_key = self + .key_builder + .get_processing_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(); + pipe.zrem(processing_key, activation_id); + + let results: Vec = pipe.query_async(&mut *conn).await?; + let expected_commands = if has_failure { 2 } else { 1 }; + if results.len() != expected_commands { + return Err(anyhow::anyhow!( + "Failed to set status: incorrect number of commands run: expected {}, got {} for key {}", + expected_commands, + results.len(), + activation_id + )); + } + + let processing_removed = if has_failure { results[1] } else { results[0] }; + if has_failure && results[0] != 1 { + return Err(anyhow::anyhow!( + "Failed to add activation to retry/deadletter queue: {}", + activation_id + )); + } + + if processing_removed != 1 { + // Removing from processing set + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } + Ok(()) + } + + pub async fn get_retry_activations(&self) -> Result, Error> { + let mut conn = self.pool.get().await?; + let mut activation_ids: Vec<(HashKey, String)> = Vec::new(); + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let retry_key = self + .key_builder + .get_retry_key_for_iter(hash_key.clone(), bucket_hash.as_str()); + let result: Vec = conn.lrange(retry_key.build_redis_key(), 0, -1).await?; + activation_ids.extend( + result + .iter() + .map(|id| (retry_key.hashkey.clone(), id.clone())), + ); + } + } + + let activations = try_join_all( + activation_ids + .iter() + .map(|(hashkey, id)| self.get_by_id(hashkey.clone(), id)), + ) + .await?; + Ok(activations.into_iter().flatten().collect()) + } + + pub async fn mark_retry_completed( + &self, + activations: Vec, + ) -> Result { + if activations.is_empty() { + return Ok(0); + } + let mut conn = self.pool.get().await?; + + // Since this is a global operation, there is no guarantee that the keys will have the same hash key. + // Group the activations by hash key and then remove them in transactions. + let mut hash_key_to_activations = HashMap::new(); + for activation in activations.iter() { + let hash_key = HashKey::new( + activation.namespace.clone(), + activation.topic.clone(), + activation.partition, + ); + hash_key_to_activations + .entry(hash_key) + .or_insert(Vec::new()) + .push(activation.clone()); + } + + let mut id_lookup_keys: Vec = Vec::new(); + let mut rows_affected: u64 = 0; + for (hash_key, activations) in hash_key_to_activations.iter() { + let mut pipe = redis::pipe(); + pipe.atomic(); + for activation in activations.iter() { + let retry_key = self + .key_builder + .get_retry_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(); + pipe.lrem(retry_key, 0, activation.id.as_str()); + pipe.del( + self.key_builder + .get_payload_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(), + ); + id_lookup_keys.push( + self.key_builder + .get_id_lookup_key(activation.id.as_str()) + .build_redis_key(), + ); + } + let results: Vec = pipe.query_async(&mut *conn).await?; + if results.is_empty() { + continue; + } + // Only sum every other element. This will be the output of the LREM command, which returns how many + // elements were removed from the retry queue. + rows_affected += results + .iter() + .enumerate() + .filter(|(i, _)| i % 2 == 0) + .map(|(_, value)| *value) + .sum::() as u64; + } + + let mut pipe = redis::pipe(); + pipe.del(id_lookup_keys[0].clone()); + for id_lookup_key in id_lookup_keys.iter().skip(1) { + pipe.arg(id_lookup_key); + } + + // Since these keys expire, it's not a big deal if not all of them are deleted here. + let deleted_count: Vec = pipe.query_async(&mut *conn).await?; + if deleted_count[0] != id_lookup_keys.len() { + error!( + "Failed to delete all retry id lookup keys: expected {}, got {}", + id_lookup_keys.len(), + deleted_count[0] + ); + } + Ok(rows_affected) + } + + pub async fn handle_processing_deadline(&self) -> Result<(u64, u64, u64), Error> { + // Get all the activations that have exceeded their processing deadline + // Idempotent activations that fail their processing deadlines go directly to failure + // there are no retries, as the worker will reject the activation due to idempotency keys. + // If the task has processing attempts remaining, it is moved back to pending with attempts += 1 + // Otherwise it is either discarded or moved to retry/deadletter. + let mut conn = self.pool.get().await?; + let mut total_rows_affected: u64 = 0; + let mut discarded_count: u64 = 0; + let mut processing_attempts_exceeded_count: u64 = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let mut pipe = redis::pipe(); + let processing_key = self + .key_builder + .get_processing_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + // ZRANGEBYSCORE is deprecated but ZRANGE ... BYSCORE is also not supported so? + let activations: Vec = conn + .zrangebyscore( + processing_key.clone(), + "-inf".to_string(), + Utc::now().timestamp_millis() as isize, + ) + .await?; + if activations.is_empty() { + continue; + } + total_rows_affected += activations.len() as u64; + for activation_id in activations.iter() { + let fields = self + .get_fields_by_id( + hash_key.clone(), + activation_id, + &["processing_attempts", "at_most_once"], + ) + .await?; + if fields.is_empty() { + error!( + "Failed to get payload for activation past processing deadline: {}", + activation_id + ); + continue; + } + let at_most_once = fields + .get("at_most_once") + .unwrap_or(&"false".to_string()) + .parse::() + .unwrap(); + if at_most_once { + let result = conn.zrem(processing_key.clone(), activation_id).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } + self.discard_activation(hash_key.clone(), activation_id) + .await?; + discarded_count += 1; + continue; + } + let processing_attempts = fields + .get("processing_attempts") + .unwrap_or(&"0".to_string()) + .parse::() + .unwrap(); + if processing_attempts >= self.max_processing_attempts { + // Check for deadletter/dlq + processing_attempts_exceeded_count += 1; + let result = conn.zrem(processing_key.clone(), activation_id).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } + self.discard_activation(hash_key.clone(), activation_id) + .await?; + discarded_count += 1; + continue; + } + // Move back to pending + let pending_key = self + .key_builder + .get_pending_key(hash_key.clone(), activation_id) + .build_redis_key(); + let payload_key = self + .key_builder + .get_payload_key(hash_key.clone(), activation_id) + .build_redis_key(); + let mut pipe = redis::pipe(); + pipe.atomic(); + pipe.hset( + payload_key, + "processing_attempts", + (processing_attempts + 1).to_string(), + ); + pipe.rpush(pending_key, activation_id); + pipe.zrem(processing_key.clone(), activation_id); + let results: Vec = pipe.query_async(&mut *conn).await?; + if results.len() != 3 { + return Err(anyhow::anyhow!( + "Failed to move activation back to pending: incorrect number of commands run: expected 3, got {} for key {}", + results.len(), + activation_id + )); + } + // processing_attempts should already be a key in the payload, so this should return 0 + if results[0] != 0 { + return Err(anyhow::anyhow!( + "Failed to increment processing attempts: {}", + activation_id + )); + } + if results[1] != 1 { + return Err(anyhow::anyhow!( + "Failed to add activation to pending queue: {}", + activation_id + )); + } + if results[2] != 1 { + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } + } + } + } + Ok(( + total_rows_affected, + discarded_count, + processing_attempts_exceeded_count, + )) + } + + pub async fn handle_processing_attempts(&self) -> Result { + // No-op + Ok(0) + } + + pub async fn handle_expires_at(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_rows_affected = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let expires_at_key = self + .key_builder + .get_expired_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let activations: Vec = conn + .zrangebyscore( + expires_at_key.clone(), + 0, + Utc::now().timestamp_millis() as isize, + ) + .await?; + if activations.is_empty() { + continue; + } + total_rows_affected += activations.len() as u64; + let mut pipe = redis::pipe(); + pipe.atomic(); + for activation_id in activations.iter() { + let pending_key = self + .key_builder + .get_pending_key(hash_key.clone(), activation_id) + .build_redis_key(); + pipe.lrem(pending_key, 0, activation_id); + pipe.zrem(expires_at_key.clone(), activation_id); + self.discard_activation(hash_key.clone(), activation_id) + .await?; + } + let results: Vec = pipe.query_async(&mut *conn).await?; + if results.len() != 2 * activations.len() { + return Err(anyhow::anyhow!( + "Failed to remove expired activations: {}", + expires_at_key + )); + } + } + } + Ok(total_rows_affected) + } + + pub async fn handle_delay_until(&self) -> Result { + Ok(0) + } + + pub async fn handle_deadletter_tasks(&self) -> Result)>, Error> { + Ok(vec![]) + } + + pub async fn mark_deadletter_completed(&self, ids: Vec) -> Result { + Ok(0) + } + + pub async fn remove_killswitched(&self, killswitched_tasks: Vec) -> Result { + Ok(0) + } + + pub async fn mark_demoted_completed(&self, ids: Vec) -> Result { + Ok(0) + } + + pub async fn move_delay_to_pending(&self) -> Result<(), Error> { + Ok(()) + } + + pub async fn get_processing_deadline_exceeded_activations( + &self, + ) -> Result, Error> { + return Ok(vec![]); + } + + pub async fn get_processing_attempts_for_activation(&self, id: &str) -> Result { + return Ok(0); + } + + pub async fn retry_activation_locally(&self, id: &str) -> Result<(), Error> { + // Increment processing attempts by 1 and push back to pending in transaction + return Ok(()); + } + + pub async fn remove_from_processing(&self, id: &str) -> Result<(), Error> { + // Remove from processing in transaction + return Ok(()); + } + + pub async fn remove_from_pending(&self, id: &str) -> Result<(), Error> { + // Remove from pending in transaction + return Ok(()); + } + + pub async fn remove_from_delay(&self, id: &str) -> Result<(), Error> { + // Remove from delay in transaction + return Ok(()); + } + + pub async fn get_deadletter_activations(&self) -> Result, Error> { + return Ok(vec![]); + } + + pub async fn get_expired_activations(&self) -> Result, Error> { + return Ok(vec![]); + } + + pub async fn pending_activation_max_lag(&self, now: &DateTime) -> Result { + Ok(0) + } + + #[instrument(skip_all)] + pub async fn count_pending_activations(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let pending_key = self + .key_builder + .get_pending_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let count: usize = conn.llen(pending_key).await?; + total_count += count; + } + } + return Ok(total_count); + } + + #[instrument(skip_all)] + pub async fn count_delayed_activations(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let delay_key = self + .key_builder + .get_delay_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let count: usize = conn.zcard(delay_key.clone()).await?; + total_count += count; + } + } + return Ok(total_count); + } + + #[instrument(skip_all)] + pub async fn count_processing_activations(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let processing_key = self + .key_builder + .get_processing_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let count: usize = conn.zcard(processing_key.clone()).await?; + total_count += count; + } + } + return Ok(total_count); + } + + pub async fn count_retry_activations(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let retry_key = self + .key_builder + .get_retry_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let count: usize = conn.llen(retry_key.clone()).await?; + total_count += count; + } + } + return Ok(total_count); + } + + pub async fn count_deadletter_activations(&self) -> Result { + let mut conn = self.pool.get().await?; + let mut total_count = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let retry_key = self + .key_builder + .get_deadletter_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let count: usize = conn.llen(retry_key.clone()).await?; + total_count += count; + } + } + return Ok(total_count); + } + + pub async fn db_size(&self) -> Result { + return Ok(0); + } +} diff --git a/src/store/redis_utils.rs b/src/store/redis_utils.rs new file mode 100644 index 00000000..6983c3c7 --- /dev/null +++ b/src/store/redis_utils.rs @@ -0,0 +1,227 @@ +use base64::{Engine as _, engine::general_purpose}; +use thiserror::Error; +use tracing::{error, info, instrument}; +// use deadpool_redis::Pool; +use crate::config::Config; +use crate::store::inflight_activation::{ + InflightActivation, InflightActivationStatus, QueryResult, +}; +use anyhow::Error; +use chrono::{DateTime, Duration, Utc}; +use cityhasher; +use deadpool_redis::cluster::{ + Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, +}; +use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; +use futures::future::try_join_all; +use redis::AsyncTypedCommands; +use sentry_protos::taskbroker::v1::OnAttemptsExceeded; +use std::collections::HashMap; +// use std::sync::RwLock; +use tokio::sync::RwLock; + +pub enum KeyPrefix { + Payload, + IDLookup, + Pending, + Processing, + Delay, + Retry, + Deadletter, + Expired, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct HashKey { + pub namespace: String, + pub topic: String, + pub partition: i32, +} +impl HashKey { + pub fn new(namespace: String, topic: String, partition: i32) -> Self { + Self { + namespace, + topic, + partition, + } + } + + pub fn hash(&self) -> String { + format!("{}:{}:{}", self.namespace, self.topic, self.partition) + } +} + +#[derive(Debug)] +pub struct KeyBuilder { + num_buckets: usize, +} +impl KeyBuilder { + pub fn new(num_buckets: usize) -> Self { + Self { num_buckets } + } + + pub fn compute_bucket(&self, activation_id: &str) -> String { + let hashint: u64 = cityhasher::hash(activation_id); + format!("{:04x}", hashint % self.num_buckets as u64) + } + + pub fn get_id_lookup_key(&self, activation_id: &str) -> Key { + Key::new( + KeyPrefix::IDLookup, + HashKey::new(String::new(), String::new(), 0), + String::new(), + Some(activation_id.to_string()), + ) + } + + pub fn get_payload_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Payload, + hash_key, + self.compute_bucket(activation_id), + Some(activation_id.to_string()), + ) + } + + pub fn get_pending_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Pending, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_pending_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new(KeyPrefix::Pending, hash_key, bucket_hash.to_string(), None) + } + + pub fn get_processing_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Processing, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_processing_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new( + KeyPrefix::Processing, + hash_key, + bucket_hash.to_string(), + None, + ) + } + + pub fn get_delay_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Delay, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_delay_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new(KeyPrefix::Delay, hash_key, bucket_hash.to_string(), None) + } + + pub fn get_retry_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Retry, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_retry_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new(KeyPrefix::Retry, hash_key, bucket_hash.to_string(), None) + } + + pub fn get_deadletter_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Deadletter, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_deadletter_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new( + KeyPrefix::Deadletter, + hash_key, + bucket_hash.to_string(), + None, + ) + } + + pub fn get_expired_key(&self, hash_key: HashKey, activation_id: &str) -> Key { + Key::new( + KeyPrefix::Expired, + hash_key, + self.compute_bucket(activation_id), + None, + ) + } + + pub fn get_expired_key_for_iter(&self, hash_key: HashKey, bucket_hash: &str) -> Key { + Key::new(KeyPrefix::Expired, hash_key, bucket_hash.to_string(), None) + } +} + +pub struct Key { + pub prefix: KeyPrefix, + pub hashkey: HashKey, + pub bucket_hash: String, + pub activation_id: Option, +} +impl Key { + pub fn new( + prefix: KeyPrefix, + hash_key: HashKey, + bucket_hash: String, + activation_id: Option, + ) -> Self { + Self { + prefix, + hashkey: hash_key, + bucket_hash, + activation_id, + } + } + + pub fn build_redis_key(&self) -> String { + let key = match self.prefix { + KeyPrefix::Payload => { + format!("payload:{{{}:{}}}", self.hashkey.hash(), self.bucket_hash) + } + KeyPrefix::IDLookup => "idlookup:".to_string(), + KeyPrefix::Pending => { + format!("pending:{{{}:{}}}", self.hashkey.hash(), self.bucket_hash) + } + KeyPrefix::Processing => format!( + "processing:{{{}:{}}}", + self.hashkey.hash(), + self.bucket_hash + ), + KeyPrefix::Delay => format!("delay:{{{}:{}}}", self.hashkey.hash(), self.bucket_hash), + KeyPrefix::Retry => format!("retry:{{{}:{}}}", self.hashkey.hash(), self.bucket_hash), + KeyPrefix::Deadletter => format!( + "deadletter:{{{}:{}}}", + self.hashkey.hash(), + self.bucket_hash + ), + KeyPrefix::Expired => { + format!("expired:{{{}:{}}}", self.hashkey.hash(), self.bucket_hash) + } + }; + if self.activation_id.is_some() { + format!("{}:{}", key, self.activation_id.clone().unwrap()) + } else { + key + } + } +} From 9e431ef1a7b31c66ab246c0053be2269b8e09ded Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 27 Nov 2025 10:18:10 -0500 Subject: [PATCH 10/43] delay until and tests --- src/grpc/server.rs | 2 +- src/grpc/server_tests.rs | 2 +- src/kafka/inflight_activation_writer.rs | 5 +- src/main.rs | 22 +- src/store/inflight_redis_activation.rs | 15 +- src/store/inner_redis_activation_store.rs | 357 ++++++++-------------- src/store/redis_utils.rs | 20 -- src/upkeep.rs | 84 ++--- 8 files changed, 158 insertions(+), 349 deletions(-) diff --git a/src/grpc/server.rs b/src/grpc/server.rs index 1fa4977d..890be775 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use std::time::Instant; use tonic::{Request, Response, Status}; -use crate::store::inflight_activation::{InflightActivationStatus, InflightActivationStore}; +use crate::store::inflight_activation::InflightActivationStatus; use crate::store::inflight_redis_activation::RedisActivationStore; use tracing::{error, instrument}; diff --git a/src/grpc/server_tests.rs b/src/grpc/server_tests.rs index d20b777a..02b30899 100644 --- a/src/grpc/server_tests.rs +++ b/src/grpc/server_tests.rs @@ -4,7 +4,7 @@ use tonic::{Code, Request}; use crate::grpc::server::TaskbrokerServer; -use crate::test_utils::{create_redis_test_store, create_test_store, make_activations}; +use crate::test_utils::{create_redis_test_store, make_activations}; #[tokio::test] async fn test_get_task() { diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index 8622b658..3fdb0080 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -5,13 +5,10 @@ use std::{ use crate::{ config::Config, - store::inflight_activation::{ - InflightActivation, InflightActivationStatus, InflightActivationStore, - }, + store::inflight_activation::{InflightActivation, InflightActivationStatus}, store::inflight_redis_activation::RedisActivationStore, }; use chrono::Utc; -use std::sync::RwLock; use tokio::time::sleep; use tracing::{debug, error, instrument}; diff --git a/src/main.rs b/src/main.rs index c2e6e49f..1054fea5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,12 +6,11 @@ use taskbroker::kafka::inflight_activation_batcher::{ ActivationBatcherConfig, InflightActivationBatcher, }; use taskbroker::upkeep::upkeep; +use tokio::select; use tokio::signal::unix::SignalKind; -use tokio::sync::RwLock; use tokio::task::JoinHandle; -use tokio::{select, time}; use tonic::transport::Server; -use tracing::{debug, error, info, warn}; +use tracing::{error, info}; use sentry_protos::taskbroker::v1::consumer_service_server::ConsumerServiceServer; @@ -31,9 +30,6 @@ use taskbroker::logging; use taskbroker::metrics; use taskbroker::processing_strategy; use taskbroker::runtime_config::RuntimeConfigManager; -use taskbroker::store::inflight_activation::{ - InflightActivationStore, InflightActivationStoreConfig, -}; use taskbroker::store::inflight_redis_activation::{ RedisActivationStore, RedisActivationStoreConfig, }; @@ -66,13 +62,6 @@ async fn main() -> Result<(), Error> { logging::init(logging::LoggingConfig::from_config(&config)); metrics::init(metrics::MetricsConfig::from_config(&config)); - let store = Arc::new( - InflightActivationStore::new( - &config.db_path, - InflightActivationStoreConfig::from_config(&config), - ) - .await?, - ); let redis_store = Arc::new( RedisActivationStore::new( config.redis_cluster_urls.clone(), @@ -90,13 +79,6 @@ async fn main() -> Result<(), Error> { ) .await?; } - if config.full_vacuum_on_start { - info!("Running full vacuum on database"); - match store.full_vacuum_db().await { - Ok(_) => info!("Full vacuum completed."), - Err(err) => error!("Failed to run full vacuum on startup: {:?}", err), - } - } // Get startup time after migrations and vacuum let startup_time = Utc::now(); diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index a5f80b17..bbce6c49 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,23 +1,15 @@ use crate::store::inner_redis_activation_store::InnerRedisActivationStore; use crate::store::redis_utils::HashKey; -use base64::{Engine as _, engine::general_purpose}; use thiserror::Error; -use tracing::{error, info, instrument}; +use tracing::error; // use deadpool_redis::Pool; use crate::config::Config; use crate::store::inflight_activation::{ InflightActivation, InflightActivationStatus, QueryResult, }; -use anyhow::Error; -use chrono::{DateTime, Duration, Utc}; -use cityhasher; -use deadpool_redis::cluster::{ - Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, -}; +use chrono::{DateTime, Utc}; use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; -use redis::AsyncTypedCommands; -use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; // use std::sync::RwLock; use tokio::sync::RwLock; @@ -130,7 +122,6 @@ impl RedisActivationStore { let result = self.inner.read().await.store(batch).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); - println!("error: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "store".to_string(), error: error_string, @@ -301,7 +292,6 @@ impl RedisActivationStore { .await; if result.is_err() { let error_string = result.err().unwrap().to_string(); - println!("error: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "set_status".to_string(), error: error_string, @@ -348,7 +338,6 @@ impl RedisActivationStore { let result = self.inner.read().await.handle_processing_deadline().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); - println!("error: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_processing_deadline".to_string(), error: error_string, diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 27c82d6b..7ddd2cc7 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -1,25 +1,16 @@ -use base64::{Engine as _, engine::general_purpose}; -use thiserror::Error; -use tracing::{error, info, instrument}; -// use deadpool_redis::Pool; -use crate::config::Config; use crate::store::inflight_activation::{ InflightActivation, InflightActivationStatus, QueryResult, }; +use crate::store::redis_utils::{HashKey, KeyBuilder}; use anyhow::Error; +use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Duration, Utc}; -use cityhasher; -use deadpool_redis::cluster::{ - Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, -}; -use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; +use deadpool_redis::Pool; use futures::future::try_join_all; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; -// use std::sync::RwLock; -use crate::store::redis_utils::{HashKey, KeyBuilder}; -use tokio::sync::RwLock; +use tracing::{error, info, instrument}; #[derive(Debug)] pub struct InnerRedisActivationStore { @@ -113,7 +104,6 @@ impl InnerRedisActivationStore { // Base64 encode the activation since Redis HGETALL doesn't handle the bytes correctly (it tries to UTF-8 decode it) let encoded_activation = general_purpose::STANDARD.encode(&activation.activation); - println!("payload_key: {:?}", payload_key); let mut pipe = redis::pipe(); pipe.atomic() .hset(payload_key.clone(), "id", activation.id.clone()) @@ -152,12 +142,12 @@ impl InnerRedisActivationStore { } if activation.delay_until.is_some() { pipe.arg("delay_until") - .arg(activation.delay_until.unwrap().timestamp()); + .arg(activation.delay_until.unwrap().timestamp_millis()); expected_args += 1; } if activation.processing_deadline.is_some() { pipe.arg("processing_deadline") - .arg(activation.processing_deadline.unwrap().timestamp()); + .arg(activation.processing_deadline.unwrap().timestamp_millis()); expected_args += 1; } pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); @@ -178,7 +168,7 @@ impl InnerRedisActivationStore { pipe.zadd( delay_key.clone(), activation.id.clone(), - activation.delay_until.unwrap().timestamp(), + activation.delay_until.unwrap().timestamp_millis() as isize, ); queue_key_used = delay_key; } else { @@ -210,7 +200,6 @@ impl InnerRedisActivationStore { activation.id.as_str(), ) .build_redis_key(); - println!("expired_key: {:?}", expired_key); pipe.zadd( expired_key.clone(), activation.id.clone(), @@ -280,7 +269,6 @@ impl InnerRedisActivationStore { expired_key )); } - println!("result: {:?}", result); // This should always return 0 if *result.last().unwrap() != 0 { return Err(anyhow::anyhow!( @@ -328,120 +316,38 @@ impl InnerRedisActivationStore { Ok(QueryResult { rows_affected }) } - // pub async fn add_to_pending(&self, activation: InflightActivation) -> Result<(), Error> { - // let mut conn = self.pool.get().await?; - // let pending_key = self.key_builder.get_pending_key( - // activation.topic.clone(), - // activation.partition, - // activation.namespace.clone(), - // activation.id.as_str(), - // ).build_redis_key(); - // let newlen: usize = conn - // .rpush(pending_key.clone(), activation.id.clone()) - // .await?; - // if newlen == 0 { - // return Err(anyhow::anyhow!( - // "Failed to add activation to pending: {}", - // pending_key.clone() - // )); - // } - // Ok(()) - // } - - // pub async fn add_to_processing(&self, activation: InflightActivation) -> Result<(), Error> { - // let mut conn = self.pool.get().await?; - // let processing_key = Key::new( - // activation.topic.clone(), - // activation.partition, - // activation.namespace.clone(), - // self.num_buckets, - // activation.id.clone(), - // None, - // ).build_redis_key(KeyPrefix::Processing); - // let newlen: usize = conn - // .zadd( - // processing_key.clone(), - // activation.processing_deadline.unwrap().timestamp(), - // activation.id.clone(), - // ) - // .await?; - // if newlen == 0 { - // return Err(anyhow::anyhow!( - // "Failed to add activation to processing: {}", - // processing_key.clone() - // )); - // } - // Ok(()) - // } - - // pub async fn add_to_delay(&self, activation: InflightActivation) -> Result<(), Error> { - // let mut conn = self.pool.get().await?; - // let delay_key = Key::new( - // activation.topic.clone(), - // activation.partition, - // activation.namespace.clone(), - // self.num_buckets, - // activation.id.clone(), - // None, - // ).build_redis_key(KeyPrefix::Delay); - // let newlen: usize = conn - // .zadd( - // delay_key.clone(), - // activation.delay_until.unwrap().timestamp(), - // activation.id.clone(), - // ) - // .await?; - // if newlen == 0 { - // return Err(anyhow::anyhow!( - // "Failed to add activation to delay: {}", - // delay_key.clone() - // )); - // } - // Ok(()) - // } - - // pub async fn add_to_retry(&self, activation: InflightActivation) -> Result<(), Error> { - // let mut conn = self.pool.get().await?; - // let retry_key = Key::new( - // activation.topic.clone(), - // activation.partition, - // activation.namespace.clone(), - // self.num_buckets, - // activation.id.clone(), - // None, - // ).build_redis_key(KeyPrefix::Retry); - // let newlen: usize = conn.rpush(retry_key.clone(), activation.id.clone()).await?; - // if newlen == 0 { - // return Err(anyhow::anyhow!( - // "Failed to add activation to retry: {}", - // retry_key.clone() - // )); - // } - // Ok(()) - // } - - // pub async fn add_to_deadletter(&self, activation: InflightActivation) -> Result<(), Error> { - // let mut conn = self.pool.get().await?; - // let deadletter_key = Key::new( - // activation.topic.clone(), - // activation.partition, - // activation.namespace.clone(), - // self.num_buckets, - // activation.id.clone(), - // None, - // ).build_redis_key(KeyPrefix::Deadletter); - // let newlen: usize = conn - // .rpush(deadletter_key.clone(), activation.id.clone()) - // .await?; - // if newlen == 0 { - // return Err(anyhow::anyhow!( - // "Failed to add activation to deadletter: {}", - // deadletter_key.clone() - // )); - // } - // Ok(()) - // } - + pub async fn cleanup_activation( + &self, + hashkey: HashKey, + activation_id: &str, + ) -> Result<(), Error> { + let mut conn = self.pool.get().await?; + let payload_key = self + .key_builder + .get_payload_key(hashkey, activation_id) + .build_redis_key(); + let id_lookup_key = self + .key_builder + .get_id_lookup_key(activation_id) + .build_redis_key(); + let result: usize = conn.del(payload_key.clone()).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to cleanup payload for key {}", + payload_key.clone() + )); + } + let result: usize = conn.del(id_lookup_key.clone()).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to cleanup id lookup for key {}", + id_lookup_key.clone() + )); + } + Ok(()) + } + /// Discard an activation. If the activation is at_most_once, remove the payloads. + #[instrument(skip_all)] pub async fn discard_activation( &self, hashkey: HashKey, @@ -479,38 +385,7 @@ impl InnerRedisActivationStore { } return Ok(()); } - let payload_key = self - .key_builder - .get_payload_key(hashkey, activation_id) - .build_redis_key(); - let id_lookup_key = self - .key_builder - .get_id_lookup_key(activation_id) - .build_redis_key(); - let result: usize = conn.del(payload_key.clone()).await?; - if result != 1 { - return Err(anyhow::anyhow!( - "Failed to discard payload for key {}", - payload_key.clone() - )); - } - let result: usize = conn.del(id_lookup_key.clone()).await?; - if result != 1 { - return Err(anyhow::anyhow!( - "Failed to discard id lookup for key {}", - id_lookup_key.clone() - )); - } - Ok(()) - } - - // Only used in testing - pub async fn delete_all_keys(&self) -> Result<(), Error> { - let mut conn = self.pool.get().await?; - let keys: Vec = conn.keys("*").await?; - for key in keys { - conn.del(key).await?; - } + self.cleanup_activation(hashkey, activation_id).await?; Ok(()) } @@ -555,7 +430,6 @@ impl InnerRedisActivationStore { continue; } let activation_id: String = result.unwrap().to_string(); - println!("activation_id: {:?}", activation_id); let act_result = self.get_by_id(hash_key.clone(), &activation_id).await?; if act_result.is_none() { @@ -721,12 +595,11 @@ impl InnerRedisActivationStore { let mut has_failure = false; if status == InflightActivationStatus::Retry { has_failure = true; - pipe.rpush( - self.key_builder - .get_retry_key(hash_key.clone(), activation.id.as_str()) - .build_redis_key(), - activation_id, - ); + let retry_key = self + .key_builder + .get_retry_key(hash_key.clone(), activation.id.as_str()) + .build_redis_key(); + pipe.rpush(retry_key, activation_id); } else if status == InflightActivationStatus::Failure && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter { @@ -737,6 +610,9 @@ impl InnerRedisActivationStore { .build_redis_key(), activation_id, ); + } else if status == InflightActivationStatus::Complete { + self.cleanup_activation(hash_key.clone(), activation.id.as_str()) + .await?; } let processing_key = self .key_builder @@ -889,7 +765,6 @@ impl InnerRedisActivationStore { let mut processing_attempts_exceeded_count: u64 = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { - let mut pipe = redis::pipe(); let processing_key = self .key_builder .get_processing_key_for_iter(hash_key.clone(), bucket_hash.as_str()) @@ -1014,11 +889,6 @@ impl InnerRedisActivationStore { )) } - pub async fn handle_processing_attempts(&self) -> Result { - // No-op - Ok(0) - } - pub async fn handle_expires_at(&self) -> Result { let mut conn = self.pool.get().await?; let mut total_rows_affected = 0; @@ -1064,68 +934,59 @@ impl InnerRedisActivationStore { } pub async fn handle_delay_until(&self) -> Result { - Ok(0) - } - - pub async fn handle_deadletter_tasks(&self) -> Result)>, Error> { - Ok(vec![]) - } - - pub async fn mark_deadletter_completed(&self, ids: Vec) -> Result { - Ok(0) + let mut conn = self.pool.get().await?; + let mut total_rows_affected = 0; + for hash_key in self.hash_keys.iter() { + for bucket_hash in self.bucket_hashes.iter() { + let delay_until_key = self + .key_builder + .get_delay_key_for_iter(hash_key.clone(), bucket_hash.as_str()) + .build_redis_key(); + let activations: Vec = conn + .zrangebyscore( + delay_until_key.clone(), + 0, + Utc::now().timestamp_millis() as isize, + ) + .await?; + if activations.is_empty() { + continue; + } + total_rows_affected += activations.len() as u64; + let mut pipe = redis::pipe(); + pipe.atomic(); + for activation_id in activations.iter() { + let pending_key = self + .key_builder + .get_pending_key(hash_key.clone(), activation_id) + .build_redis_key(); + pipe.rpush(pending_key, activation_id); + pipe.zrem(delay_until_key.clone(), activation_id); + } + let results: Vec = pipe.query_async(&mut *conn).await?; + if results.len() != 2 * activations.len() { + return Err(anyhow::anyhow!( + "Failed to remove expired activations: {}", + delay_until_key + )); + } + } + } + Ok(total_rows_affected) } pub async fn remove_killswitched(&self, killswitched_tasks: Vec) -> Result { + // TODO Ok(0) } pub async fn mark_demoted_completed(&self, ids: Vec) -> Result { + // TODO Ok(0) } - pub async fn move_delay_to_pending(&self) -> Result<(), Error> { - Ok(()) - } - - pub async fn get_processing_deadline_exceeded_activations( - &self, - ) -> Result, Error> { - return Ok(vec![]); - } - - pub async fn get_processing_attempts_for_activation(&self, id: &str) -> Result { - return Ok(0); - } - - pub async fn retry_activation_locally(&self, id: &str) -> Result<(), Error> { - // Increment processing attempts by 1 and push back to pending in transaction - return Ok(()); - } - - pub async fn remove_from_processing(&self, id: &str) -> Result<(), Error> { - // Remove from processing in transaction - return Ok(()); - } - - pub async fn remove_from_pending(&self, id: &str) -> Result<(), Error> { - // Remove from pending in transaction - return Ok(()); - } - - pub async fn remove_from_delay(&self, id: &str) -> Result<(), Error> { - // Remove from delay in transaction - return Ok(()); - } - - pub async fn get_deadletter_activations(&self) -> Result, Error> { - return Ok(vec![]); - } - - pub async fn get_expired_activations(&self) -> Result, Error> { - return Ok(vec![]); - } - pub async fn pending_activation_max_lag(&self, now: &DateTime) -> Result { + // TODO Ok(0) } @@ -1143,7 +1004,7 @@ impl InnerRedisActivationStore { total_count += count; } } - return Ok(total_count); + Ok(total_count) } #[instrument(skip_all)] @@ -1160,7 +1021,7 @@ impl InnerRedisActivationStore { total_count += count; } } - return Ok(total_count); + Ok(total_count) } #[instrument(skip_all)] @@ -1177,7 +1038,7 @@ impl InnerRedisActivationStore { total_count += count; } } - return Ok(total_count); + Ok(total_count) } pub async fn count_retry_activations(&self) -> Result { @@ -1193,7 +1054,7 @@ impl InnerRedisActivationStore { total_count += count; } } - return Ok(total_count); + Ok(total_count) } pub async fn count_deadletter_activations(&self) -> Result { @@ -1209,10 +1070,36 @@ impl InnerRedisActivationStore { total_count += count; } } - return Ok(total_count); + Ok(total_count) + } + + // Only used in testing + pub async fn delete_all_keys(&self) -> Result<(), Error> { + let mut conn = self.pool.get().await?; + let keys: Vec = conn.keys("*").await?; + for key in keys { + conn.del(key).await?; + } + Ok(()) } pub async fn db_size(&self) -> Result { - return Ok(0); + // Not needed + Ok(0) + } + + pub async fn handle_deadletter_tasks(&self) -> Result)>, Error> { + // Not needed + Ok(vec![]) + } + + pub async fn mark_deadletter_completed(&self, ids: Vec) -> Result { + // Not needed + Ok(0) + } + + pub async fn handle_processing_attempts(&self) -> Result { + // Not needed + Ok(0) } } diff --git a/src/store/redis_utils.rs b/src/store/redis_utils.rs index 6983c3c7..01e8e9b1 100644 --- a/src/store/redis_utils.rs +++ b/src/store/redis_utils.rs @@ -1,24 +1,4 @@ -use base64::{Engine as _, engine::general_purpose}; -use thiserror::Error; -use tracing::{error, info, instrument}; -// use deadpool_redis::Pool; -use crate::config::Config; -use crate::store::inflight_activation::{ - InflightActivation, InflightActivationStatus, QueryResult, -}; -use anyhow::Error; -use chrono::{DateTime, Duration, Utc}; use cityhasher; -use deadpool_redis::cluster::{ - Config as RedisClusterConfig, Pool as RedisClusterPool, Runtime as RedisClusterRuntime, -}; -use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; -use futures::future::try_join_all; -use redis::AsyncTypedCommands; -use sentry_protos::taskbroker::v1::OnAttemptsExceeded; -use std::collections::HashMap; -// use std::sync::RwLock; -use tokio::sync::RwLock; pub enum KeyPrefix { Payload, diff --git a/src/upkeep.rs b/src/upkeep.rs index bdc6708c..d3387e75 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -14,18 +14,13 @@ use std::{ sync::Arc, time::{Duration, Instant}, }; -use tokio::{fs, join, select, time}; +use tokio::{join, select, time}; use tonic_health::ServingStatus; use tonic_health::server::HealthReporter; use tracing::{debug, error, info, instrument}; use uuid::Uuid; -use crate::{ - SERVICE_NAME, - config::Config, - runtime_config::RuntimeConfigManager, - store::inflight_activation::{InflightActivationStatus, InflightActivationStore}, -}; +use crate::{SERVICE_NAME, config::Config, runtime_config::RuntimeConfigManager}; /// The upkeep task that periodically performs upkeep /// on the inflight store @@ -149,10 +144,7 @@ pub async fn do_upkeep( async move { let activation = TaskActivation::decode(&inflight.activation as &[u8]).unwrap(); - let act = create_retry_activation(&activation); - println!("act: {:?}", act.retry_state.as_ref().unwrap()); - let serialized = act.encode_to_vec(); - // let serialized = create_retry_activation(&activation).encode_to_vec(); + let serialized = create_retry_activation(&activation).encode_to_vec(); let delivery = producer .send( FutureRecord::<(), Vec>::to(&config.kafka_topic) @@ -176,7 +168,6 @@ pub async fn do_upkeep( |result: Result| match result { Ok(inflight) => Some(inflight), Err(err) => { - println!("retry.publish.failure {:?}", err); error!("retry.publish.failure {}", err); None } @@ -184,15 +175,12 @@ pub async fn do_upkeep( ) .collect(); - println!("to_remove: {:?}", to_remove.len()); // 3. Update retry tasks to complete match store.mark_retry_completed(to_remove).await { Ok(retried_count) => { - println!("retried_count: {:?}", retried_count); result_context.retried = retried_count; } Err(err) => { - println!("failed to mark retry completed: {:?}", err); error!("failed to mark retry completed: {:?}", err); result_context.retried = 0; } @@ -538,14 +526,11 @@ mod tests { use crate::{ config::Config, runtime_config::RuntimeConfigManager, - store::inflight_activation::{ - InflightActivationStatus, InflightActivationStore, InflightActivationStoreConfig, - }, + store::inflight_activation::InflightActivationStatus, store::inflight_redis_activation::{RedisActivationStore, RedisActivationStoreConfig}, test_utils::{ - StatusCount, assert_counts, consume_topic, create_config, create_integration_config, - create_producer, generate_temp_filename, generate_temp_redis_urls, make_activations, - replace_retry_state, reset_topic, + consume_topic, create_config, create_integration_config, create_producer, + generate_temp_redis_urls, make_activations, replace_retry_state, reset_topic, }, upkeep::{create_retry_activation, do_upkeep}, }; @@ -683,10 +668,8 @@ mod tests { record.activation = activation.encode_to_vec(); assert!(store.store(vec![record.clone()]).await.is_ok()); - println!("stored records"); let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing assert!(activation.is_some()); - println!("moved to processing {}", activation.clone().unwrap().id); assert!( store .set_status( @@ -696,7 +679,6 @@ mod tests { .await .is_ok() ); // Move to retry - println!("moved to retry {}", activation.unwrap().id); assert_eq!(store.count_retry_activations().await.unwrap(), 1); @@ -780,13 +762,12 @@ mod tests { record.delay_until = Some(Utc::now() - Duration::from_secs(30)); assert!(store.store(vec![record.clone()]).await.is_ok()); - println!("stored records"); // Move from delay to pending let result_context = do_upkeep( config.clone(), store.clone(), - producer, + producer.clone(), start_time, runtime_config.clone(), &mut last_vacuum, @@ -795,7 +776,6 @@ mod tests { let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing assert!(activation.is_some()); - println!("moved to processing {}", activation.clone().unwrap().id); assert!( store .set_status( @@ -805,7 +785,21 @@ mod tests { .await .is_ok() ); // Move to retry - println!("moved to retry {}", activation.unwrap().id); + + // Activation is queued to be retried, but not retried yet + assert_eq!(store.count_retry_activations().await.unwrap(), 1); + assert_eq!(result_context.retried, 0); + + // Move from retry to pending + let result_context = do_upkeep( + config.clone(), + store.clone(), + producer, + start_time, + runtime_config.clone(), + &mut last_vacuum, + ) + .await; assert_eq!(store.count_retry_activations().await.unwrap(), 0); assert_eq!(result_context.retried, 1); @@ -836,7 +830,7 @@ mod tests { } #[tokio::test] - async fn test_retry_activation_without_retry_is_not_appended_to_kafka() { + async fn test_retry_activation_without_retry_is_appended_to_kafka() { let config = create_integration_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); reset_topic(config.clone()).await; @@ -854,10 +848,8 @@ mod tests { record.activation = activation.encode_to_vec(); assert!(store.store(vec![record.clone()]).await.is_ok()); - println!("stored records"); let activation = store.get_pending_activation(None).await.unwrap(); // Move to processing assert!(activation.is_some()); - println!("moved to processing {}", activation.clone().unwrap().id); assert!( store .set_status( @@ -867,7 +859,6 @@ mod tests { .await .is_ok() ); // Move to retry - println!("moved to retry {}", activation.unwrap().id); assert_eq!(store.count_retry_activations().await.unwrap(), 1); @@ -882,31 +873,10 @@ mod tests { .await; assert_eq!(store.count_retry_activations().await.unwrap(), 0); - assert_eq!(result_context.retried, 0); + assert_eq!(result_context.retried, 1); let messages = consume_topic(config.clone(), config.kafka_topic.as_ref(), 1).await; - assert_eq!(messages.len(), 0); - let activation = &messages[0]; - - // Should spawn a new task - let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); - assert_ne!(activation.id, activation_to_check.id); - // Should increment the attempt counter - assert_eq!(activation.retry_state.as_ref().unwrap().attempts, 2); - - // Retry should retain task and parameters of original task - let activation_to_check = TaskActivation::decode(&record.activation as &[u8]).unwrap(); - assert_eq!(activation.taskname, activation_to_check.taskname); - assert_eq!(activation.namespace, activation_to_check.namespace); - assert_eq!(activation.parameters, activation_to_check.parameters); - // received_at should be set be later than the original activation - assert!( - activation.received_at.unwrap().seconds - > activation_to_check.received_at.unwrap().seconds, - "retry activation should have a later timestamp" - ); - // The delay_until of a retry task should be set to None - assert!(activation.delay.is_none()); + assert_eq!(messages.len(), 1); } #[tokio::test] @@ -1279,6 +1249,7 @@ mod tests { let config = create_config(); let runtime_config = Arc::new(RuntimeConfigManager::new(None).await); let store = create_inflight_store().await; + store.delete_all_keys().await.unwrap(); let producer = create_producer(config.clone()); let start_time = Utc::now(); let mut last_vacuum = Instant::now(); @@ -1340,6 +1311,7 @@ mod tests { } #[tokio::test] + #[ignore = "Needs to be implemented"] async fn test_forward_demoted_namespaces() { // Create runtime config with demoted namespaces let config = create_config(); @@ -1392,6 +1364,7 @@ demoted_namespaces: } #[tokio::test] + #[ignore = "Needs to be implemented"] async fn test_remove_killswitched() { let config = create_config(); let test_yaml = r#" @@ -1434,6 +1407,7 @@ demoted_namespaces: } #[tokio::test] + #[ignore = "Redis doesn't support VACUUM"] async fn test_full_vacuum_on_upkeep() { let raw_config = Config { full_vacuum_on_start: true, From da6be31fc1f1d06892f270f798cfa3bd4087da5b Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 27 Nov 2025 15:12:58 -0500 Subject: [PATCH 11/43] debugging --- src/grpc/server.rs | 1 - src/main.rs | 1 + src/store/inner_redis_activation_store.rs | 21 ++++++++++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/grpc/server.rs b/src/grpc/server.rs index 890be775..90ff8288 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -30,7 +30,6 @@ impl ConsumerService for TaskbrokerServer { .store .get_pending_activation(namespace.as_deref()) .await; - match inflight { Ok(Some(inflight)) => { let now = Utc::now(); diff --git a/src/main.rs b/src/main.rs index 1054fea5..52379b5c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,6 +69,7 @@ async fn main() -> Result<(), Error> { ) .await?, ); + // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { let kafka_client_config = config.kafka_consumer_config(); diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 7ddd2cc7..33243730 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -291,6 +291,7 @@ impl InnerRedisActivationStore { .arg(activation.partition) .arg("namespace") .arg(activation.namespace.clone()); + pipe.expire(lookup_key.clone(), self.payload_ttl_seconds as i64); let result: Vec = pipe.query_async(&mut conn).await?; if result.len() != 2 { @@ -455,11 +456,11 @@ impl InnerRedisActivationStore { ) .await?; if result == 0 { - return Err(anyhow::anyhow!( + // If the activation is already in the processing set, this is not an error. + error!( "Failed to move activation to processing: {} {}", - processing_key, - activation_id - )); + processing_key, activation_id + ); } let result: usize = conn @@ -589,6 +590,7 @@ impl InnerRedisActivationStore { activation.topic.clone(), activation.partition, ); + let mut conn = self.pool.get().await?; let mut pipe = redis::pipe(); pipe.atomic(); @@ -640,11 +642,11 @@ impl InnerRedisActivationStore { } if processing_removed != 1 { - // Removing from processing set - return Err(anyhow::anyhow!( + // If another worker already removed the activation from the processing set, this is not an error. + error!( "Failed to remove activation from processing set: {}", activation_id - )); + ); } Ok(()) } @@ -686,6 +688,7 @@ impl InnerRedisActivationStore { // Since this is a global operation, there is no guarantee that the keys will have the same hash key. // Group the activations by hash key and then remove them in transactions. + // TODO: This is wrong, it should include the bucket hash as well. let mut hash_key_to_activations = HashMap::new(); for activation in activations.iter() { let hash_key = HashKey::new( @@ -1075,11 +1078,15 @@ impl InnerRedisActivationStore { // Only used in testing pub async fn delete_all_keys(&self) -> Result<(), Error> { + error!("deleting all keys"); let mut conn = self.pool.get().await?; let keys: Vec = conn.keys("*").await?; + let mut deleted_keys = 0; for key in keys { conn.del(key).await?; + deleted_keys += 1; } + error!("deleted {:?} keys", deleted_keys); Ok(()) } From 59f93e59b6f2d953351e3193c9170070dfc41d3f Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 1 Dec 2025 14:40:05 -0500 Subject: [PATCH 12/43] logging --- src/store/inflight_redis_activation.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index bbce6c49..871bc68d 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -82,6 +82,7 @@ pub async fn create_redis_pool(urls: Vec) -> Result, + urls: Vec, } // Wraps the InnerRedisActivationStore to manage the locking to avoid the outer code having to handle it. @@ -92,7 +93,7 @@ impl RedisActivationStore { config: RedisActivationStoreConfig, ) -> Result { let replicas = urls.len(); - let pool = create_redis_pool(urls).await?; + let pool = create_redis_pool(urls.clone()).await?; let inner = InnerRedisActivationStore::new( pool, @@ -112,6 +113,7 @@ impl RedisActivationStore { } Ok(Self { inner: RwLock::new(inner.unwrap()), + urls, }) } @@ -228,7 +230,7 @@ impl RedisActivationStore { if result.is_err() { return Err(RedisActivationError::DatabaseOperation { operation: "get_pending_activation".to_string(), - error: (result.err().unwrap()).to_string(), + error: (format!("error: {:?}, urls: {:?}", result.err().unwrap(), self.urls)), }); } let activation = result.unwrap(); From d00c250beb2da3a366137e265affa91e5b6e5e9d Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 1 Dec 2025 14:49:54 -0500 Subject: [PATCH 13/43] use square brackets --- src/store/inflight_redis_activation.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 871bc68d..f9dd075f 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -228,6 +228,7 @@ impl RedisActivationStore { .get_pending_activation(namespace) .await; if result.is_err() { + // error!("error: {:?}, urls: {:?}", result.err().unwrap(), self.urls); return Err(RedisActivationError::DatabaseOperation { operation: "get_pending_activation".to_string(), error: (format!("error: {:?}, urls: {:?}", result.err().unwrap(), self.urls)), From 41c272c6212e8918c91b016bf4482a7bb8283c8a Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 1 Dec 2025 16:20:54 -0500 Subject: [PATCH 14/43] add random load balancer --- src/store/inner_redis_activation_store.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 33243730..8f829982 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -7,6 +7,7 @@ use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Duration, Utc}; use deadpool_redis::Pool; use futures::future::try_join_all; +use rand::Rng; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; @@ -416,7 +417,13 @@ impl InnerRedisActivationStore { ) -> Result, Error> { let mut conn = self.pool.get().await?; let mut activations: Vec = Vec::new(); - for hash_key in self.hash_keys.iter() { + let total_hash_keys = self.hash_keys.len(); + let random_start = rand::thread_rng().gen_range(0..total_hash_keys); + let mut checked = 0; + while checked < total_hash_keys { + let idx = (random_start + checked) % total_hash_keys; + let hash_key = self.hash_keys[idx].clone(); + checked += 1; if namespaces.is_some() && !namespaces.unwrap().contains(&hash_key.namespace) { continue; } From 2dfedb3b1ca90415085b93b3480eddc08de63a95 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 1 Dec 2025 16:34:12 -0500 Subject: [PATCH 15/43] randomize buckets --- src/store/inner_redis_activation_store.rs | 15 +++++------- src/store/redis_utils.rs | 30 +++++++++++++++++++++++ 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 8f829982..75a16218 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -1,13 +1,12 @@ use crate::store::inflight_activation::{ InflightActivation, InflightActivationStatus, QueryResult, }; -use crate::store::redis_utils::{HashKey, KeyBuilder}; +use crate::store::redis_utils::{HashKey, KeyBuilder, RandomStartIterator}; use anyhow::Error; use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Duration, Utc}; use deadpool_redis::Pool; use futures::future::try_join_all; -use rand::Rng; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; @@ -417,17 +416,15 @@ impl InnerRedisActivationStore { ) -> Result, Error> { let mut conn = self.pool.get().await?; let mut activations: Vec = Vec::new(); - let total_hash_keys = self.hash_keys.len(); - let random_start = rand::thread_rng().gen_range(0..total_hash_keys); - let mut checked = 0; - while checked < total_hash_keys { - let idx = (random_start + checked) % total_hash_keys; + let random_iterator = RandomStartIterator::new(self.hash_keys.len()); + for idx in random_iterator { let hash_key = self.hash_keys[idx].clone(); - checked += 1; if namespaces.is_some() && !namespaces.unwrap().contains(&hash_key.namespace) { continue; } - for bucket_hash in self.bucket_hashes.iter() { + let hash_iterator = RandomStartIterator::new(self.bucket_hashes.len()); + for bucket_idx in hash_iterator { + let bucket_hash = self.bucket_hashes[bucket_idx].clone(); // Get the next pending activation let pending_key = self .key_builder diff --git a/src/store/redis_utils.rs b/src/store/redis_utils.rs index 01e8e9b1..6bd7de65 100644 --- a/src/store/redis_utils.rs +++ b/src/store/redis_utils.rs @@ -1,4 +1,5 @@ use cityhasher; +use rand::Rng; pub enum KeyPrefix { Payload, @@ -205,3 +206,32 @@ impl Key { } } } + +pub struct RandomStartIterator { + total_values: usize, + random_start: usize, + current_index: usize, +} + +impl RandomStartIterator { + pub fn new(total_values: usize) -> Self { + Self { + total_values, + random_start: rand::thread_rng().gen_range(0..total_values), + current_index: 0, + } + } +} + +impl Iterator for RandomStartIterator { + type Item = usize; + + fn next(&mut self) -> Option { + if self.current_index >= self.total_values { + return None; + } + self.current_index += 1; + let idx = (self.random_start + self.current_index) % self.total_values; + Some(idx) + } +} From 439ab24c0d4370787610ce4614ad38df5c11e708 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 2 Dec 2025 14:10:18 -0500 Subject: [PATCH 16/43] add metrics --- src/store/inflight_redis_activation.rs | 104 +++++++-- src/store/inner_redis_activation_store.rs | 256 ++++++++++++++++++---- 2 files changed, 300 insertions(+), 60 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index f9dd075f..0d18a29e 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -124,6 +124,7 @@ impl RedisActivationStore { let result = self.inner.read().await.store(batch).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); + error!("Failed to store activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "store".to_string(), error: error_string, @@ -143,9 +144,11 @@ impl RedisActivationStore { pub async fn count_processing_activations(&self) -> Result { let result = self.inner.read().await.count_processing_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to count processing activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "count_processing_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -154,9 +157,11 @@ impl RedisActivationStore { pub async fn count_delayed_activations(&self) -> Result { let result = self.inner.read().await.count_delayed_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to count delayed activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "count_delayed_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -165,9 +170,11 @@ impl RedisActivationStore { pub async fn count_pending_activations(&self) -> Result { let result = self.inner.read().await.count_pending_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to count pending activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "count_pending_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -176,9 +183,11 @@ impl RedisActivationStore { pub async fn count_retry_activations(&self) -> Result { let result = self.inner.read().await.count_retry_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to count retry activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "count_retry_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -187,9 +196,11 @@ impl RedisActivationStore { pub async fn count_deadletter_activations(&self) -> Result { let result = self.inner.read().await.count_deadletter_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to count deadletter activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "count_deadletter_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -198,9 +209,11 @@ impl RedisActivationStore { pub async fn db_size(&self) -> Result { let result = self.inner.read().await.db_size().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to get db size: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "db_size".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -209,9 +222,11 @@ impl RedisActivationStore { pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { let result = self.inner.read().await.delete_all_keys().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to delete all keys: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "delete_all_keys".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(()) @@ -228,17 +243,20 @@ impl RedisActivationStore { .get_pending_activation(namespace) .await; if result.is_err() { - // error!("error: {:?}, urls: {:?}", result.err().unwrap(), self.urls); + let error_string = result.err().unwrap().to_string(); + error!( + "Failed to get pending activation ({:?}): {:?}", + self.urls, error_string + ); return Err(RedisActivationError::DatabaseOperation { operation: "get_pending_activation".to_string(), - error: (format!("error: {:?}, urls: {:?}", result.err().unwrap(), self.urls)), + error: error_string, }); } let activation = result.unwrap(); if activation.is_none() { return Ok(None); } - self.inner.write().await.incr_next_key_idx_for_pending(); Ok(Some(activation.unwrap())) } @@ -254,9 +272,14 @@ impl RedisActivationStore { .get_pending_activations_from_namespaces(namespaces, limit) .await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!( + "Failed to get pending activations from namespaces ({:?}): {:?}", + namespaces, error_string + ); return Err(RedisActivationError::DatabaseOperation { operation: "get_pending_activations_from_namespaces".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -271,12 +294,19 @@ impl RedisActivationStore { .inner .read() .await - .get_by_id(hash_key, activation_id) + .get_by_id(hash_key.clone(), activation_id) .await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!( + "Failed to get by id ({:?}, {:?}): {:?}", + hash_key.clone(), + activation_id, + error_string + ); return Err(RedisActivationError::DatabaseOperation { operation: "get_by_id".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -295,6 +325,10 @@ impl RedisActivationStore { .await; if result.is_err() { let error_string = result.err().unwrap().to_string(); + error!( + "Failed to set status ({:?}, {:?}): {:?}", + activation_id, status, error_string + ); return Err(RedisActivationError::DatabaseOperation { operation: "set_status".to_string(), error: error_string, @@ -308,9 +342,11 @@ impl RedisActivationStore { ) -> Result, RedisActivationError> { let result = self.inner.read().await.get_retry_activations().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to get retry activations: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "get_retry_activations".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -327,9 +363,11 @@ impl RedisActivationStore { .mark_retry_completed(activations) .await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to mark retry completed: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "mark_retry_completed".to_string(), - error: result.err().unwrap().to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -341,6 +379,7 @@ impl RedisActivationStore { let result = self.inner.read().await.handle_processing_deadline().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); + error!("Failed to handle processing deadline: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_processing_deadline".to_string(), error: error_string, @@ -352,9 +391,11 @@ impl RedisActivationStore { pub async fn handle_processing_attempts(&self) -> Result { let result = self.inner.read().await.handle_processing_attempts().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to handle processing attempts: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_processing_attempts".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -363,9 +404,11 @@ impl RedisActivationStore { pub async fn handle_expires_at(&self) -> Result { let result = self.inner.read().await.handle_expires_at().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to handle expires at: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_expires_at".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -374,9 +417,11 @@ impl RedisActivationStore { pub async fn handle_delay_until(&self) -> Result { let result = self.inner.read().await.handle_delay_until().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to handle delay until: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_delay_until".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -387,9 +432,11 @@ impl RedisActivationStore { ) -> Result)>, RedisActivationError> { let result = self.inner.read().await.handle_deadletter_tasks().await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to handle deadletter tasks: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "handle_deadletter_tasks".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -401,9 +448,11 @@ impl RedisActivationStore { ) -> Result { let result = self.inner.read().await.mark_deadletter_completed(ids).await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to mark deadletter completed: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "mark_deadletter_completed".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -420,9 +469,11 @@ impl RedisActivationStore { .remove_killswitched(killswitched_tasks) .await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to remove killswitched: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "remove_killswitched".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -434,9 +485,11 @@ impl RedisActivationStore { ) -> Result { let result = self.inner.read().await.mark_demoted_completed(ids).await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!("Failed to mark demoted completed: {:?}", error_string); return Err(RedisActivationError::DatabaseOperation { operation: "mark_demoted_completed".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) @@ -453,9 +506,14 @@ impl RedisActivationStore { .pending_activation_max_lag(now) .await; if result.is_err() { + let error_string = result.err().unwrap().to_string(); + error!( + "Failed to get pending activation max lag: {:?}", + error_string + ); return Err(RedisActivationError::DatabaseOperation { operation: "pending_activation_max_lag".to_string(), - error: (result.err().unwrap()).to_string(), + error: error_string, }); } Ok(result.unwrap()) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 75a16218..bc97fe27 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -10,6 +10,7 @@ use futures::future::try_join_all; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; +use std::time::Instant; use tracing::{error, info, instrument}; #[derive(Debug)] @@ -22,8 +23,6 @@ pub struct InnerRedisActivationStore { bucket_hashes: Vec, hash_keys: Vec, key_builder: KeyBuilder, - next_key_idx_for_pending: usize, - total_possible_keys: usize, processing_deadline_grace_sec: i64, max_processing_attempts: i32, } @@ -58,8 +57,6 @@ impl InnerRedisActivationStore { hash_keys, payload_ttl_seconds, key_builder: KeyBuilder::new(num_buckets), - next_key_idx_for_pending: 0, - total_possible_keys: 0, processing_deadline_grace_sec: processing_deadline_grace_sec as i64, // Duration expects i64 max_processing_attempts: max_processing_attempts as i32, }) @@ -70,25 +67,35 @@ impl InnerRedisActivationStore { // This assumes that the broker is always consuming from the same topics and only the partitions are changing self.topics.insert(topic.clone(), partitions.clone()); self.hash_keys.clear(); - self.total_possible_keys = 0; + let mut hashkeys = 0; for (topic, partitions) in self.topics.iter() { for partition in partitions.iter() { for namespace in self.namespaces.iter() { self.hash_keys .push(HashKey::new(namespace.clone(), topic.clone(), *partition)); - self.total_possible_keys += self.bucket_hashes.len(); + hashkeys += self.bucket_hashes.len(); } } } info!( - "Rebalanced partitions for topic {}: {:?}: {:?}: total possible keys: {}", - topic, partitions, self.topics, self.total_possible_keys + "Rebalanced partitions for topic {}: {:?}: {:?}: total hashkeys: {}", + topic, partitions, self.topics, hashkeys ); } + pub async fn get_conn(&self) -> Result { + let start_time = Instant::now(); + let conn = self.pool.get().await?; + let conn_duration = start_time.duration_since(start_time); + metrics::histogram!("redis_store.conn_duration").record(conn_duration.as_millis() as f64); + Ok(conn) + } + + #[instrument(skip_all)] pub async fn store(&self, batch: Vec) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; let mut rows_affected: u64 = 0; + let start_time = Instant::now(); for activation in batch { let payload_key = self .key_builder @@ -314,6 +321,10 @@ impl InnerRedisActivationStore { } rows_affected += 1; } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.store_duration").record(duration.as_millis() as f64); + metrics::counter!("redis_store.store_count").increment(rows_affected); Ok(QueryResult { rows_affected }) } @@ -322,7 +333,8 @@ impl InnerRedisActivationStore { hashkey: HashKey, activation_id: &str, ) -> Result<(), Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let payload_key = self .key_builder .get_payload_key(hashkey, activation_id) @@ -345,6 +357,9 @@ impl InnerRedisActivationStore { id_lookup_key.clone() )); } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.cleanup_duration").record(duration.as_millis() as f64); Ok(()) } /// Discard an activation. If the activation is at_most_once, remove the payloads. @@ -357,6 +372,7 @@ impl InnerRedisActivationStore { // If the activation is not found, return a no-op. // If the activation is at_most_once, discard the activation and remove the payloads. // If it has deadletter configured, move it to the deadletter queue and keep the payloads. + let start_time = Instant::now(); let fields = self .get_fields_by_id( hashkey.clone(), @@ -371,7 +387,7 @@ impl InnerRedisActivationStore { let on_attempts_exceeded = OnAttemptsExceeded::from_str_name(fields.get("on_attempts_exceeded").unwrap().as_str()) .unwrap(); - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; if !at_most_once && on_attempts_exceeded == OnAttemptsExceeded::Deadletter { let deadletter_key = self .key_builder @@ -384,9 +400,17 @@ impl InnerRedisActivationStore { deadletter_key.clone() )); } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.discard_activation_duration", "deadletter" => "true") + .record(duration.as_millis() as f64); return Ok(()); } self.cleanup_activation(hashkey, activation_id).await?; + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.cleanup_activation_duration", "deadletter" => "false") + .record(duration.as_millis() as f64); Ok(()) } @@ -414,30 +438,47 @@ impl InnerRedisActivationStore { namespaces: Option<&[String]>, limit: Option, ) -> Result, Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut activations: Vec = Vec::new(); let random_iterator = RandomStartIterator::new(self.hash_keys.len()); + let mut buckets_checked = 0; + let mut hashes_checked = 0; for idx in random_iterator { + hashes_checked += 1; let hash_key = self.hash_keys[idx].clone(); if namespaces.is_some() && !namespaces.unwrap().contains(&hash_key.namespace) { + metrics::counter!( + "redis_store.get_pending_activations_from_namespaces.namespace_not_found" + ) + .increment(1); continue; } let hash_iterator = RandomStartIterator::new(self.bucket_hashes.len()); for bucket_idx in hash_iterator { let bucket_hash = self.bucket_hashes[bucket_idx].clone(); + buckets_checked += 1; // Get the next pending activation + let get_by_id_start_time = Instant::now(); let pending_key = self .key_builder .get_pending_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); let result = conn.lindex(pending_key.clone(), 0).await?; if result.is_none() { + let get_by_id_duration = + get_by_id_start_time.duration_since(get_by_id_start_time); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.process_activation.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); continue; } let activation_id: String = result.unwrap().to_string(); let act_result = self.get_by_id(hash_key.clone(), &activation_id).await?; if act_result.is_none() { + let get_by_id_duration = + get_by_id_start_time.duration_since(get_by_id_start_time); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.process_activation.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.process_activation.not_found").increment(1); continue; } let activation = act_result.unwrap(); @@ -465,51 +506,75 @@ impl InnerRedisActivationStore { "Failed to move activation to processing: {} {}", processing_key, activation_id ); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_moved_to_processing").increment(1); } let result: usize = conn .lrem(pending_key.clone(), 1, activation_id.clone()) .await?; if result == 0 { - info!( + error!( "Attempted to lrem an activation from pending queue, but it was not found: {} {}", pending_key, activation_id ); - metrics::counter!("inflight_redis_activation_store_lrem_not_found") + metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_removed_from_pending") .increment(1); } + let get_by_id_duration = get_by_id_start_time.duration_since(get_by_id_start_time); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration", "result" => "true").record(get_by_id_duration.as_millis() as f64); activations.push(activation); if activations.len() >= limit.unwrap() as usize { + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!( + "redis_store.get_pending_activations_from_namespaces.duration" + ) + .record(duration.as_millis() as f64); + metrics::counter!( + "redis_store.get_pending_activations_from_namespaces.buckets_checked" + ) + .increment(buckets_checked); + metrics::counter!( + "redis_store.get_pending_activations_from_namespaces.hashes_checked" + ) + .increment(hashes_checked); return Ok(activations); } } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.duration") + .record(duration.as_millis() as f64); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.buckets_checked") + .increment(buckets_checked); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.hashes_checked") + .increment(hashes_checked); Ok(activations) } - pub fn incr_next_key_idx_for_pending(&mut self) { - self.next_key_idx_for_pending += 1; - if self.next_key_idx_for_pending >= self.total_possible_keys { - self.next_key_idx_for_pending = 0; - } - } - /// Get an activation by id. Primarily used for testing pub async fn get_by_id( &self, hash_key: HashKey, activation_id: &str, ) -> Result, Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let payload_key = self .key_builder .get_payload_key(hash_key, activation_id) .build_redis_key(); let result: HashMap = conn.hgetall(payload_key.clone()).await?; if result.is_empty() { + metrics::counter!("redis_store.get_by_id", "result" => "false").increment(1); return Ok(None); } let activation: InflightActivation = result.into(); + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_by_id_duration").record(duration.as_millis() as f64); + metrics::counter!("redis_store.get_by_id", "result" => "true").increment(1); Ok(Some(activation)) } @@ -528,7 +593,8 @@ impl InnerRedisActivationStore { } pub async fn get_hashkey_by_id(&self, activation_id: &str) -> Result, Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let result: HashMap = conn .hgetall( self.key_builder @@ -537,8 +603,18 @@ impl InnerRedisActivationStore { ) .await?; if result.is_empty() { + metrics::counter!("redis_store.get_hashkey_by_id", "result" => "false").increment(1); + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_hashkey_by_id_duration") + .record(duration.as_millis() as f64); return Ok(None); } + metrics::counter!("redis_store.get_hashkey_by_id", "result" => "true").increment(1); + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_hashkey_by_id_duration") + .record(duration.as_millis() as f64); Ok(Some(HashKey::new( result.get("namespace").unwrap().to_string(), result.get("topic").unwrap().to_string(), @@ -552,7 +628,8 @@ impl InnerRedisActivationStore { activation_id: &str, fields: &[&str], ) -> Result, Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let payload_key = self .key_builder .get_payload_key(hash_key, activation_id) @@ -571,6 +648,10 @@ impl InnerRedisActivationStore { fields_map.insert(arg_name.to_string(), values[idx].clone()); } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_fields_by_id_duration") + .record(duration.as_millis() as f64); Ok(fields_map) } @@ -580,12 +661,14 @@ impl InnerRedisActivationStore { status: InflightActivationStatus, ) -> Result<(), Error> { // If the activation is not found, return a no-op + let start_time = Instant::now(); let activation = self.get_by_id_lookup(activation_id).await?; if activation.is_none() { - info!( + error!( "Activation not found for id: {}, skipping status update", activation_id ); + metrics::counter!("redis_store.set_status.activation_not_found").increment(1); return Ok(()); } let activation = activation.unwrap(); @@ -595,7 +678,7 @@ impl InnerRedisActivationStore { activation.partition, ); - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; let mut pipe = redis::pipe(); pipe.atomic(); let mut has_failure = false; @@ -651,12 +734,18 @@ impl InnerRedisActivationStore { "Failed to remove activation from processing set: {}", activation_id ); + metrics::counter!("redis_store.set_status.already_removed_from_processing") + .increment(1); } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.set_status.duration").record(duration.as_millis() as f64); Ok(()) } pub async fn get_retry_activations(&self) -> Result, Error> { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut activation_ids: Vec<(HashKey, String)> = Vec::new(); for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -671,13 +760,34 @@ impl InnerRedisActivationStore { ); } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_retry_activations.retry_loop.duration") + .record(duration.as_millis() as f64); + metrics::counter!("redis_store.get_retry_activations.retry_loop.activations_found") + .increment(activation_ids.len() as u64); + if activation_ids.is_empty() { + return Ok(Vec::new()); + } + let get_by_id_start_time = Instant::now(); let activations = try_join_all( activation_ids .iter() .map(|(hashkey, id)| self.get_by_id(hashkey.clone(), id)), ) .await?; + let end_time = Instant::now(); + let duration = end_time.duration_since(get_by_id_start_time); + metrics::histogram!("redis_store.get_retry_activations.get_by_id_duration") + .record(duration.as_millis() as f64); + metrics::counter!("redis_store.get_retry_activations.get_by_id.activations_found") + .increment(activations.len() as u64); + metrics::counter!("redis_store.get_retry_activations.get_by_id.activations_not_found") + .increment((activation_ids.len() - activations.len()) as u64); + let total_duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_retry_activations.total_duration") + .record(total_duration.as_millis() as f64); Ok(activations.into_iter().flatten().collect()) } @@ -688,7 +798,8 @@ impl InnerRedisActivationStore { if activations.is_empty() { return Ok(0); } - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); // Since this is a global operation, there is no guarantee that the keys will have the same hash key. // Group the activations by hash key and then remove them in transactions. @@ -757,6 +868,12 @@ impl InnerRedisActivationStore { deleted_count[0] ); } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.mark_retry_completed.duration") + .record(duration.as_millis() as f64); + metrics::counter!("redis_store.mark_retry_completed.rows_affected") + .increment(rows_affected); Ok(rows_affected) } @@ -766,10 +883,11 @@ impl InnerRedisActivationStore { // there are no retries, as the worker will reject the activation due to idempotency keys. // If the task has processing attempts remaining, it is moved back to pending with attempts += 1 // Otherwise it is either discarded or moved to retry/deadletter. - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; let mut total_rows_affected: u64 = 0; let mut discarded_count: u64 = 0; let mut processing_attempts_exceeded_count: u64 = 0; + let start_time = Instant::now(); for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self @@ -789,6 +907,7 @@ impl InnerRedisActivationStore { } total_rows_affected += activations.len() as u64; for activation_id in activations.iter() { + let single_activation_start_time = Instant::now(); let fields = self .get_fields_by_id( hash_key.clone(), @@ -801,6 +920,9 @@ impl InnerRedisActivationStore { "Failed to get payload for activation past processing deadline: {}", activation_id ); + let single_activation_duration = single_activation_start_time + .duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); continue; } let at_most_once = fields @@ -819,6 +941,9 @@ impl InnerRedisActivationStore { self.discard_activation(hash_key.clone(), activation_id) .await?; discarded_count += 1; + let single_activation_duration = single_activation_start_time + .duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "at_most_once").record(single_activation_duration.as_millis() as f64); continue; } let processing_attempts = fields @@ -839,6 +964,9 @@ impl InnerRedisActivationStore { self.discard_activation(hash_key.clone(), activation_id) .await?; discarded_count += 1; + let single_activation_duration = single_activation_start_time + .duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "processing_attempts_exceeded").record(single_activation_duration.as_millis() as f64); continue; } // Move back to pending @@ -860,6 +988,9 @@ impl InnerRedisActivationStore { pipe.rpush(pending_key, activation_id); pipe.zrem(processing_key.clone(), activation_id); let results: Vec = pipe.query_async(&mut *conn).await?; + let single_activation_duration = + single_activation_start_time.duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "moved_to_pending").record(single_activation_duration.as_millis() as f64); if results.len() != 3 { return Err(anyhow::anyhow!( "Failed to move activation back to pending: incorrect number of commands run: expected 3, got {} for key {}", @@ -889,6 +1020,10 @@ impl InnerRedisActivationStore { } } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.handle_processing_deadline.total_duration") + .record(duration.as_millis() as f64); Ok(( total_rows_affected, discarded_count, @@ -897,10 +1032,12 @@ impl InnerRedisActivationStore { } pub async fn handle_expires_at(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_rows_affected = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { + let single_bucket_start_time = Instant::now(); let expires_at_key = self .key_builder .get_expired_key_for_iter(hash_key.clone(), bucket_hash.as_str()) @@ -929,6 +1066,10 @@ impl InnerRedisActivationStore { .await?; } let results: Vec = pipe.query_async(&mut *conn).await?; + let single_bucket_duration = + single_bucket_start_time.duration_since(single_bucket_start_time); + metrics::histogram!("redis_store.handle_expires_at.single_bucket.duration") + .record(single_bucket_duration.as_millis() as f64); if results.len() != 2 * activations.len() { return Err(anyhow::anyhow!( "Failed to remove expired activations: {}", @@ -937,14 +1078,20 @@ impl InnerRedisActivationStore { } } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.handle_expires_at.total_duration") + .record(duration.as_millis() as f64); Ok(total_rows_affected) } pub async fn handle_delay_until(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_rows_affected = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { + let single_bucket_start_time = Instant::now(); let delay_until_key = self .key_builder .get_delay_key_for_iter(hash_key.clone(), bucket_hash.as_str()) @@ -957,6 +1104,9 @@ impl InnerRedisActivationStore { ) .await?; if activations.is_empty() { + let single_bucket_duration = + single_bucket_start_time.duration_since(single_bucket_start_time); + metrics::histogram!("redis_store.handle_delay_until.single_bucket.duration", "result" => "no_activations").record(single_bucket_duration.as_millis() as f64); continue; } total_rows_affected += activations.len() as u64; @@ -971,6 +1121,9 @@ impl InnerRedisActivationStore { pipe.zrem(delay_until_key.clone(), activation_id); } let results: Vec = pipe.query_async(&mut *conn).await?; + let single_bucket_duration = + single_bucket_start_time.duration_since(single_bucket_start_time); + metrics::histogram!("redis_store.handle_delay_until.single_bucket.duration", "result" => "removed_activations").record(single_bucket_duration.as_millis() as f64); if results.len() != 2 * activations.len() { return Err(anyhow::anyhow!( "Failed to remove expired activations: {}", @@ -979,6 +1132,10 @@ impl InnerRedisActivationStore { } } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.handle_delay_until.total_duration") + .record(duration.as_millis() as f64); Ok(total_rows_affected) } @@ -999,7 +1156,8 @@ impl InnerRedisActivationStore { #[instrument(skip_all)] pub async fn count_pending_activations(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_count = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -1011,12 +1169,17 @@ impl InnerRedisActivationStore { total_count += count; } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.count_pending_activations.total_duration") + .record(duration.as_millis() as f64); Ok(total_count) } #[instrument(skip_all)] pub async fn count_delayed_activations(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_count = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -1028,12 +1191,17 @@ impl InnerRedisActivationStore { total_count += count; } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.count_delayed_activations.total_duration") + .record(duration.as_millis() as f64); Ok(total_count) } #[instrument(skip_all)] pub async fn count_processing_activations(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_count = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -1045,11 +1213,16 @@ impl InnerRedisActivationStore { total_count += count; } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.count_processing_activations.total_duration") + .record(duration.as_millis() as f64); Ok(total_count) } pub async fn count_retry_activations(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_count = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -1061,11 +1234,16 @@ impl InnerRedisActivationStore { total_count += count; } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.count_retry_activations.total_duration") + .record(duration.as_millis() as f64); Ok(total_count) } pub async fn count_deadletter_activations(&self) -> Result { - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; + let start_time = Instant::now(); let mut total_count = 0; for hash_key in self.hash_keys.iter() { for bucket_hash in self.bucket_hashes.iter() { @@ -1077,13 +1255,17 @@ impl InnerRedisActivationStore { total_count += count; } } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.count_deadletter_activations.total_duration") + .record(duration.as_millis() as f64); Ok(total_count) } // Only used in testing pub async fn delete_all_keys(&self) -> Result<(), Error> { error!("deleting all keys"); - let mut conn = self.pool.get().await?; + let mut conn = self.get_conn().await?; let keys: Vec = conn.keys("*").await?; let mut deleted_keys = 0; for key in keys { From fd99b43863d92ec0b111601386e2b26236d44b1f Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 2 Dec 2025 14:39:58 -0500 Subject: [PATCH 17/43] add complete metrics --- src/store/inner_redis_activation_store.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index bc97fe27..e5e3f15f 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -693,6 +693,7 @@ impl InnerRedisActivationStore { && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter { has_failure = true; + metrics::counter!("redis_store.set_status", "status" => "deadletter").increment(1); pipe.rpush( self.key_builder .get_deadletter_key(hash_key.clone(), activation.id.as_str()) @@ -739,7 +740,8 @@ impl InnerRedisActivationStore { } let end_time = Instant::now(); let duration = end_time.duration_since(start_time); - metrics::histogram!("redis_store.set_status.duration").record(duration.as_millis() as f64); + metrics::histogram!("redis_store.set_status.duration", "status" => format!("{:?}", status)) + .record(duration.as_millis() as f64); Ok(()) } From fa4aaa2fd5ad1a74fb944a21a8b282fdf6247dbc Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 2 Dec 2025 16:38:52 -0500 Subject: [PATCH 18/43] histogram --- src/store/inner_redis_activation_store.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index e5e3f15f..37ee57d4 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -530,14 +530,14 @@ impl InnerRedisActivationStore { "redis_store.get_pending_activations_from_namespaces.duration" ) .record(duration.as_millis() as f64); - metrics::counter!( + metrics::histogram!( "redis_store.get_pending_activations_from_namespaces.buckets_checked" ) - .increment(buckets_checked); - metrics::counter!( + .record(buckets_checked as f64); + metrics::histogram!( "redis_store.get_pending_activations_from_namespaces.hashes_checked" ) - .increment(hashes_checked); + .record(hashes_checked as f64); return Ok(activations); } } @@ -693,7 +693,6 @@ impl InnerRedisActivationStore { && activation.on_attempts_exceeded == OnAttemptsExceeded::Deadletter { has_failure = true; - metrics::counter!("redis_store.set_status", "status" => "deadletter").increment(1); pipe.rpush( self.key_builder .get_deadletter_key(hash_key.clone(), activation.id.as_str()) From d4ab0599d48d91ee1b3e866d2f88d15d2a4ba843 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 2 Dec 2025 16:55:42 -0500 Subject: [PATCH 19/43] fix --- src/store/inner_redis_activation_store.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 37ee57d4..bcd964c7 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -468,7 +468,7 @@ impl InnerRedisActivationStore { if result.is_none() { let get_by_id_duration = get_by_id_start_time.duration_since(get_by_id_start_time); - metrics::histogram!("redis_store.get_pending_activations_from_namespaces.process_activation.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); continue; } let activation_id: String = result.unwrap().to_string(); @@ -477,8 +477,8 @@ impl InnerRedisActivationStore { if act_result.is_none() { let get_by_id_duration = get_by_id_start_time.duration_since(get_by_id_start_time); - metrics::histogram!("redis_store.get_pending_activations_from_namespaces.process_activation.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); - metrics::counter!("redis_store.get_pending_activations_from_namespaces.process_activation.not_found").increment(1); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.not_found").increment(1); continue; } let activation = act_result.unwrap(); From ee898459541ee175a9b06ee81e2b103a0383e9e0 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 2 Dec 2025 17:10:17 -0500 Subject: [PATCH 20/43] cleanup --- src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.rs b/src/main.rs index 52379b5c..a04f1195 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,6 +69,8 @@ async fn main() -> Result<(), Error> { ) .await?, ); + // TODO: REMOVE THIS + redis_store.delete_all_keys().await?; // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { From 610043665a46aea84f5b09384395c390b4c7fcb1 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 11:15:52 -0500 Subject: [PATCH 21/43] add logging --- src/kafka/inflight_activation_writer.rs | 5 +++-- src/upkeep.rs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index 3fdb0080..de722f67 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -79,7 +79,7 @@ impl Reducer for InflightActivationWriter { self.batch.take(); return Ok(Some(())); } - + error!("Writing batch of length: {}", batch.len()); // Check if writing the batch would exceed the limits let exceeded_pending_limit = self .store @@ -144,7 +144,7 @@ impl Reducer for InflightActivationWriter { "reason" => reason, ) .increment(1); - + error!("Backpressure triggered: {}", reason); return Ok(None); } @@ -153,6 +153,7 @@ impl Reducer for InflightActivationWriter { let res = self.store.store(batch.clone()).await; match res { Ok(res) => { + error!("Wrote batch of length: {} to store", batch.len()); self.batch.take(); let lag = Utc::now() - batch diff --git a/src/upkeep.rs b/src/upkeep.rs index d3387e75..3a6feec2 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -411,7 +411,7 @@ pub async fn do_upkeep( ); } metrics::histogram!("upkeep.duration").record(upkeep_start.elapsed()); - + error!("Pending count: {}", result_context.pending); // Task statuses metrics::counter!("upkeep.task.state_transition", "state" => "completed") .increment(result_context.completed); From 6adb3fddda1401f35aabced040c039728de198db Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 11:39:34 -0500 Subject: [PATCH 22/43] try removing write lock --- src/store/inflight_redis_activation.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 0d18a29e..44a62cdc 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -135,10 +135,11 @@ impl RedisActivationStore { // Called when rebalancing partitions pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { - self.inner - .write() - .await - .rebalance_partitions(topic, partitions); + error!("Rebalancing partitions: {:?}", (topic, partitions)); + // self.inner + // .write() + // .await + // .rebalance_partitions(topic, partitions); } pub async fn count_processing_activations(&self) -> Result { From 723b8c025859b64a9874665afaa79c20fcb4e918 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 11:51:32 -0500 Subject: [PATCH 23/43] add logging --- src/store/inflight_redis_activation.rs | 106 +++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 5 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 44a62cdc..eadbaf2d 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -121,6 +121,10 @@ impl RedisActivationStore { &self, batch: Vec, ) -> Result { + error!( + "Trying to acquire read lock in store {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.store(batch).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -135,14 +139,22 @@ impl RedisActivationStore { // Called when rebalancing partitions pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { - error!("Rebalancing partitions: {:?}", (topic, partitions)); - // self.inner - // .write() - // .await - // .rebalance_partitions(topic, partitions); + error!( + "Rebalancing partitions: {:?}", + (topic.clone(), partitions.clone()) + ); + self.inner + .write() + .await + .rebalance_partitions(topic, partitions); + error!("Rebalanced partitions"); } pub async fn count_processing_activations(&self) -> Result { + error!( + "Trying to acquire read lock in count_processing_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.count_processing_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -156,6 +168,10 @@ impl RedisActivationStore { } pub async fn count_delayed_activations(&self) -> Result { + error!( + "Trying to acquire read lock in count_delayed_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.count_delayed_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -169,6 +185,10 @@ impl RedisActivationStore { } pub async fn count_pending_activations(&self) -> Result { + error!( + "Trying to acquire read lock in count_pending_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.count_pending_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -182,6 +202,10 @@ impl RedisActivationStore { } pub async fn count_retry_activations(&self) -> Result { + error!( + "Trying to acquire read lock in count_retry_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.count_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -195,6 +219,10 @@ impl RedisActivationStore { } pub async fn count_deadletter_activations(&self) -> Result { + error!( + "Trying to acquire read lock in count_deadletter_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.count_deadletter_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -208,6 +236,10 @@ impl RedisActivationStore { } pub async fn db_size(&self) -> Result { + error!( + "Trying to acquire read lock in db_size {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.db_size().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -221,6 +253,10 @@ impl RedisActivationStore { } pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { + error!( + "Trying to acquire read lock in delete_all_keys {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.delete_all_keys().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -237,6 +273,10 @@ impl RedisActivationStore { &self, namespace: Option<&str>, ) -> Result, RedisActivationError> { + error!( + "Trying to acquire read lock in get_pending_activation {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -266,6 +306,10 @@ impl RedisActivationStore { namespaces: Option<&[String]>, limit: Option, ) -> Result, RedisActivationError> { + error!( + "Trying to acquire read lock in get_pending_activations_from_namespaces {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -291,6 +335,10 @@ impl RedisActivationStore { hash_key: HashKey, activation_id: &str, ) -> Result, RedisActivationError> { + error!( + "Trying to acquire read lock in get_by_id {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -318,6 +366,10 @@ impl RedisActivationStore { activation_id: &str, status: InflightActivationStatus, ) -> Result<(), RedisActivationError> { + error!( + "Trying to acquire read lock in set_status {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -341,6 +393,10 @@ impl RedisActivationStore { pub async fn get_retry_activations( &self, ) -> Result, RedisActivationError> { + error!( + "Trying to acquire read lock in get_retry_activations {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.get_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -357,6 +413,10 @@ impl RedisActivationStore { &self, activations: Vec, ) -> Result { + error!( + "Trying to acquire read lock in mark_retry_completed {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -377,6 +437,10 @@ impl RedisActivationStore { pub async fn handle_processing_deadline( &self, ) -> Result<(u64, u64, u64), RedisActivationError> { + error!( + "Trying to acquire read lock in handle_processing_deadline {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.handle_processing_deadline().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -390,6 +454,10 @@ impl RedisActivationStore { } pub async fn handle_processing_attempts(&self) -> Result { + error!( + "Trying to acquire read lock in handle_processing_attempts {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.handle_processing_attempts().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -403,6 +471,10 @@ impl RedisActivationStore { } pub async fn handle_expires_at(&self) -> Result { + error!( + "Trying to acquire read lock in handle_expires_at {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.handle_expires_at().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -416,6 +488,10 @@ impl RedisActivationStore { } pub async fn handle_delay_until(&self) -> Result { + error!( + "Trying to acquire read lock in handle_delay_until {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.handle_delay_until().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -431,6 +507,10 @@ impl RedisActivationStore { pub async fn handle_deadletter_tasks( &self, ) -> Result)>, RedisActivationError> { + error!( + "Trying to acquire read lock in handle_deadletter_tasks {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.handle_deadletter_tasks().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -447,6 +527,10 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { + error!( + "Trying to acquire read lock in mark_deadletter_completed {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.mark_deadletter_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -463,6 +547,10 @@ impl RedisActivationStore { &self, killswitched_tasks: Vec, ) -> Result { + error!( + "Trying to acquire read lock in remove_killswitched {:?}", + self.inner.try_read() + ); let result = self .inner .read() @@ -484,6 +572,10 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { + error!( + "Trying to acquire read lock in mark_demoted_completed {:?}", + self.inner.try_read() + ); let result = self.inner.read().await.mark_demoted_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -500,6 +592,10 @@ impl RedisActivationStore { &self, now: &DateTime, ) -> Result { + error!( + "Trying to acquire read lock in pending_activation_max_lag {:?}", + self.inner.try_read() + ); let result = self .inner .read() From 2deca56a3986c9de19f9e4d119510be45e2b59e0 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 12:11:14 -0500 Subject: [PATCH 24/43] roll back logging --- src/store/inflight_redis_activation.rs | 187 +++++++++++-------------- 1 file changed, 82 insertions(+), 105 deletions(-) diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index eadbaf2d..5d97b192 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -117,14 +117,26 @@ impl RedisActivationStore { }) } + // Called when rebalancing partitions + pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { + error!( + "Rebalancing partitions: {:?}", + (topic.clone(), partitions.clone()) + ); + self.inner + .write() + .await + .rebalance_partitions(topic, partitions); + error!("Rebalanced partitions"); + } + pub async fn store( &self, batch: Vec, ) -> Result { - error!( - "Trying to acquire read lock in store {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in store"); + } let result = self.inner.read().await.store(batch).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -137,24 +149,10 @@ impl RedisActivationStore { Ok(result.unwrap()) } - // Called when rebalancing partitions - pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { - error!( - "Rebalancing partitions: {:?}", - (topic.clone(), partitions.clone()) - ); - self.inner - .write() - .await - .rebalance_partitions(topic, partitions); - error!("Rebalanced partitions"); - } - pub async fn count_processing_activations(&self) -> Result { - error!( - "Trying to acquire read lock in count_processing_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in count_processing_activations"); + } let result = self.inner.read().await.count_processing_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -168,10 +166,9 @@ impl RedisActivationStore { } pub async fn count_delayed_activations(&self) -> Result { - error!( - "Trying to acquire read lock in count_delayed_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in count_delayed_activations"); + } let result = self.inner.read().await.count_delayed_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -185,10 +182,9 @@ impl RedisActivationStore { } pub async fn count_pending_activations(&self) -> Result { - error!( - "Trying to acquire read lock in count_pending_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in count_pending_activations"); + } let result = self.inner.read().await.count_pending_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -202,10 +198,9 @@ impl RedisActivationStore { } pub async fn count_retry_activations(&self) -> Result { - error!( - "Trying to acquire read lock in count_retry_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in count_retry_activations"); + } let result = self.inner.read().await.count_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -219,10 +214,9 @@ impl RedisActivationStore { } pub async fn count_deadletter_activations(&self) -> Result { - error!( - "Trying to acquire read lock in count_deadletter_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in count_deadletter_activations"); + } let result = self.inner.read().await.count_deadletter_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -236,10 +230,9 @@ impl RedisActivationStore { } pub async fn db_size(&self) -> Result { - error!( - "Trying to acquire read lock in db_size {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in db_size"); + } let result = self.inner.read().await.db_size().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -253,10 +246,9 @@ impl RedisActivationStore { } pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { - error!( - "Trying to acquire read lock in delete_all_keys {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in delete_all_keys"); + } let result = self.inner.read().await.delete_all_keys().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -273,10 +265,9 @@ impl RedisActivationStore { &self, namespace: Option<&str>, ) -> Result, RedisActivationError> { - error!( - "Trying to acquire read lock in get_pending_activation {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in get_pending_activation"); + } let result = self .inner .read() @@ -306,10 +297,9 @@ impl RedisActivationStore { namespaces: Option<&[String]>, limit: Option, ) -> Result, RedisActivationError> { - error!( - "Trying to acquire read lock in get_pending_activations_from_namespaces {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in get_pending_activations_from_namespaces"); + } let result = self .inner .read() @@ -335,10 +325,9 @@ impl RedisActivationStore { hash_key: HashKey, activation_id: &str, ) -> Result, RedisActivationError> { - error!( - "Trying to acquire read lock in get_by_id {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in get_by_id"); + } let result = self .inner .read() @@ -366,10 +355,9 @@ impl RedisActivationStore { activation_id: &str, status: InflightActivationStatus, ) -> Result<(), RedisActivationError> { - error!( - "Trying to acquire read lock in set_status {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in set_status"); + } let result = self .inner .read() @@ -393,10 +381,9 @@ impl RedisActivationStore { pub async fn get_retry_activations( &self, ) -> Result, RedisActivationError> { - error!( - "Trying to acquire read lock in get_retry_activations {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in get_retry_activations"); + } let result = self.inner.read().await.get_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -413,10 +400,9 @@ impl RedisActivationStore { &self, activations: Vec, ) -> Result { - error!( - "Trying to acquire read lock in mark_retry_completed {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in mark_retry_completed"); + } let result = self .inner .read() @@ -437,10 +423,9 @@ impl RedisActivationStore { pub async fn handle_processing_deadline( &self, ) -> Result<(u64, u64, u64), RedisActivationError> { - error!( - "Trying to acquire read lock in handle_processing_deadline {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in handle_processing_deadline"); + } let result = self.inner.read().await.handle_processing_deadline().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -454,10 +439,9 @@ impl RedisActivationStore { } pub async fn handle_processing_attempts(&self) -> Result { - error!( - "Trying to acquire read lock in handle_processing_attempts {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in handle_processing_attempts"); + } let result = self.inner.read().await.handle_processing_attempts().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -471,10 +455,9 @@ impl RedisActivationStore { } pub async fn handle_expires_at(&self) -> Result { - error!( - "Trying to acquire read lock in handle_expires_at {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in handle_expires_at"); + } let result = self.inner.read().await.handle_expires_at().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -488,10 +471,9 @@ impl RedisActivationStore { } pub async fn handle_delay_until(&self) -> Result { - error!( - "Trying to acquire read lock in handle_delay_until {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in handle_delay_until"); + } let result = self.inner.read().await.handle_delay_until().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -507,10 +489,9 @@ impl RedisActivationStore { pub async fn handle_deadletter_tasks( &self, ) -> Result)>, RedisActivationError> { - error!( - "Trying to acquire read lock in handle_deadletter_tasks {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in handle_deadletter_tasks"); + } let result = self.inner.read().await.handle_deadletter_tasks().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -527,10 +508,9 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { - error!( - "Trying to acquire read lock in mark_deadletter_completed {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in mark_deadletter_completed"); + } let result = self.inner.read().await.mark_deadletter_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -547,10 +527,9 @@ impl RedisActivationStore { &self, killswitched_tasks: Vec, ) -> Result { - error!( - "Trying to acquire read lock in remove_killswitched {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in remove_killswitched"); + } let result = self .inner .read() @@ -572,10 +551,9 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { - error!( - "Trying to acquire read lock in mark_demoted_completed {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in mark_demoted_completed"); + } let result = self.inner.read().await.mark_demoted_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); @@ -592,10 +570,9 @@ impl RedisActivationStore { &self, now: &DateTime, ) -> Result { - error!( - "Trying to acquire read lock in pending_activation_max_lag {:?}", - self.inner.try_read() - ); + if self.inner.try_read().is_err() { + error!("Failed to acquire read lock in pending_activation_max_lag"); + } let result = self .inner .read() From 843f7a643710e963ed15c0f6675d27d642eb8d3d Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 12:12:23 -0500 Subject: [PATCH 25/43] don't delete --- src/main.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index a04f1195..52379b5c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,8 +69,6 @@ async fn main() -> Result<(), Error> { ) .await?, ); - // TODO: REMOVE THIS - redis_store.delete_all_keys().await?; // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { From 2337d802732d5a2c3688fce6108f602156e84e83 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 15:25:53 -0500 Subject: [PATCH 26/43] refactor locks --- src/kafka/consumer.rs | 2 +- src/store/inflight_redis_activation.rs | 156 ++++------------------ src/store/inner_redis_activation_store.rs | 64 +++++---- 3 files changed, 67 insertions(+), 155 deletions(-) diff --git a/src/kafka/consumer.rs b/src/kafka/consumer.rs index 597e4448..ffd725e3 100644 --- a/src/kafka/consumer.rs +++ b/src/kafka/consumer.rs @@ -390,7 +390,7 @@ pub async fn handle_events( } } for (topic, partitions) in topics.iter() { - redis_store.rebalance_partitions(topic.clone(), partitions.clone()).await; + redis_store.rebalance_partitions(topic.clone(), partitions.clone()); } ConsumerState::Consuming(spawn_actors(consumer.clone(), &tpl), tpl) } diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index 5d97b192..f6ad0786 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -81,7 +81,7 @@ pub async fn create_redis_pool(urls: Vec) -> Result, + inner: InnerRedisActivationStore, urls: Vec, } @@ -112,21 +112,18 @@ impl RedisActivationStore { }); } Ok(Self { - inner: RwLock::new(inner.unwrap()), + inner: inner.unwrap(), urls, }) } // Called when rebalancing partitions - pub async fn rebalance_partitions(&self, topic: String, partitions: Vec) { + pub fn rebalance_partitions(&self, topic: String, partitions: Vec) { error!( "Rebalancing partitions: {:?}", (topic.clone(), partitions.clone()) ); - self.inner - .write() - .await - .rebalance_partitions(topic, partitions); + self.inner.rebalance_partitions(topic, partitions); error!("Rebalanced partitions"); } @@ -134,10 +131,7 @@ impl RedisActivationStore { &self, batch: Vec, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in store"); - } - let result = self.inner.read().await.store(batch).await; + let result = self.inner.store(batch).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to store activations: {:?}", error_string); @@ -150,10 +144,7 @@ impl RedisActivationStore { } pub async fn count_processing_activations(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in count_processing_activations"); - } - let result = self.inner.read().await.count_processing_activations().await; + let result = self.inner.count_processing_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to count processing activations: {:?}", error_string); @@ -166,10 +157,7 @@ impl RedisActivationStore { } pub async fn count_delayed_activations(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in count_delayed_activations"); - } - let result = self.inner.read().await.count_delayed_activations().await; + let result = self.inner.count_delayed_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to count delayed activations: {:?}", error_string); @@ -182,10 +170,7 @@ impl RedisActivationStore { } pub async fn count_pending_activations(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in count_pending_activations"); - } - let result = self.inner.read().await.count_pending_activations().await; + let result = self.inner.count_pending_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to count pending activations: {:?}", error_string); @@ -198,10 +183,7 @@ impl RedisActivationStore { } pub async fn count_retry_activations(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in count_retry_activations"); - } - let result = self.inner.read().await.count_retry_activations().await; + let result = self.inner.count_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to count retry activations: {:?}", error_string); @@ -214,10 +196,7 @@ impl RedisActivationStore { } pub async fn count_deadletter_activations(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in count_deadletter_activations"); - } - let result = self.inner.read().await.count_deadletter_activations().await; + let result = self.inner.count_deadletter_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to count deadletter activations: {:?}", error_string); @@ -230,10 +209,7 @@ impl RedisActivationStore { } pub async fn db_size(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in db_size"); - } - let result = self.inner.read().await.db_size().await; + let result = self.inner.db_size().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to get db size: {:?}", error_string); @@ -246,10 +222,7 @@ impl RedisActivationStore { } pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in delete_all_keys"); - } - let result = self.inner.read().await.delete_all_keys().await; + let result = self.inner.delete_all_keys().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to delete all keys: {:?}", error_string); @@ -265,15 +238,7 @@ impl RedisActivationStore { &self, namespace: Option<&str>, ) -> Result, RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in get_pending_activation"); - } - let result = self - .inner - .read() - .await - .get_pending_activation(namespace) - .await; + let result = self.inner.get_pending_activation(namespace).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!( @@ -297,13 +262,8 @@ impl RedisActivationStore { namespaces: Option<&[String]>, limit: Option, ) -> Result, RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in get_pending_activations_from_namespaces"); - } let result = self .inner - .read() - .await .get_pending_activations_from_namespaces(namespaces, limit) .await; if result.is_err() { @@ -325,15 +285,7 @@ impl RedisActivationStore { hash_key: HashKey, activation_id: &str, ) -> Result, RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in get_by_id"); - } - let result = self - .inner - .read() - .await - .get_by_id(hash_key.clone(), activation_id) - .await; + let result = self.inner.get_by_id(hash_key.clone(), activation_id).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!( @@ -355,15 +307,7 @@ impl RedisActivationStore { activation_id: &str, status: InflightActivationStatus, ) -> Result<(), RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in set_status"); - } - let result = self - .inner - .read() - .await - .set_status(activation_id, status) - .await; + let result = self.inner.set_status(activation_id, status).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!( @@ -381,10 +325,7 @@ impl RedisActivationStore { pub async fn get_retry_activations( &self, ) -> Result, RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in get_retry_activations"); - } - let result = self.inner.read().await.get_retry_activations().await; + let result = self.inner.get_retry_activations().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to get retry activations: {:?}", error_string); @@ -400,15 +341,7 @@ impl RedisActivationStore { &self, activations: Vec, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in mark_retry_completed"); - } - let result = self - .inner - .read() - .await - .mark_retry_completed(activations) - .await; + let result = self.inner.mark_retry_completed(activations).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to mark retry completed: {:?}", error_string); @@ -423,10 +356,7 @@ impl RedisActivationStore { pub async fn handle_processing_deadline( &self, ) -> Result<(u64, u64, u64), RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in handle_processing_deadline"); - } - let result = self.inner.read().await.handle_processing_deadline().await; + let result = self.inner.handle_processing_deadline().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to handle processing deadline: {:?}", error_string); @@ -439,10 +369,7 @@ impl RedisActivationStore { } pub async fn handle_processing_attempts(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in handle_processing_attempts"); - } - let result = self.inner.read().await.handle_processing_attempts().await; + let result = self.inner.handle_processing_attempts().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to handle processing attempts: {:?}", error_string); @@ -455,10 +382,7 @@ impl RedisActivationStore { } pub async fn handle_expires_at(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in handle_expires_at"); - } - let result = self.inner.read().await.handle_expires_at().await; + let result = self.inner.handle_expires_at().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to handle expires at: {:?}", error_string); @@ -471,10 +395,7 @@ impl RedisActivationStore { } pub async fn handle_delay_until(&self) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in handle_delay_until"); - } - let result = self.inner.read().await.handle_delay_until().await; + let result = self.inner.handle_delay_until().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to handle delay until: {:?}", error_string); @@ -489,10 +410,7 @@ impl RedisActivationStore { pub async fn handle_deadletter_tasks( &self, ) -> Result)>, RedisActivationError> { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in handle_deadletter_tasks"); - } - let result = self.inner.read().await.handle_deadletter_tasks().await; + let result = self.inner.handle_deadletter_tasks().await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to handle deadletter tasks: {:?}", error_string); @@ -508,10 +426,7 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in mark_deadletter_completed"); - } - let result = self.inner.read().await.mark_deadletter_completed(ids).await; + let result = self.inner.mark_deadletter_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to mark deadletter completed: {:?}", error_string); @@ -527,15 +442,7 @@ impl RedisActivationStore { &self, killswitched_tasks: Vec, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in remove_killswitched"); - } - let result = self - .inner - .read() - .await - .remove_killswitched(killswitched_tasks) - .await; + let result = self.inner.remove_killswitched(killswitched_tasks).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to remove killswitched: {:?}", error_string); @@ -551,10 +458,7 @@ impl RedisActivationStore { &self, ids: Vec, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in mark_demoted_completed"); - } - let result = self.inner.read().await.mark_demoted_completed(ids).await; + let result = self.inner.mark_demoted_completed(ids).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!("Failed to mark demoted completed: {:?}", error_string); @@ -570,15 +474,7 @@ impl RedisActivationStore { &self, now: &DateTime, ) -> Result { - if self.inner.try_read().is_err() { - error!("Failed to acquire read lock in pending_activation_max_lag"); - } - let result = self - .inner - .read() - .await - .pending_activation_max_lag(now) - .await; + let result = self.inner.pending_activation_max_lag(now).await; if result.is_err() { let error_string = result.err().unwrap().to_string(); error!( diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index bcd964c7..bb237a3e 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -10,6 +10,7 @@ use futures::future::try_join_all; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; +use std::sync::RwLock; use std::time::Instant; use tracing::{error, info, instrument}; @@ -17,11 +18,11 @@ use tracing::{error, info, instrument}; pub struct InnerRedisActivationStore { pool: Pool, replicas: usize, - topics: HashMap>, + topics: RwLock>>, namespaces: Vec, payload_ttl_seconds: u64, bucket_hashes: Vec, - hash_keys: Vec, + hash_keys: RwLock>, key_builder: KeyBuilder, processing_deadline_grace_sec: i64, max_processing_attempts: i32, @@ -51,10 +52,10 @@ impl InnerRedisActivationStore { Ok(Self { pool, replicas, - topics, + topics: RwLock::new(topics), namespaces, bucket_hashes, - hash_keys, + hash_keys: RwLock::new(hash_keys), payload_ttl_seconds, key_builder: KeyBuilder::new(num_buckets), processing_deadline_grace_sec: processing_deadline_grace_sec as i64, // Duration expects i64 @@ -63,17 +64,27 @@ impl InnerRedisActivationStore { } // Called when rebalancing partitions - pub fn rebalance_partitions(&mut self, topic: String, partitions: Vec) { + pub fn rebalance_partitions(&self, topic: String, partitions: Vec) { // This assumes that the broker is always consuming from the same topics and only the partitions are changing - self.topics.insert(topic.clone(), partitions.clone()); - self.hash_keys.clear(); + // Old topics are not removed, just the partitions are updated. + { + let mut write_guard = self.topics.write().unwrap(); + write_guard.insert(topic.clone(), partitions.clone()); + } let mut hashkeys = 0; - for (topic, partitions) in self.topics.iter() { - for partition in partitions.iter() { - for namespace in self.namespaces.iter() { - self.hash_keys - .push(HashKey::new(namespace.clone(), topic.clone(), *partition)); - hashkeys += self.bucket_hashes.len(); + { + let mut write_guard = self.hash_keys.write().unwrap(); + write_guard.clear(); + for (topic, partitions) in self.topics.read().unwrap().iter() { + for partition in partitions.iter() { + for namespace in self.namespaces.iter() { + write_guard.push(HashKey::new( + namespace.clone(), + topic.clone(), + *partition, + )); + hashkeys += self.bucket_hashes.len(); + } } } } @@ -441,12 +452,13 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut activations: Vec = Vec::new(); - let random_iterator = RandomStartIterator::new(self.hash_keys.len()); + let hash_keys = self.get_hash_keys(); + let random_iterator = RandomStartIterator::new(hash_keys.len()); let mut buckets_checked = 0; let mut hashes_checked = 0; for idx in random_iterator { hashes_checked += 1; - let hash_key = self.hash_keys[idx].clone(); + let hash_key = hash_keys[idx].clone(); if namespaces.is_some() && !namespaces.unwrap().contains(&hash_key.namespace) { metrics::counter!( "redis_store.get_pending_activations_from_namespaces.namespace_not_found" @@ -748,7 +760,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut activation_ids: Vec<(HashKey, String)> = Vec::new(); - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let retry_key = self .key_builder @@ -889,7 +901,7 @@ impl InnerRedisActivationStore { let mut discarded_count: u64 = 0; let mut processing_attempts_exceeded_count: u64 = 0; let start_time = Instant::now(); - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self .key_builder @@ -1036,7 +1048,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_rows_affected = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let single_bucket_start_time = Instant::now(); let expires_at_key = self @@ -1090,7 +1102,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_rows_affected = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let single_bucket_start_time = Instant::now(); let delay_until_key = self @@ -1155,12 +1167,16 @@ impl InnerRedisActivationStore { Ok(0) } + fn get_hash_keys(&self) -> Vec { + self.hash_keys.read().unwrap().clone() + } + #[instrument(skip_all)] pub async fn count_pending_activations(&self) -> Result { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_count = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let pending_key = self .key_builder @@ -1182,7 +1198,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_count = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let delay_key = self .key_builder @@ -1204,7 +1220,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_count = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self .key_builder @@ -1225,7 +1241,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_count = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let retry_key = self .key_builder @@ -1246,7 +1262,7 @@ impl InnerRedisActivationStore { let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_count = 0; - for hash_key in self.hash_keys.iter() { + for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let retry_key = self .key_builder From ef10a872c4af26e7955476f91a7f5ac955f6c00d Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Wed, 3 Dec 2025 16:00:32 -0500 Subject: [PATCH 27/43] try to speed up calls --- src/store/inner_redis_activation_store.rs | 48 +++++++++++++---------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index bb237a3e..57519eb5 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -384,6 +384,7 @@ impl InnerRedisActivationStore { // If the activation is at_most_once, discard the activation and remove the payloads. // If it has deadletter configured, move it to the deadletter queue and keep the payloads. let start_time = Instant::now(); + let mut conn = self.get_conn().await?; let fields = self .get_fields_by_id( hashkey.clone(), @@ -398,7 +399,7 @@ impl InnerRedisActivationStore { let on_attempts_exceeded = OnAttemptsExceeded::from_str_name(fields.get("on_attempts_exceeded").unwrap().as_str()) .unwrap(); - let mut conn = self.get_conn().await?; + if !at_most_once && on_attempts_exceeded == OnAttemptsExceeded::Deadletter { let deadletter_key = self .key_builder @@ -1173,19 +1174,20 @@ impl InnerRedisActivationStore { #[instrument(skip_all)] pub async fn count_pending_activations(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let mut total_count = 0; + let mut pipe = redis::pipe(); for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let pending_key = self .key_builder .get_pending_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let count: usize = conn.llen(pending_key).await?; - total_count += count; + pipe.llen(pending_key.clone()); } } + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let total_count = results.iter().sum(); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.count_pending_activations.total_duration") @@ -1195,19 +1197,20 @@ impl InnerRedisActivationStore { #[instrument(skip_all)] pub async fn count_delayed_activations(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let mut total_count = 0; + let mut pipe = redis::pipe(); for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let delay_key = self .key_builder .get_delay_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let count: usize = conn.zcard(delay_key.clone()).await?; - total_count += count; + pipe.zcard(delay_key.clone()); } } + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let total_count = results.iter().sum(); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.count_delayed_activations.total_duration") @@ -1217,19 +1220,20 @@ impl InnerRedisActivationStore { #[instrument(skip_all)] pub async fn count_processing_activations(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let mut total_count = 0; + let mut pipe = redis::pipe(); for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let processing_key = self .key_builder .get_processing_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let count: usize = conn.zcard(processing_key.clone()).await?; - total_count += count; + pipe.zcard(processing_key.clone()); } } + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let total_count = results.iter().sum(); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.count_processing_activations.total_duration") @@ -1238,19 +1242,20 @@ impl InnerRedisActivationStore { } pub async fn count_retry_activations(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let mut total_count = 0; + let mut pipe = redis::pipe(); for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let retry_key = self .key_builder .get_retry_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let count: usize = conn.llen(retry_key.clone()).await?; - total_count += count; + pipe.llen(retry_key.clone()); } } + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let total_count = results.iter().sum(); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.count_retry_activations.total_duration") @@ -1259,19 +1264,20 @@ impl InnerRedisActivationStore { } pub async fn count_deadletter_activations(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let mut total_count = 0; + let mut pipe = redis::pipe(); for hash_key in self.get_hash_keys().iter() { for bucket_hash in self.bucket_hashes.iter() { let retry_key = self .key_builder .get_deadletter_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let count: usize = conn.llen(retry_key.clone()).await?; - total_count += count; + pipe.llen(retry_key.clone()); } } + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let total_count = results.iter().sum(); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.count_deadletter_activations.total_duration") From e249c32fdfc2e1d5ed5cb07d7a4de73eba3eedde Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 4 Dec 2025 12:30:07 -0500 Subject: [PATCH 28/43] some cleanup --- src/grpc/server.rs | 1 - src/kafka/inflight_activation_writer.rs | 6 ++---- src/store/inner_redis_activation_store.rs | 26 +++++++++++++++-------- src/upkeep.rs | 9 ++++++++ 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/grpc/server.rs b/src/grpc/server.rs index 90ff8288..26210330 100644 --- a/src/grpc/server.rs +++ b/src/grpc/server.rs @@ -53,7 +53,6 @@ impl ConsumerService for TaskbrokerServer { } Ok(None) => Err(Status::not_found("No pending activation")), Err(e) => { - println!("error: {:?}", e); error!("Unable to retrieve pending activation: {:?}", e); Err(Status::internal("Unable to retrieve pending activation")) } diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index de722f67..7d477b7c 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -79,7 +79,7 @@ impl Reducer for InflightActivationWriter { self.batch.take(); return Ok(Some(())); } - error!("Writing batch of length: {}", batch.len()); + // Check if writing the batch would exceed the limits let exceeded_pending_limit = self .store @@ -144,7 +144,6 @@ impl Reducer for InflightActivationWriter { "reason" => reason, ) .increment(1); - error!("Backpressure triggered: {}", reason); return Ok(None); } @@ -153,7 +152,6 @@ impl Reducer for InflightActivationWriter { let res = self.store.store(batch.clone()).await; match res { Ok(res) => { - error!("Wrote batch of length: {} to store", batch.len()); self.batch.take(); let lag = Utc::now() - batch @@ -176,7 +174,7 @@ impl Reducer for InflightActivationWriter { Ok(Some(())) } Err(err) => { - error!("Unable to write to sqlite: {}", err); + error!("Unable to write to db: {}", err); metrics::counter!("consumer.inflight_activation_writer.write_failed").increment(1); sleep(Duration::from_millis(self.config.write_failure_backoff_ms)).await; Ok(None) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 57519eb5..26984729 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -999,7 +999,7 @@ impl InnerRedisActivationStore { "processing_attempts", (processing_attempts + 1).to_string(), ); - pipe.rpush(pending_key, activation_id); + pipe.rpush(pending_key.clone(), activation_id); pipe.zrem(processing_key.clone(), activation_id); let results: Vec = pipe.query_async(&mut *conn).await?; let single_activation_duration = @@ -1020,16 +1020,24 @@ impl InnerRedisActivationStore { )); } if results[1] != 1 { - return Err(anyhow::anyhow!( - "Failed to add activation to pending queue: {}", - activation_id - )); + error!( + "Failed to add activation to pending queue (output: {}): {} {}", + results[1], pending_key, activation_id + ); + // return Err(anyhow::anyhow!( + // "Failed to add activation to pending queue: {}", + // activation_id + // )); } if results[2] != 1 { - return Err(anyhow::anyhow!( - "Failed to remove activation from processing set: {}", - activation_id - )); + error!( + "Failed to remove activation from processing set (output: {}): {} {}", + results[2], processing_key, activation_id + ); + // return Err(anyhow::anyhow!( + // "Failed to remove activation from processing set: {}", + // activation_id + // )); } } } diff --git a/src/upkeep.rs b/src/upkeep.rs index 3a6feec2..614bc90f 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -131,6 +131,7 @@ pub async fn do_upkeep( killswitched: 0, forwarded: 0, }; + error!("Starting upkeep"); // 1. Handle retry tasks let handle_retries_start = Instant::now(); @@ -412,6 +413,14 @@ pub async fn do_upkeep( } metrics::histogram!("upkeep.duration").record(upkeep_start.elapsed()); error!("Pending count: {}", result_context.pending); + error!("Processing count: {}", result_context.processing); + error!("Delay count: {}", result_context.delay); + error!("Delay elapsed: {}", result_context.delay_elapsed); + error!("Failed: {}", result_context.failed); + error!("Retried: {}", result_context.retried); + error!("Deadlettered: {}", result_context.deadlettered); + error!("Expired: {}", result_context.expired); + error!("Discarded: {}", result_context.discarded); // Task statuses metrics::counter!("upkeep.task.state_transition", "state" => "completed") .increment(result_context.completed); From 970be70c9f43ac03e07be1d16159b5d2839ecb89 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 4 Dec 2025 13:42:27 -0500 Subject: [PATCH 29/43] add mre logging --- src/store/inner_redis_activation_store.rs | 15 ++++++--------- src/upkeep.rs | 1 + 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 26984729..0337c695 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -1019,15 +1019,12 @@ impl InnerRedisActivationStore { activation_id )); } - if results[1] != 1 { - error!( - "Failed to add activation to pending queue (output: {}): {} {}", - results[1], pending_key, activation_id - ); - // return Err(anyhow::anyhow!( - // "Failed to add activation to pending queue: {}", - // activation_id - // )); + if results[1] == 0 { + // Should at least have added itself to the pending queue + return Err(anyhow::anyhow!( + "Failed to add activation to pending queue: {}", + activation_id + )); } if results[2] != 1 { error!( diff --git a/src/upkeep.rs b/src/upkeep.rs index 614bc90f..dab25f36 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -46,6 +46,7 @@ pub async fn upkeep( loop { select! { _ = timer.tick() => { + error!("Running upkeep at {}", last_run.elapsed().as_millis()); let _ = do_upkeep( config.clone(), store.clone(), From 02fe03c5039ffa657ce16fc8b0d93c32b9def5a6 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Thu, 4 Dec 2025 17:28:21 -0500 Subject: [PATCH 30/43] try using async backtrace --- Cargo.lock | 100 ++++++++++++++++++++++ Cargo.toml | 1 + src/kafka/inflight_activation_writer.rs | 12 +-- src/store/inflight_redis_activation.rs | 26 +++++- src/store/inner_redis_activation_store.rs | 41 +++++++-- src/upkeep.rs | 4 + 6 files changed, 168 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 371419e2..df5c5c55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,6 +121,33 @@ version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" +[[package]] +name = "async-backtrace" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcb391558246d27a13f195c1e3a53eda422270fdd452bd57a5aa9c1da1bb198" +dependencies = [ + "async-backtrace-attributes", + "dashmap", + "futures", + "loom", + "once_cell", + "pin-project-lite", + "rustc-hash", + "static_assertions", +] + +[[package]] +name = "async-backtrace-attributes" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "affbba0d438add06462a0371997575927bc05052f7ec486e7a4ca405c956c3d7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -610,6 +637,19 @@ dependencies = [ "typenum", ] +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "deadpool" version = "0.12.3" @@ -940,6 +980,19 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cc16584ff22b460a382b7feec54b23d2908d858152e5739a120b949293bd74e" +dependencies = [ + "cc", + "libc", + "log", + "rustversion", + "windows", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1014,6 +1067,12 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1530,6 +1589,19 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "loom" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff50ecb28bb86013e935fb6683ab1f6d3a20016f123c76fd4c27470076ac30f5" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "matchers" version = "0.1.0" @@ -2357,6 +2429,12 @@ version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.4.1" @@ -2427,6 +2505,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -2962,6 +3046,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stringprep" version = "0.1.5" @@ -3021,6 +3111,7 @@ name = "taskbroker" version = "0.1.0" dependencies = [ "anyhow", + "async-backtrace", "base64 0.21.7", "bytes", "chrono", @@ -3780,6 +3871,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-core" version = "0.61.2" diff --git a/Cargo.toml b/Cargo.toml index 5aca710d..eb62db02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ debug = 1 [dependencies] anyhow = "1.0.92" +async-backtrace = "0.2.7" base64 = "0.21.0" bytes = "1.10.0" chrono = { version = "0.4.26" } diff --git a/src/kafka/inflight_activation_writer.rs b/src/kafka/inflight_activation_writer.rs index 7d477b7c..7fc9b808 100644 --- a/src/kafka/inflight_activation_writer.rs +++ b/src/kafka/inflight_activation_writer.rs @@ -1,14 +1,14 @@ -use std::{ - sync::Arc, - time::{Duration, Instant}, -}; - use crate::{ config::Config, store::inflight_activation::{InflightActivation, InflightActivationStatus}, store::inflight_redis_activation::RedisActivationStore, }; +use async_backtrace::framed; use chrono::Utc; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; use tokio::time::sleep; use tracing::{debug, error, instrument}; @@ -68,7 +68,7 @@ impl Reducer for InflightActivationWriter { Ok(()) } - #[instrument(skip_all)] + #[framed] async fn flush(&mut self) -> Result, anyhow::Error> { let Some(ref batch) = self.batch else { return Ok(None); diff --git a/src/store/inflight_redis_activation.rs b/src/store/inflight_redis_activation.rs index f6ad0786..58c7b9a6 100644 --- a/src/store/inflight_redis_activation.rs +++ b/src/store/inflight_redis_activation.rs @@ -1,5 +1,6 @@ use crate::store::inner_redis_activation_store::InnerRedisActivationStore; use crate::store::redis_utils::HashKey; +use async_backtrace::framed; use thiserror::Error; use tracing::error; @@ -11,8 +12,6 @@ use crate::store::inflight_activation::{ use chrono::{DateTime, Utc}; use deadpool_redis::{Config as RedisConfig, Pool, Runtime}; use std::collections::HashMap; -// use std::sync::RwLock; -use tokio::sync::RwLock; #[derive(Error, Debug)] pub enum RedisActivationError { @@ -127,6 +126,7 @@ impl RedisActivationStore { error!("Rebalanced partitions"); } + #[framed] pub async fn store( &self, batch: Vec, @@ -143,6 +143,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn count_processing_activations(&self) -> Result { let result = self.inner.count_processing_activations().await; if result.is_err() { @@ -156,6 +157,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn count_delayed_activations(&self) -> Result { let result = self.inner.count_delayed_activations().await; if result.is_err() { @@ -169,6 +171,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn count_pending_activations(&self) -> Result { let result = self.inner.count_pending_activations().await; if result.is_err() { @@ -182,6 +185,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn count_retry_activations(&self) -> Result { let result = self.inner.count_retry_activations().await; if result.is_err() { @@ -195,6 +199,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn count_deadletter_activations(&self) -> Result { let result = self.inner.count_deadletter_activations().await; if result.is_err() { @@ -208,6 +213,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn db_size(&self) -> Result { let result = self.inner.db_size().await; if result.is_err() { @@ -221,6 +227,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn delete_all_keys(&self) -> Result<(), RedisActivationError> { let result = self.inner.delete_all_keys().await; if result.is_err() { @@ -234,6 +241,7 @@ impl RedisActivationStore { Ok(()) } + #[framed] pub async fn get_pending_activation( &self, namespace: Option<&str>, @@ -257,6 +265,7 @@ impl RedisActivationStore { Ok(Some(activation.unwrap())) } + #[framed] pub async fn get_pending_activations_from_namespaces( &self, namespaces: Option<&[String]>, @@ -280,6 +289,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn get_by_id( &self, hash_key: HashKey, @@ -302,6 +312,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn set_status( &self, activation_id: &str, @@ -322,6 +333,7 @@ impl RedisActivationStore { Ok(()) } + #[framed] pub async fn get_retry_activations( &self, ) -> Result, RedisActivationError> { @@ -337,6 +349,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn mark_retry_completed( &self, activations: Vec, @@ -353,6 +366,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn handle_processing_deadline( &self, ) -> Result<(u64, u64, u64), RedisActivationError> { @@ -368,6 +382,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn handle_processing_attempts(&self) -> Result { let result = self.inner.handle_processing_attempts().await; if result.is_err() { @@ -381,6 +396,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn handle_expires_at(&self) -> Result { let result = self.inner.handle_expires_at().await; if result.is_err() { @@ -394,6 +410,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn handle_delay_until(&self) -> Result { let result = self.inner.handle_delay_until().await; if result.is_err() { @@ -407,6 +424,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn handle_deadletter_tasks( &self, ) -> Result)>, RedisActivationError> { @@ -422,6 +440,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn mark_deadletter_completed( &self, ids: Vec, @@ -438,6 +457,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn remove_killswitched( &self, killswitched_tasks: Vec, @@ -454,6 +474,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn mark_demoted_completed( &self, ids: Vec, @@ -470,6 +491,7 @@ impl RedisActivationStore { Ok(result.unwrap()) } + #[framed] pub async fn pending_activation_max_lag( &self, now: &DateTime, diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 0337c695..40d86831 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -3,6 +3,7 @@ use crate::store::inflight_activation::{ }; use crate::store::redis_utils::{HashKey, KeyBuilder, RandomStartIterator}; use anyhow::Error; +use async_backtrace::framed; use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Duration, Utc}; use deadpool_redis::Pool; @@ -12,7 +13,7 @@ use sentry_protos::taskbroker::v1::OnAttemptsExceeded; use std::collections::HashMap; use std::sync::RwLock; use std::time::Instant; -use tracing::{error, info, instrument}; +use tracing::{error, info}; #[derive(Debug)] pub struct InnerRedisActivationStore { @@ -29,6 +30,7 @@ pub struct InnerRedisActivationStore { } impl InnerRedisActivationStore { + #[framed] pub async fn new( pool: Pool, replicas: usize, @@ -94,6 +96,7 @@ impl InnerRedisActivationStore { ); } + #[framed] pub async fn get_conn(&self) -> Result { let start_time = Instant::now(); let conn = self.pool.get().await?; @@ -102,7 +105,7 @@ impl InnerRedisActivationStore { Ok(conn) } - #[instrument(skip_all)] + #[framed] pub async fn store(&self, batch: Vec) -> Result { let mut conn = self.get_conn().await?; let mut rows_affected: u64 = 0; @@ -339,6 +342,7 @@ impl InnerRedisActivationStore { Ok(QueryResult { rows_affected }) } + #[framed] pub async fn cleanup_activation( &self, hashkey: HashKey, @@ -373,8 +377,9 @@ impl InnerRedisActivationStore { metrics::histogram!("redis_store.cleanup_duration").record(duration.as_millis() as f64); Ok(()) } + /// Discard an activation. If the activation is at_most_once, remove the payloads. - #[instrument(skip_all)] + #[framed] pub async fn discard_activation( &self, hashkey: HashKey, @@ -426,7 +431,7 @@ impl InnerRedisActivationStore { Ok(()) } - #[instrument(skip_all)] + #[framed] pub async fn get_pending_activation( &self, namespace: Option<&str>, @@ -444,7 +449,7 @@ impl InnerRedisActivationStore { /// Get a pending activation from specified namespaces /// If namespaces is None, gets from any namespace /// If namespaces is Some(&[...]), gets from those namespaces - #[instrument(skip_all)] + #[framed] pub async fn get_pending_activations_from_namespaces( &self, namespaces: Option<&[String]>, @@ -567,6 +572,7 @@ impl InnerRedisActivationStore { } /// Get an activation by id. Primarily used for testing + #[framed] pub async fn get_by_id( &self, hash_key: HashKey, @@ -591,6 +597,7 @@ impl InnerRedisActivationStore { Ok(Some(activation)) } + #[framed] pub async fn get_by_id_lookup( &self, activation_id: &str, @@ -605,6 +612,7 @@ impl InnerRedisActivationStore { Ok(activation) } + #[framed] pub async fn get_hashkey_by_id(&self, activation_id: &str) -> Result, Error> { let mut conn = self.get_conn().await?; let start_time = Instant::now(); @@ -635,6 +643,7 @@ impl InnerRedisActivationStore { ))) } + #[framed] pub async fn get_fields_by_id( &self, hash_key: HashKey, @@ -668,6 +677,7 @@ impl InnerRedisActivationStore { Ok(fields_map) } + #[framed] pub async fn set_status( &self, activation_id: &str, @@ -757,6 +767,7 @@ impl InnerRedisActivationStore { Ok(()) } + #[framed] pub async fn get_retry_activations(&self) -> Result, Error> { let mut conn = self.get_conn().await?; let start_time = Instant::now(); @@ -805,6 +816,7 @@ impl InnerRedisActivationStore { Ok(activations.into_iter().flatten().collect()) } + #[framed] pub async fn mark_retry_completed( &self, activations: Vec, @@ -891,6 +903,7 @@ impl InnerRedisActivationStore { Ok(rows_affected) } + #[framed] pub async fn handle_processing_deadline(&self) -> Result<(u64, u64, u64), Error> { // Get all the activations that have exceeded their processing deadline // Idempotent activations that fail their processing deadlines go directly to failure @@ -1050,6 +1063,7 @@ impl InnerRedisActivationStore { )) } + #[framed] pub async fn handle_expires_at(&self) -> Result { let mut conn = self.get_conn().await?; let start_time = Instant::now(); @@ -1104,6 +1118,7 @@ impl InnerRedisActivationStore { Ok(total_rows_affected) } + #[framed] pub async fn handle_delay_until(&self) -> Result { let mut conn = self.get_conn().await?; let start_time = Instant::now(); @@ -1158,16 +1173,19 @@ impl InnerRedisActivationStore { Ok(total_rows_affected) } + #[framed] pub async fn remove_killswitched(&self, killswitched_tasks: Vec) -> Result { // TODO Ok(0) } + #[framed] pub async fn mark_demoted_completed(&self, ids: Vec) -> Result { // TODO Ok(0) } + #[framed] pub async fn pending_activation_max_lag(&self, now: &DateTime) -> Result { // TODO Ok(0) @@ -1177,7 +1195,7 @@ impl InnerRedisActivationStore { self.hash_keys.read().unwrap().clone() } - #[instrument(skip_all)] + #[framed] pub async fn count_pending_activations(&self) -> Result { let start_time = Instant::now(); let mut pipe = redis::pipe(); @@ -1200,7 +1218,7 @@ impl InnerRedisActivationStore { Ok(total_count) } - #[instrument(skip_all)] + #[framed] pub async fn count_delayed_activations(&self) -> Result { let start_time = Instant::now(); let mut pipe = redis::pipe(); @@ -1223,7 +1241,7 @@ impl InnerRedisActivationStore { Ok(total_count) } - #[instrument(skip_all)] + #[framed] pub async fn count_processing_activations(&self) -> Result { let start_time = Instant::now(); let mut pipe = redis::pipe(); @@ -1246,6 +1264,7 @@ impl InnerRedisActivationStore { Ok(total_count) } + #[framed] pub async fn count_retry_activations(&self) -> Result { let start_time = Instant::now(); let mut pipe = redis::pipe(); @@ -1268,6 +1287,7 @@ impl InnerRedisActivationStore { Ok(total_count) } + #[framed] pub async fn count_deadletter_activations(&self) -> Result { let start_time = Instant::now(); let mut pipe = redis::pipe(); @@ -1291,6 +1311,7 @@ impl InnerRedisActivationStore { } // Only used in testing + #[framed] pub async fn delete_all_keys(&self) -> Result<(), Error> { error!("deleting all keys"); let mut conn = self.get_conn().await?; @@ -1304,21 +1325,25 @@ impl InnerRedisActivationStore { Ok(()) } + #[framed] pub async fn db_size(&self) -> Result { // Not needed Ok(0) } + #[framed] pub async fn handle_deadletter_tasks(&self) -> Result)>, Error> { // Not needed Ok(vec![]) } + #[framed] pub async fn mark_deadletter_completed(&self, ids: Vec) -> Result { // Not needed Ok(0) } + #[framed] pub async fn handle_processing_attempts(&self) -> Result { // Not needed Ok(0) diff --git a/src/upkeep.rs b/src/upkeep.rs index dab25f36..04d2111f 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -1,5 +1,6 @@ use crate::store::inflight_activation::InflightActivation; use crate::store::inflight_redis_activation::RedisActivationStore; +use async_backtrace::{framed, taskdump_tree}; use chrono::{DateTime, Timelike, Utc}; use futures::{StreamExt, stream::FuturesUnordered}; use prost::Message; @@ -22,6 +23,7 @@ use uuid::Uuid; use crate::{SERVICE_NAME, config::Config, runtime_config::RuntimeConfigManager}; +#[framed] /// The upkeep task that periodically performs upkeep /// on the inflight store pub async fn upkeep( @@ -47,6 +49,7 @@ pub async fn upkeep( select! { _ = timer.tick() => { error!("Running upkeep at {}", last_run.elapsed().as_millis()); + error!("backtrace:\n{:?}", taskdump_tree(false)); let _ = do_upkeep( config.clone(), store.clone(), @@ -102,6 +105,7 @@ impl UpkeepResults { } } +#[framed] #[instrument( name = "upkeep::do_upkeep", skip(store, config, producer, runtime_config_manager) From f54a3b9351a3e5d2d2df36c5f02fbbee560a080f Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 11:55:40 -0500 Subject: [PATCH 31/43] add retries --- src/store/inner_redis_activation_store.rs | 28 +++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 40d86831..f85ff571 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -6,7 +6,7 @@ use anyhow::Error; use async_backtrace::framed; use base64::{Engine as _, engine::general_purpose}; use chrono::{DateTime, Duration, Utc}; -use deadpool_redis::Pool; +use deadpool_redis::{Pool, Timeouts}; use futures::future::try_join_all; use redis::AsyncTypedCommands; use sentry_protos::taskbroker::v1::OnAttemptsExceeded; @@ -99,10 +99,20 @@ impl InnerRedisActivationStore { #[framed] pub async fn get_conn(&self) -> Result { let start_time = Instant::now(); - let conn = self.pool.get().await?; - let conn_duration = start_time.duration_since(start_time); - metrics::histogram!("redis_store.conn_duration").record(conn_duration.as_millis() as f64); - Ok(conn) + let mut retries = 0; + let timeouts = Timeouts::wait_millis(50); + while retries < 3 { + let conn = self.pool.timeout_get(&timeouts).await; + if conn.is_ok() { + return Ok(conn.unwrap()); + } + retries += 1; + } + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_conn.duration").record(duration.as_millis() as f64); + metrics::counter!("redis_store.get_conn.retries").increment(retries as u64); + return Err(anyhow::anyhow!("Failed to get connection after 3 retries")); } #[framed] @@ -661,11 +671,15 @@ impl InnerRedisActivationStore { for field in fields.iter().skip(1) { pipe.arg(field); } - let result: Vec> = pipe.query_async(&mut *conn).await?; + let result = pipe.query_async(&mut *conn).await; + if result.is_err() { + return Ok(HashMap::new()); + } + let raw_fields: Vec> = result.unwrap(); // Returns an array of tuples with the values in the same order as the fields array. // These needs to be combined into a map. let mut fields_map = HashMap::new(); - for values in result.iter() { + for values in raw_fields.iter() { for (idx, arg_name) in fields.iter().enumerate() { fields_map.insert(arg_name.to_string(), values[idx].clone()); } From 7e14e898af6db8541ab3c95a717bd1ab3985e58a Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 15:03:21 -0500 Subject: [PATCH 32/43] refactor connections --- src/store/inner_redis_activation_store.rs | 284 +++++++++++++--------- 1 file changed, 166 insertions(+), 118 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index f85ff571..4b6f67bc 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -117,9 +117,9 @@ impl InnerRedisActivationStore { #[framed] pub async fn store(&self, batch: Vec) -> Result { - let mut conn = self.get_conn().await?; let mut rows_affected: u64 = 0; let start_time = Instant::now(); + let mut conn = self.get_conn().await?; for activation in batch { let payload_key = self .key_builder @@ -399,7 +399,6 @@ impl InnerRedisActivationStore { // If the activation is at_most_once, discard the activation and remove the payloads. // If it has deadletter configured, move it to the deadletter queue and keep the payloads. let start_time = Instant::now(); - let mut conn = self.get_conn().await?; let fields = self .get_fields_by_id( hashkey.clone(), @@ -420,6 +419,7 @@ impl InnerRedisActivationStore { .key_builder .get_deadletter_key(hashkey.clone(), activation_id) .build_redis_key(); + let mut conn = self.get_conn().await?; let result: usize = conn.rpush(deadletter_key.clone(), activation_id).await?; if result == 0 { return Err(anyhow::anyhow!( @@ -465,7 +465,6 @@ impl InnerRedisActivationStore { namespaces: Option<&[String]>, limit: Option, ) -> Result, Error> { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut activations: Vec = Vec::new(); let hash_keys = self.get_hash_keys(); @@ -492,14 +491,19 @@ impl InnerRedisActivationStore { .key_builder .get_pending_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let result = conn.lindex(pending_key.clone(), 0).await?; - if result.is_none() { - let get_by_id_duration = - get_by_id_start_time.duration_since(get_by_id_start_time); - metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); - continue; - } - let activation_id: String = result.unwrap().to_string(); + + let activation_id: String = { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result = conn.lindex(pending_key.clone(), 0).await?; + if result.is_none() { + let get_by_id_duration = + get_by_id_start_time.duration_since(get_by_id_start_time); + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); + continue; + } + result.unwrap().to_string() + }; let act_result = self.get_by_id(hash_key.clone(), &activation_id).await?; if act_result.is_none() { @@ -521,32 +525,36 @@ impl InnerRedisActivationStore { Some(apd) => apd, } .timestamp_millis(); - let result: usize = conn - .zadd( - processing_key.clone(), - activation.id.clone(), - processing_deadline, - ) - .await?; - if result == 0 { - // If the activation is already in the processing set, this is not an error. - error!( - "Failed to move activation to processing: {} {}", - processing_key, activation_id - ); - metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_moved_to_processing").increment(1); - } + { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result: usize = conn + .zadd( + processing_key.clone(), + activation.id.clone(), + processing_deadline, + ) + .await?; + if result == 0 { + // If the activation is already in the processing set, this is not an error. + error!( + "Failed to move activation to processing: {} {}", + processing_key, activation_id + ); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_moved_to_processing").increment(1); + } - let result: usize = conn - .lrem(pending_key.clone(), 1, activation_id.clone()) - .await?; - if result == 0 { - error!( - "Attempted to lrem an activation from pending queue, but it was not found: {} {}", - pending_key, activation_id - ); - metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_removed_from_pending") - .increment(1); + let result: usize = conn + .lrem(pending_key.clone(), 1, activation_id.clone()) + .await?; + if result == 0 { + error!( + "Attempted to lrem an activation from pending queue, but it was not found: {} {}", + pending_key, activation_id + ); + metrics::counter!("redis_store.get_pending_activations_from_namespaces.already_removed_from_pending") + .increment(1); + } } let get_by_id_duration = get_by_id_start_time.duration_since(get_by_id_start_time); metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration", "result" => "true").record(get_by_id_duration.as_millis() as f64); @@ -588,18 +596,21 @@ impl InnerRedisActivationStore { hash_key: HashKey, activation_id: &str, ) -> Result, Error> { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); let payload_key = self .key_builder .get_payload_key(hash_key, activation_id) .build_redis_key(); - let result: HashMap = conn.hgetall(payload_key.clone()).await?; - if result.is_empty() { - metrics::counter!("redis_store.get_by_id", "result" => "false").increment(1); - return Ok(None); - } - let activation: InflightActivation = result.into(); + + let activation: InflightActivation = { + let mut conn = self.get_conn().await?; + let result: HashMap = conn.hgetall(payload_key.clone()).await?; + if result.is_empty() { + metrics::counter!("redis_store.get_by_id", "result" => "false").increment(1); + return Ok(None); + } + result.into() + }; let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.get_by_id_duration").record(duration.as_millis() as f64); @@ -624,32 +635,37 @@ impl InnerRedisActivationStore { #[framed] pub async fn get_hashkey_by_id(&self, activation_id: &str) -> Result, Error> { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); - let result: HashMap = conn - .hgetall( - self.key_builder - .get_id_lookup_key(activation_id) - .build_redis_key(), - ) - .await?; - if result.is_empty() { - metrics::counter!("redis_store.get_hashkey_by_id", "result" => "false").increment(1); - let end_time = Instant::now(); - let duration = end_time.duration_since(start_time); - metrics::histogram!("redis_store.get_hashkey_by_id_duration") - .record(duration.as_millis() as f64); - return Ok(None); - } + let fields: HashMap = { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result: HashMap = conn + .hgetall( + self.key_builder + .get_id_lookup_key(activation_id) + .build_redis_key(), + ) + .await?; + if result.is_empty() { + metrics::counter!("redis_store.get_hashkey_by_id", "result" => "false") + .increment(1); + let end_time = Instant::now(); + let duration = end_time.duration_since(start_time); + metrics::histogram!("redis_store.get_hashkey_by_id_duration") + .record(duration.as_millis() as f64); + return Ok(None); + } + result + }; metrics::counter!("redis_store.get_hashkey_by_id", "result" => "true").increment(1); let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.get_hashkey_by_id_duration") .record(duration.as_millis() as f64); Ok(Some(HashKey::new( - result.get("namespace").unwrap().to_string(), - result.get("topic").unwrap().to_string(), - result.get("partition").unwrap().parse().unwrap(), + fields.get("namespace").unwrap().to_string(), + fields.get("topic").unwrap().to_string(), + fields.get("partition").unwrap().parse().unwrap(), ))) } @@ -660,7 +676,6 @@ impl InnerRedisActivationStore { activation_id: &str, fields: &[&str], ) -> Result, Error> { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); let payload_key = self .key_builder @@ -671,13 +686,18 @@ impl InnerRedisActivationStore { for field in fields.iter().skip(1) { pipe.arg(field); } - let result = pipe.query_async(&mut *conn).await; - if result.is_err() { - return Ok(HashMap::new()); - } - let raw_fields: Vec> = result.unwrap(); // Returns an array of tuples with the values in the same order as the fields array. // These needs to be combined into a map. + let raw_fields: Vec> = { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result = pipe.query_async(&mut *conn).await; + if result.is_err() { + return Ok(HashMap::new()); + } + result.unwrap() + }; + let mut fields_map = HashMap::new(); for values in raw_fields.iter() { for (idx, arg_name) in fields.iter().enumerate() { @@ -715,7 +735,6 @@ impl InnerRedisActivationStore { activation.partition, ); - let mut conn = self.get_conn().await?; let mut pipe = redis::pipe(); pipe.atomic(); let mut has_failure = false; @@ -746,7 +765,11 @@ impl InnerRedisActivationStore { .build_redis_key(); pipe.zrem(processing_key, activation_id); - let results: Vec = pipe.query_async(&mut *conn).await?; + let results: Vec = { + // Scope for the connection + let mut conn = self.get_conn().await?; + pipe.query_async(&mut *conn).await? + }; let expected_commands = if has_failure { 2 } else { 1 }; if results.len() != expected_commands { return Err(anyhow::anyhow!( @@ -783,22 +806,26 @@ impl InnerRedisActivationStore { #[framed] pub async fn get_retry_activations(&self) -> Result, Error> { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut activation_ids: Vec<(HashKey, String)> = Vec::new(); - for hash_key in self.get_hash_keys().iter() { - for bucket_hash in self.bucket_hashes.iter() { - let retry_key = self - .key_builder - .get_retry_key_for_iter(hash_key.clone(), bucket_hash.as_str()); - let result: Vec = conn.lrange(retry_key.build_redis_key(), 0, -1).await?; - activation_ids.extend( - result - .iter() - .map(|id| (retry_key.hashkey.clone(), id.clone())), - ); + { + // Scope for the connection + let mut conn = self.get_conn().await?; + for hash_key in self.get_hash_keys().iter() { + for bucket_hash in self.bucket_hashes.iter() { + let retry_key = self + .key_builder + .get_retry_key_for_iter(hash_key.clone(), bucket_hash.as_str()); + let result: Vec = + conn.lrange(retry_key.build_redis_key(), 0, -1).await?; + activation_ids.extend( + result + .iter() + .map(|id| (retry_key.hashkey.clone(), id.clone())), + ); + } } - } + }; let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!("redis_store.get_retry_activations.retry_loop.duration") @@ -838,7 +865,6 @@ impl InnerRedisActivationStore { if activations.is_empty() { return Ok(0); } - let mut conn = self.get_conn().await?; let start_time = Instant::now(); // Since this is a global operation, there is no guarantee that the keys will have the same hash key. @@ -859,6 +885,7 @@ impl InnerRedisActivationStore { let mut id_lookup_keys: Vec = Vec::new(); let mut rows_affected: u64 = 0; + let mut conn = self.get_conn().await?; for (hash_key, activations) in hash_key_to_activations.iter() { let mut pipe = redis::pipe(); pipe.atomic(); @@ -924,7 +951,6 @@ impl InnerRedisActivationStore { // there are no retries, as the worker will reject the activation due to idempotency keys. // If the task has processing attempts remaining, it is moved back to pending with attempts += 1 // Otherwise it is either discarded or moved to retry/deadletter. - let mut conn = self.get_conn().await?; let mut total_rows_affected: u64 = 0; let mut discarded_count: u64 = 0; let mut processing_attempts_exceeded_count: u64 = 0; @@ -936,13 +962,16 @@ impl InnerRedisActivationStore { .get_processing_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); // ZRANGEBYSCORE is deprecated but ZRANGE ... BYSCORE is also not supported so? - let activations: Vec = conn - .zrangebyscore( + let activations: Vec = { + // Scope for the connection + let mut conn = self.get_conn().await?; + conn.zrangebyscore( processing_key.clone(), "-inf".to_string(), Utc::now().timestamp_millis() as isize, ) - .await?; + .await? + }; if activations.is_empty() { continue; } @@ -972,12 +1001,16 @@ impl InnerRedisActivationStore { .parse::() .unwrap(); if at_most_once { - let result = conn.zrem(processing_key.clone(), activation_id).await?; - if result != 1 { - return Err(anyhow::anyhow!( - "Failed to remove activation from processing set: {}", - activation_id - )); + { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result = conn.zrem(processing_key.clone(), activation_id).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } } self.discard_activation(hash_key.clone(), activation_id) .await?; @@ -995,12 +1028,16 @@ impl InnerRedisActivationStore { if processing_attempts >= self.max_processing_attempts { // Check for deadletter/dlq processing_attempts_exceeded_count += 1; - let result = conn.zrem(processing_key.clone(), activation_id).await?; - if result != 1 { - return Err(anyhow::anyhow!( - "Failed to remove activation from processing set: {}", - activation_id - )); + { + // Scope for the connection + let mut conn = self.get_conn().await?; + let result = conn.zrem(processing_key.clone(), activation_id).await?; + if result != 1 { + return Err(anyhow::anyhow!( + "Failed to remove activation from processing set: {}", + activation_id + )); + } } self.discard_activation(hash_key.clone(), activation_id) .await?; @@ -1028,10 +1065,11 @@ impl InnerRedisActivationStore { ); pipe.rpush(pending_key.clone(), activation_id); pipe.zrem(processing_key.clone(), activation_id); - let results: Vec = pipe.query_async(&mut *conn).await?; - let single_activation_duration = - single_activation_start_time.duration_since(single_activation_start_time); - metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "moved_to_pending").record(single_activation_duration.as_millis() as f64); + let results: Vec = { + // Scope for the connection + let mut conn = self.get_conn().await?; + pipe.query_async(&mut *conn).await? + }; if results.len() != 3 { return Err(anyhow::anyhow!( "Failed to move activation back to pending: incorrect number of commands run: expected 3, got {} for key {}", @@ -1062,7 +1100,11 @@ impl InnerRedisActivationStore { // "Failed to remove activation from processing set: {}", // activation_id // )); + // } } + let single_activation_duration = + single_activation_start_time.duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "moved_to_pending").record(single_activation_duration.as_millis() as f64); } } } @@ -1079,7 +1121,6 @@ impl InnerRedisActivationStore { #[framed] pub async fn handle_expires_at(&self) -> Result { - let mut conn = self.get_conn().await?; let start_time = Instant::now(); let mut total_rows_affected = 0; for hash_key in self.get_hash_keys().iter() { @@ -1089,13 +1130,16 @@ impl InnerRedisActivationStore { .key_builder .get_expired_key_for_iter(hash_key.clone(), bucket_hash.as_str()) .build_redis_key(); - let activations: Vec = conn - .zrangebyscore( + let activations: Vec = { + // Scope for the connection + let mut conn = self.get_conn().await?; + conn.zrangebyscore( expires_at_key.clone(), 0, Utc::now().timestamp_millis() as isize, ) - .await?; + .await? + }; if activations.is_empty() { continue; } @@ -1112,16 +1156,20 @@ impl InnerRedisActivationStore { self.discard_activation(hash_key.clone(), activation_id) .await?; } - let results: Vec = pipe.query_async(&mut *conn).await?; - let single_bucket_duration = - single_bucket_start_time.duration_since(single_bucket_start_time); - metrics::histogram!("redis_store.handle_expires_at.single_bucket.duration") - .record(single_bucket_duration.as_millis() as f64); - if results.len() != 2 * activations.len() { - return Err(anyhow::anyhow!( - "Failed to remove expired activations: {}", - expires_at_key - )); + { + // Scope for the connection + let mut conn = self.get_conn().await?; + let results: Vec = pipe.query_async(&mut *conn).await?; + let single_bucket_duration = + single_bucket_start_time.duration_since(single_bucket_start_time); + metrics::histogram!("redis_store.handle_expires_at.single_bucket.duration") + .record(single_bucket_duration.as_millis() as f64); + if results.len() != 2 * activations.len() { + return Err(anyhow::anyhow!( + "Failed to remove expired activations: {}", + expires_at_key + )); + } } } } From ba31ce57cfb2c638a5a864de8ef743c18004e3d0 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 15:16:30 -0500 Subject: [PATCH 33/43] fix processing bug for expired keys --- src/store/inner_redis_activation_store.rs | 36 ++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 4b6f67bc..b9e06708 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -990,16 +990,20 @@ impl InnerRedisActivationStore { "Failed to get payload for activation past processing deadline: {}", activation_id ); - let single_activation_duration = single_activation_start_time - .duration_since(single_activation_start_time); - metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); - continue; + // let single_activation_duration = single_activation_start_time + // .duration_since(single_activation_start_time); + // metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); + // continue; } - let at_most_once = fields - .get("at_most_once") - .unwrap_or(&"false".to_string()) - .parse::() - .unwrap(); + let at_most_once = if fields.is_empty() { + false + } else { + fields + .get("at_most_once") + .unwrap_or(&"false".to_string()) + .parse::() + .unwrap() + }; if at_most_once { { // Scope for the connection @@ -1020,11 +1024,15 @@ impl InnerRedisActivationStore { metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "at_most_once").record(single_activation_duration.as_millis() as f64); continue; } - let processing_attempts = fields - .get("processing_attempts") - .unwrap_or(&"0".to_string()) - .parse::() - .unwrap(); + let processing_attempts = if fields.is_empty() { + 0 + } else { + fields + .get("processing_attempts") + .unwrap_or(&"0".to_string()) + .parse::() + .unwrap() + }; if processing_attempts >= self.max_processing_attempts { // Check for deadletter/dlq processing_attempts_exceeded_count += 1; From 78e4863f033c741fb6851e5c4aac64b29f6135b5 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 15:46:34 -0500 Subject: [PATCH 34/43] remove --- src/store/inner_redis_activation_store.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index b9e06708..8902be50 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -1085,13 +1085,13 @@ impl InnerRedisActivationStore { activation_id )); } - // processing_attempts should already be a key in the payload, so this should return 0 - if results[0] != 0 { - return Err(anyhow::anyhow!( - "Failed to increment processing attempts: {}", - activation_id - )); - } + // // processing_attempts should already be a key in the payload, so this should return 0 + // if results[0] != 0 { + // return Err(anyhow::anyhow!( + // "Failed to increment processing attempts: {}", + // activation_id + // )); + // } if results[1] == 0 { // Should at least have added itself to the pending queue return Err(anyhow::anyhow!( From dd45f27e90df0173d91d807e7133334980c2c301 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 16:41:45 -0500 Subject: [PATCH 35/43] fix potential bug --- src/main.rs | 1 + src/store/inner_redis_activation_store.rs | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 52379b5c..6d0ec752 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,6 +69,7 @@ async fn main() -> Result<(), Error> { ) .await?, ); + redis_store.delete_all_keys().await?; // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 8902be50..bc21c668 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -990,10 +990,14 @@ impl InnerRedisActivationStore { "Failed to get payload for activation past processing deadline: {}", activation_id ); - // let single_activation_duration = single_activation_start_time - // .duration_since(single_activation_start_time); - // metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); - // continue; + { + let mut conn = self.get_conn().await?; + conn.zrem(processing_key.clone(), activation_id).await?; + } + let single_activation_duration = single_activation_start_time + .duration_since(single_activation_start_time); + metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); + continue; } let at_most_once = if fields.is_empty() { false From 7e40900ef55ed1384d6704530947ef0dbc648143 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Fri, 5 Dec 2025 16:52:06 -0500 Subject: [PATCH 36/43] remove delete --- src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 6d0ec752..52379b5c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,7 +69,6 @@ async fn main() -> Result<(), Error> { ) .await?, ); - redis_store.delete_all_keys().await?; // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { From b07c76d487d5b44884e534ccd6c919869c05cf93 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 8 Dec 2025 16:01:57 -0500 Subject: [PATCH 37/43] handle panic --- src/store/inner_redis_activation_store.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index bc21c668..c93389f2 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -505,15 +505,25 @@ impl InnerRedisActivationStore { result.unwrap().to_string() }; - let act_result = self.get_by_id(hash_key.clone(), &activation_id).await?; - if act_result.is_none() { + let act_result = self.get_by_id(hash_key.clone(), &activation_id).await; + if act_result.is_err() { + // TODO: This isn't the correct behaviour. We should be able to recover without removing the activation. + self.cleanup_activation(hash_key.clone(), &activation_id) + .await?; + let mut conn = self.get_conn().await?; + conn.lrem(pending_key.clone(), 1, activation_id.clone()) + .await?; + continue; + } + let potential = act_result.unwrap(); + if potential.is_none() { let get_by_id_duration = get_by_id_start_time.duration_since(get_by_id_start_time); metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); metrics::counter!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.not_found").increment(1); continue; } - let activation = act_result.unwrap(); + let activation = potential.unwrap(); // Push the activation to processing. This will not create two entries for the same activation in the case of duplicates. let processing_key = self From ce0aee5a1ca55ae21b92a1643b4f21bb193c39b2 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 8 Dec 2025 16:19:58 -0500 Subject: [PATCH 38/43] print error --- src/store/inner_redis_activation_store.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index c93389f2..09f1d3ae 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -619,6 +619,14 @@ impl InnerRedisActivationStore { metrics::counter!("redis_store.get_by_id", "result" => "false").increment(1); return Ok(None); } + if !result.contains_key("activation") { + // TODO remove this + error!( + "Activation not found for id: {}, skipping get by id: {:?}", + activation_id, result + ); + return Ok(None); + } result.into() }; let end_time = Instant::now(); From 8ed8bba49a1db9164f1e8ea6c703b31bb1d97444 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Mon, 8 Dec 2025 16:54:06 -0500 Subject: [PATCH 39/43] temp --- src/main.rs | 1 + src/store/inner_redis_activation_store.rs | 40 +++++++++++------------ 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/main.rs b/src/main.rs index 52379b5c..6d0ec752 100644 --- a/src/main.rs +++ b/src/main.rs @@ -69,6 +69,7 @@ async fn main() -> Result<(), Error> { ) .await?, ); + redis_store.delete_all_keys().await?; // If this is an environment where the topics might not exist, check and create them. if config.create_missing_topics { diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 09f1d3ae..19b1a771 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -181,7 +181,7 @@ impl InnerRedisActivationStore { .arg(activation.processing_deadline.unwrap().timestamp_millis()); expected_args += 1; } - pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); + // pipe.expire(payload_key.clone(), self.payload_ttl_seconds as i64); let mut queue_key_used = String::new(); if activation.delay_until.is_some() { @@ -254,9 +254,9 @@ impl InnerRedisActivationStore { } }; - if result.len() != 4 && result.len() != 5 { + if result.len() != 3 && result.len() != 4 { return Err(anyhow::anyhow!( - "Failed to store activation: incorrect number of commands run: expected 4 or 5, got {} for key {}", + "Failed to store activation: incorrect number of commands run: expected 3 or 4, got {} for key {}", result.len(), payload_key.clone() )); @@ -279,22 +279,22 @@ impl InnerRedisActivationStore { payload_key.clone() )); } - // EXPIRE returns 1 on success and 0 on failure - if result[1] != 1 { - return Err(anyhow::anyhow!( - "Failed to expire activation for key {}", - payload_key - )); - } + // // EXPIRE returns 1 on success and 0 on failure + // if result[1] != 1 { + // return Err(anyhow::anyhow!( + // "Failed to expire activation for key {}", + // payload_key + // )); + // } // Both ZADD and RPUSH return a count of elements in the structure - if result[2] <= 0 { + if result[1] <= 0 { return Err(anyhow::anyhow!( "Failed to add activation to queue for key {}", queue_key_used )); } // Check if the ZADD happened on the expired key - if result.len() == 5 && result[3] <= 0 { + if result.len() == 4 && result[2] <= 0 { return Err(anyhow::anyhow!( "Failed to add activation to expired queue for key {}", expired_key @@ -323,9 +323,9 @@ impl InnerRedisActivationStore { .arg("namespace") .arg(activation.namespace.clone()); - pipe.expire(lookup_key.clone(), self.payload_ttl_seconds as i64); + // pipe.expire(lookup_key.clone(), self.payload_ttl_seconds as i64); let result: Vec = pipe.query_async(&mut conn).await?; - if result.len() != 2 { + if result.len() != 1 { return Err(anyhow::anyhow!( "Failed to set id lookup for key {}", lookup_key.clone() @@ -337,12 +337,12 @@ impl InnerRedisActivationStore { lookup_key.clone() )); } - if result[1] != 1 { - return Err(anyhow::anyhow!( - "Failed to expire id lookup for key {}", - lookup_key.clone() - )); - } + // if result[1] != 1 { + // return Err(anyhow::anyhow!( + // "Failed to expire id lookup for key {}", + // lookup_key.clone() + // )); + // } rows_affected += 1; } let end_time = Instant::now(); From 24fe656d802c29becec1cd761119d306d1870a5a Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 9 Dec 2025 12:10:08 -0500 Subject: [PATCH 40/43] some logging --- src/store/inner_redis_activation_store.rs | 30 ++++++++++++++--------- src/store/redis_utils.rs | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 19b1a771..6015f362 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -469,6 +469,7 @@ impl InnerRedisActivationStore { let mut activations: Vec = Vec::new(); let hash_keys = self.get_hash_keys(); let random_iterator = RandomStartIterator::new(hash_keys.len()); + error!("Hash Start: {:?}", random_iterator.random_start); let mut buckets_checked = 0; let mut hashes_checked = 0; for idx in random_iterator { @@ -482,6 +483,7 @@ impl InnerRedisActivationStore { continue; } let hash_iterator = RandomStartIterator::new(self.bucket_hashes.len()); + error!("Bucket Start: {:?}", hash_iterator.random_start); for bucket_idx in hash_iterator { let bucket_hash = self.bucket_hashes[bucket_idx].clone(); buckets_checked += 1; @@ -507,6 +509,10 @@ impl InnerRedisActivationStore { let act_result = self.get_by_id(hash_key.clone(), &activation_id).await; if act_result.is_err() { + error!( + "Failed to get activation by id: {:?}", + act_result.err().unwrap() + ); // TODO: This isn't the correct behaviour. We should be able to recover without removing the activation. self.cleanup_activation(hash_key.clone(), &activation_id) .await?; @@ -573,29 +579,29 @@ impl InnerRedisActivationStore { let end_time = Instant::now(); let duration = end_time.duration_since(start_time); metrics::histogram!( - "redis_store.get_pending_activations_from_namespaces.duration" + "redis_store.get_pending_activations_from_namespaces.duration", "found" => "true" ) .record(duration.as_millis() as f64); - metrics::histogram!( - "redis_store.get_pending_activations_from_namespaces.buckets_checked" + metrics::gauge!( + "redis_store.get_pending_activations_from_namespaces.buckets_checked", "found" => "true" ) - .record(buckets_checked as f64); - metrics::histogram!( - "redis_store.get_pending_activations_from_namespaces.hashes_checked" + .set(buckets_checked as f64); + metrics::gauge!( + "redis_store.get_pending_activations_from_namespaces.hashes_checked", "found" => "true" ) - .record(hashes_checked as f64); + .set(hashes_checked as f64); return Ok(activations); } } } let end_time = Instant::now(); let duration = end_time.duration_since(start_time); - metrics::histogram!("redis_store.get_pending_activations_from_namespaces.duration") + metrics::histogram!("redis_store.get_pending_activations_from_namespaces.duration", "found" => "false") .record(duration.as_millis() as f64); - metrics::counter!("redis_store.get_pending_activations_from_namespaces.buckets_checked") - .increment(buckets_checked); - metrics::counter!("redis_store.get_pending_activations_from_namespaces.hashes_checked") - .increment(hashes_checked); + metrics::gauge!("redis_store.get_pending_activations_from_namespaces.buckets_checked", "found" => "false") + .set(buckets_checked as f64); + metrics::gauge!("redis_store.get_pending_activations_from_namespaces.hashes_checked", "found" => "false") + .set(hashes_checked as f64); Ok(activations) } diff --git a/src/store/redis_utils.rs b/src/store/redis_utils.rs index 6bd7de65..fe2d808b 100644 --- a/src/store/redis_utils.rs +++ b/src/store/redis_utils.rs @@ -209,7 +209,7 @@ impl Key { pub struct RandomStartIterator { total_values: usize, - random_start: usize, + pub random_start: usize, current_index: usize, } From 284c3675c1c11eb8529e90e410c459db1b2ac1c4 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 9 Dec 2025 14:20:47 -0500 Subject: [PATCH 41/43] try to fix empty activation bug --- src/store/inner_redis_activation_store.rs | 269 ++++++++++++++++++---- src/upkeep.rs | 20 ++ 2 files changed, 238 insertions(+), 51 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 6015f362..03622655 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -510,7 +510,7 @@ impl InnerRedisActivationStore { let act_result = self.get_by_id(hash_key.clone(), &activation_id).await; if act_result.is_err() { error!( - "Failed to get activation by id: {:?}", + "Failed to get activation by id, continuing: {:?}", act_result.err().unwrap() ); // TODO: This isn't the correct behaviour. We should be able to recover without removing the activation. @@ -527,6 +527,19 @@ impl InnerRedisActivationStore { get_by_id_start_time.duration_since(get_by_id_start_time); metrics::histogram!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.duration", "result" => "false").record(get_by_id_duration.as_millis() as f64); metrics::counter!("redis_store.get_pending_activations_from_namespaces.get_by_id_duration.not_found").increment(1); + + error!( + "Activation is missing payload, continuing: {:?}", + activation_id + ); + // TODO: There is a bug somewhere that is setting activation payloads to have just "processing_attempts" set with nothing else. + // When this code finds one of these, it will clear the activation and remove it from the pending queue. + // This isn't correct, we shouldn't have this behaviour in the first place but for now I am ignoring it. + self.cleanup_activation(hash_key.clone(), &activation_id) + .await?; + let mut conn = self.get_conn().await?; + conn.lrem(pending_key.clone(), 1, activation_id.clone()) + .await?; continue; } let activation = potential.unwrap(); @@ -536,11 +549,10 @@ impl InnerRedisActivationStore { .key_builder .get_processing_key(hash_key.clone(), &activation_id) .build_redis_key(); - let processing_deadline = match activation.processing_deadline { - None => Utc::now() + Duration::seconds(self.processing_deadline_grace_sec), - Some(apd) => apd, - } + let processing_deadline = (Utc::now() + + Duration::seconds(activation.processing_deadline_duration as i64)) .timestamp_millis(); + { // Scope for the connection let mut conn = self.get_conn().await?; @@ -628,7 +640,7 @@ impl InnerRedisActivationStore { if !result.contains_key("activation") { // TODO remove this error!( - "Activation not found for id: {}, skipping get by id: {:?}", + "Activation not found for id: {}, full payload: {:?}", activation_id, result ); return Ok(None); @@ -996,6 +1008,7 @@ impl InnerRedisActivationStore { ) .await? }; + if activations.is_empty() { continue; } @@ -1023,15 +1036,11 @@ impl InnerRedisActivationStore { metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "not_found").record(single_activation_duration.as_millis() as f64); continue; } - let at_most_once = if fields.is_empty() { - false - } else { - fields - .get("at_most_once") - .unwrap_or(&"false".to_string()) - .parse::() - .unwrap() - }; + let at_most_once = fields + .get("at_most_once") + .unwrap_or(&"false".to_string()) + .parse::() + .unwrap(); if at_most_once { { // Scope for the connection @@ -1052,15 +1061,11 @@ impl InnerRedisActivationStore { metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "at_most_once").record(single_activation_duration.as_millis() as f64); continue; } - let processing_attempts = if fields.is_empty() { - 0 - } else { - fields - .get("processing_attempts") - .unwrap_or(&"0".to_string()) - .parse::() - .unwrap() - }; + let processing_attempts = fields + .get("processing_attempts") + .unwrap_or(&"0".to_string()) + .parse::() + .unwrap(); if processing_attempts >= self.max_processing_attempts { // Check for deadletter/dlq processing_attempts_exceeded_count += 1; @@ -1083,7 +1088,7 @@ impl InnerRedisActivationStore { metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "processing_attempts_exceeded").record(single_activation_duration.as_millis() as f64); continue; } - // Move back to pending + let pending_key = self .key_builder .get_pending_key(hash_key.clone(), activation_id) @@ -1092,21 +1097,39 @@ impl InnerRedisActivationStore { .key_builder .get_payload_key(hash_key.clone(), activation_id) .build_redis_key(); + let activation = self.get_by_id(hash_key.clone(), activation_id).await?; + + // Move back to pending let mut pipe = redis::pipe(); - pipe.atomic(); - pipe.hset( - payload_key, - "processing_attempts", - (processing_attempts + 1).to_string(), - ); - pipe.rpush(pending_key.clone(), activation_id); + if activation.is_some() { + pipe.atomic(); + pipe.hset( + payload_key, + "processing_attempts", + (processing_attempts + 1).to_string(), + ); + pipe.rpush(pending_key.clone(), activation_id); + } else { + metrics::counter!( + "redis_store.handle_processing_deadline.activation_not_found" + ) + .increment(1); + } pipe.zrem(processing_key.clone(), activation_id); let results: Vec = { // Scope for the connection let mut conn = self.get_conn().await?; pipe.query_async(&mut *conn).await? }; - if results.len() != 3 { + + if results.len() == 1 { + if results[0] != 1 { + error!( + "Failed to remove activation from processing set (output: {}): {} {}", + results[2], processing_key, activation_id + ); + } + } else if results.len() != 3 { return Err(anyhow::anyhow!( "Failed to move activation back to pending: incorrect number of commands run: expected 3, got {} for key {}", results.len(), @@ -1120,24 +1143,24 @@ impl InnerRedisActivationStore { // activation_id // )); // } - if results[1] == 0 { - // Should at least have added itself to the pending queue - return Err(anyhow::anyhow!( - "Failed to add activation to pending queue: {}", - activation_id - )); - } - if results[2] != 1 { - error!( - "Failed to remove activation from processing set (output: {}): {} {}", - results[2], processing_key, activation_id - ); - // return Err(anyhow::anyhow!( - // "Failed to remove activation from processing set: {}", - // activation_id - // )); - // } - } + // if results[1] == 0 { + // // Should at least have added itself to the pending queue + // return Err(anyhow::anyhow!( + // "Failed to add activation to pending queue: {}", + // activation_id + // )); + // } + // if results[2] != 1 { + // error!( + // "Failed to remove activation from processing set (output: {}): {} {}", + // results[2], processing_key, activation_id + // ); + // // return Err(anyhow::anyhow!( + // // "Failed to remove activation from processing set: {}", + // // activation_id + // // )); + // // } + // } let single_activation_duration = single_activation_start_time.duration_since(single_activation_start_time); metrics::histogram!("redis_store.handle_processing_deadline.single_activation.duration", "status" => "moved_to_pending").record(single_activation_duration.as_millis() as f64); @@ -1447,3 +1470,147 @@ impl InnerRedisActivationStore { Ok(0) } } + +// #[cfg(test)] +// mod tests { +// use super::{ActivationWriterConfig, InflightActivation, InflightActivationWriter, Reducer}; +// use chrono::{DateTime, Duration, Utc}; +// use prost::Message; +// use prost_types::Timestamp; +// use std::collections::HashMap; +// use uuid::Uuid; + +// use sentry_protos::taskbroker::v1::OnAttemptsExceeded; +// use sentry_protos::taskbroker::v1::TaskActivation; +// use std::sync::Arc; + +// use crate::store::inflight_activation::InflightActivationStatus; +// use crate::store::inflight_redis_activation::RedisActivationStore; +// use crate::store::inflight_redis_activation::RedisActivationStoreConfig; +// use crate::test_utils::create_integration_config; +// use crate::test_utils::generate_temp_redis_urls; +// use crate::test_utils::create_test_store; +// use crate::test_utils::make_activations; + +// fn activation_id() -> String { +// Uuid::new_v4().to_string() +// } +// #[tokio::test] +// async fn test_handle_processing_deadline() { +// let store = Arc::new( +// RedisActivationStore::new( +// generate_temp_redis_urls(), +// RedisActivationStoreConfig::from_config(&create_integration_config()), +// ) +// .await +// .unwrap(), +// ); +// store +// .delete_all_keys() +// .await +// .expect("Error deleting all keys"); + +// let writer_config = ActivationWriterConfig { +// db_max_size: None, +// max_buf_len: 100, +// max_pending_activations: 10, +// max_processing_activations: 10, +// max_delay_activations: 10, +// write_failure_backoff_ms: 4000, +// }; +// let store = Arc::new( +// RedisActivationStore::new( +// generate_temp_redis_urls(), +// RedisActivationStoreConfig::from_config(&create_integration_config()), +// ) +// .await +// .unwrap(), +// ); +// store +// .delete_all_keys() +// .await +// .expect("Error deleting all keys"); +// let mut writer = InflightActivationWriter::new(store.clone(), writer_config); +// let received_at = Timestamp { +// seconds: 0, +// nanos: 0, +// }; +// let batch = vec![ +// InflightActivation { +// id: activation_id(), +// activation: TaskActivation { +// id: activation_id(), +// namespace: "default".to_string(), +// taskname: "pending_task".to_string(), +// parameters: "{}".to_string(), +// headers: HashMap::new(), +// received_at: Some(received_at), +// retry_state: None, +// processing_deadline_duration: 0, +// expires: None, +// delay: None, +// } +// .encode_to_vec(), +// status: InflightActivationStatus::Pending, +// topic: "taskbroker-test".to_string(), +// partition: 0, +// offset: 0, +// added_at: Utc::now(), +// received_at: DateTime::from_timestamp( +// received_at.seconds, +// received_at.nanos as u32, +// ) +// .unwrap(), +// processing_attempts: 0, +// processing_deadline_duration: 0, +// expires_at: None, +// delay_until: None, +// processing_deadline: None, +// at_most_once: false, +// namespace: "default".to_string(), +// taskname: "pending_task".to_string(), +// on_attempts_exceeded: OnAttemptsExceeded::Discard, +// }, +// InflightActivation { +// id: activation_id(), +// activation: TaskActivation { +// id: activation_id(), +// namespace: "default".to_string(), +// taskname: "delay_task".to_string(), +// parameters: "{}".to_string(), +// headers: HashMap::new(), +// received_at: Some(received_at), +// retry_state: None, +// processing_deadline_duration: 0, +// expires: None, +// delay: None, +// } +// .encode_to_vec(), +// status: InflightActivationStatus::Delay, +// topic: "taskbroker-test".to_string(), +// partition: 0, +// offset: 0, +// added_at: Utc::now(), +// received_at: DateTime::from_timestamp( +// received_at.seconds, +// received_at.nanos as u32, +// ) +// .unwrap(), +// processing_attempts: 0, +// processing_deadline_duration: 0, +// expires_at: None, +// delay_until: None, +// processing_deadline: None, +// at_most_once: false, +// namespace: "default".to_string(), +// taskname: "delay_task".to_string(), +// on_attempts_exceeded: OnAttemptsExceeded::Discard, +// }, +// ]; +// writer.reduce(batch).await.unwrap(); +// writer.flush().await.unwrap(); +// let count_pending = writer.store.count_pending_activations().await.unwrap(); +// let count_delay = writer.store.count_delayed_activations().await.unwrap(); +// assert_eq!(count_pending + count_delay, 2); +// } +// } diff --git a/src/upkeep.rs b/src/upkeep.rs index 04d2111f..3f7ff974 100644 --- a/src/upkeep.rs +++ b/src/upkeep.rs @@ -983,6 +983,7 @@ mod tests { batch[0].status = InflightActivationStatus::Processing; batch[0].processing_deadline = Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()); + batch[0].processing_deadline_duration = 0; assert!(store.store(batch.clone()).await.is_ok()); assert!(store.get_pending_activation(None).await.unwrap().is_some()); // Move to processing @@ -1009,6 +1010,25 @@ mod tests { 1, "Should be one in pending" ); + let activation = store + .get_by_id( + HashKey::new( + batch[0].namespace.clone(), + batch[0].topic.clone(), + batch[0].partition, + ), + &batch[0].id, + ) + .await + .unwrap() + .unwrap(); + assert_eq!(activation.processing_attempts, 1); + assert_eq!(activation.status, InflightActivationStatus::Processing); + assert_eq!( + activation.processing_deadline, + Some(Utc.with_ymd_and_hms(2024, 11, 14, 21, 22, 23).unwrap()) + ); + assert_eq!(activation.on_attempts_exceeded, OnAttemptsExceeded::Discard); } #[tokio::test] From 580963ce184797675ede919cdf533ed91683fce0 Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 9 Dec 2025 14:32:00 -0500 Subject: [PATCH 42/43] attempt --- src/store/inner_redis_activation_store.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index 03622655..dce8878a 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -370,17 +370,19 @@ impl InnerRedisActivationStore { .build_redis_key(); let result: usize = conn.del(payload_key.clone()).await?; if result != 1 { - return Err(anyhow::anyhow!( - "Failed to cleanup payload for key {}", - payload_key.clone() - )); + error!( + "Failed to cleanup payload for key {}: {}", + payload_key.clone(), + result + ); } let result: usize = conn.del(id_lookup_key.clone()).await?; if result != 1 { - return Err(anyhow::anyhow!( - "Failed to cleanup id lookup for key {}", - id_lookup_key.clone() - )); + error!( + "Failed to cleanup id lookup for key {}: {}", + id_lookup_key.clone(), + result + ); } let end_time = Instant::now(); let duration = end_time.duration_since(start_time); From dd94f848902bdcba87070c3139ad8a32d4f3f16b Mon Sep 17 00:00:00 2001 From: Evan Hicks Date: Tue, 9 Dec 2025 14:32:30 -0500 Subject: [PATCH 43/43] no print --- src/store/inner_redis_activation_store.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/store/inner_redis_activation_store.rs b/src/store/inner_redis_activation_store.rs index dce8878a..78c1561d 100644 --- a/src/store/inner_redis_activation_store.rs +++ b/src/store/inner_redis_activation_store.rs @@ -471,7 +471,6 @@ impl InnerRedisActivationStore { let mut activations: Vec = Vec::new(); let hash_keys = self.get_hash_keys(); let random_iterator = RandomStartIterator::new(hash_keys.len()); - error!("Hash Start: {:?}", random_iterator.random_start); let mut buckets_checked = 0; let mut hashes_checked = 0; for idx in random_iterator { @@ -485,7 +484,6 @@ impl InnerRedisActivationStore { continue; } let hash_iterator = RandomStartIterator::new(self.bucket_hashes.len()); - error!("Bucket Start: {:?}", hash_iterator.random_start); for bucket_idx in hash_iterator { let bucket_hash = self.bucket_hashes[bucket_idx].clone(); buckets_checked += 1;