// SPDX-License-Identifier: GPL-2.0

//! Binary Increase Congestion control (BIC). Based on:
//! Binary Increase Congestion Control (BIC) for Fast Long-Distance
//! Networks - Lisong Xu, Khaled Harfoush, and Injong Rhee
//! IEEE INFOCOM 2004, Hong Kong, China, 2004, pp. 2514-2524 vol.4
//! doi: 10.1109/INFCOM.2004.1354672
//! Link: https://doi.org/10.1109/INFCOM.2004.1354672
//! Link: https://web.archive.org/web/20160417213452/http://netsrv.csc.ncsu.edu/export/bitcp.pdf

use core::cmp::{max, min};
use core::num::NonZeroU32;
use kernel::net::tcp::cong;
use kernel::prelude::*;
use kernel::time;
use kernel::{c_str, module_cca};

/// Scaling shift (a power of two) of the packets/ACK estimate stored in
/// `BicState::delayed_ack`.
const ACK_RATIO_SHIFT: u32 = 4;

// TODO: Convert to module parameters once they are available.
/// Value of ssthresh for new connections.
const INITIAL_SSTHRESH: Option<u32> = None;
/// If cwnd is larger than this threshold, BIC engages; otherwise normal TCP
/// increase/decrease will be performed.
// NOTE: cwnd is expressed in units of full-sized segments.
const LOW_WINDOW: u32 = 14;
/// In binary search, go to point: `cwnd + (W_max - cwnd) / BICTCP_B`.
// SAFETY: This will panic at compile time when passing zero.
// TODO: Convert to `NonZeroU32::new(x).unwrap()` once 'const_option' is stabilised.
const BICTCP_B: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(4) };
/// The maximum increment, i.e., `S_max`. This is used during additive increase.
/// After crossing `W_max`, slow start is performed until passing
/// `MAX_INCREMENT * (BICTCP_B - 1)`.
// SAFETY: This will panic at compile time when passing zero.
const MAX_INCREMENT: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(16) };
/// The number of RTTs it takes to get from `W_max - BICTCP_B` to `W_max` (and
/// from `W_max` to `W_max + BICTCP_B`). This is not part of the original paper
/// and results in a slow additive increase across `W_max`.
const SMOOTH_PART: u32 = 20;
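// For illustration (not from the original sources): in the smooth region the
// code below sets `cnt = cwnd * SMOOTH_PART / BICTCP_B`, so cwnd grows by
// about `BICTCP_B / SMOOTH_PART = 4/20 = 0.2` segments per RTT; covering the
// `BICTCP_B = 4` segments up to `W_max` thus takes `SMOOTH_PART = 20` RTTs.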
/// Enable or disable fast convergence.
const FAST_CONVERGENCE: bool = true;
/// Factor for multiplicative decrease. In fast retransmit we have:
/// `cwnd = cwnd * BETA/BETA_SCALE`
/// and if fast convergence is active:
/// `W_max = cwnd * (1 + BETA/BETA_SCALE)/2`
/// instead of `W_max = cwnd`.
const BETA: u32 = 819;
/// Used to calculate beta in [0, 1] with integer arithmetic.
// SAFETY: This will panic at compile time when passing zero.
const BETA_SCALE: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1024) };
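// A worked example of the decrease factor (for illustration, not from the
// original sources): beta = BETA / BETA_SCALE = 819/1024 ~ 0.8. On a loss
// with cwnd = 100, fast retransmit shrinks cwnd to (100 * 819) / 1024 = 79
// segments, and fast convergence records W_max = (100 * (1024 + 819)) / 2048
// = 89 instead of 100.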
/// The minimum amount of time that has to pass between two updates of the cwnd.
const MIN_UPDATE_INTERVAL: time::Nsecs = 31_250_000;
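// NOTE: 31_250_000 ns = 31.25 ms = 1/32 s; this appears to correspond to the
// `HZ / 32` rate limit used by the C implementation (an interpretation, not
// stated in the sources above).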

module_cca! {
    type: Bic,
    name: "tcp_bic_rust",
    author: "Rust for Linux Contributors",
    description: "Binary Increase Congestion control (BIC) algorithm, Rust implementation",
    license: "GPL v2",
}
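
// Illustrative usage (not part of the original sources): once the module is
// loaded, the algorithm can be selected by the name registered below, e.g.
// system-wide via `sysctl net.ipv4.tcp_congestion_control=bic_rust` or per
// socket via `setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bic_rust", 8)`.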

struct Bic {}

#[vtable]
impl cong::Algorithm for Bic {
    type Data = BicState;

    const NAME: &'static CStr = c_str!("bic_rust");

    fn pkts_acked(sk: &mut cong::Sock<'_, Self>, sample: &cong::AckSample) {
        if let Ok(cong::State::Open) = sk.inet_csk().ca_state() {
            let ca = sk.inet_csk_ca_mut();

            // EWMA of the packets/ACK ratio, scaled by `2^ACK_RATIO_SHIFT`:
            // `delayed_ack += pkts_acked - delayed_ack / 2^ACK_RATIO_SHIFT`.
            // This is supposed to wrap.
            ca.delayed_ack = ca.delayed_ack.wrapping_add(
                sample
                    .pkts_acked()
                    .wrapping_sub(ca.delayed_ack >> ACK_RATIO_SHIFT),
            );
        }
    }

    fn ssthresh(sk: &mut cong::Sock<'_, Self>) -> u32 {
        let cwnd = sk.tcp_sk().snd_cwnd();
        let ca = sk.inet_csk_ca_mut();

        pr_info!(
            // TODO: remove
            "Enter fast retransmit: time {}, start {}",
            time::ktime_get_boot_fast_ns(),
            ca.start_time
        );

        // Epoch has ended.
        ca.epoch_start = 0;
        ca.last_max_cwnd = if cwnd < ca.last_max_cwnd && FAST_CONVERGENCE {
            (cwnd * (BETA_SCALE.get() + BETA)) / (2 * BETA_SCALE.get())
        } else {
            cwnd
        };

        if cwnd <= LOW_WINDOW {
            max(cwnd >> 1, 2)
        } else {
            max((cwnd * BETA) / BETA_SCALE, 2)
        }
    }

    fn cong_avoid(sk: &mut cong::Sock<'_, Self>, _ack: u32, mut acked: u32) {
        if !sk.tcp_is_cwnd_limited() {
            return;
        }

        let tp = sk.tcp_sk_mut();

        if tp.in_slow_start() {
            acked = tp.slow_start(acked);
            if acked == 0 {
                pr_info!(
                    // TODO: remove
                    "New cwnd {}, time {}, ssthresh {}, start {}, ss 1",
                    sk.tcp_sk().snd_cwnd(),
                    time::ktime_get_boot_fast_ns(),
                    sk.tcp_sk().snd_ssthresh(),
                    sk.inet_csk_ca().start_time
                );
                return;
            }
        }

        let cwnd = tp.snd_cwnd();
        let cnt = sk.inet_csk_ca_mut().update(cwnd);
        sk.tcp_sk_mut().cong_avoid_ai(cnt, acked);

        pr_info!(
            // TODO: remove
            "New cwnd {}, time {}, ssthresh {}, start {}, ss 0",
            sk.tcp_sk().snd_cwnd(),
            time::ktime_get_boot_fast_ns(),
            sk.tcp_sk().snd_ssthresh(),
            sk.inet_csk_ca().start_time
        );
    }

    fn set_state(sk: &mut cong::Sock<'_, Self>, new_state: cong::State) {
        if matches!(new_state, cong::State::Loss) {
            pr_info!(
                // TODO: remove
                "Retransmission timeout fired: time {}, start {}",
                time::ktime_get_boot_fast_ns(),
                sk.inet_csk_ca().start_time
            );
            sk.inet_csk_ca_mut().reset()
        }
    }

    fn undo_cwnd(sk: &mut cong::Sock<'_, Self>) -> u32 {
        pr_info!(
            // TODO: remove
            "Undo cwnd reduction: time {}, start {}",
            time::ktime_get_boot_fast_ns(),
            sk.inet_csk_ca().start_time
        );

        cong::reno::undo_cwnd(sk)
    }

    fn init(sk: &mut cong::Sock<'_, Self>) {
        if let Some(ssthresh) = INITIAL_SSTHRESH {
            sk.tcp_sk_mut().set_snd_ssthresh(ssthresh);
        }

        // TODO: remove
        pr_info!("Socket created: start {}", sk.inet_csk_ca().start_time);
    }

    // TODO: remove
    fn release(sk: &mut cong::Sock<'_, Self>) {
        pr_info!(
            "Socket destroyed: start {}, end {}",
            sk.inet_csk_ca().start_time,
            time::ktime_get_boot_fast_ns()
        );
    }
}

/// Internal state of each instance of the algorithm.
struct BicState {
    /// During congestion avoidance, cwnd is increased at most every `cnt`
    /// acknowledged packets, i.e., the average increase per acknowledged
    /// packet is proportional to `1 / cnt`.
    // NOTE: The C impl initialises this to zero. It then ensures that zero is
    // never passed to `cong_avoid_ai`, which could divide by it. Make it
    // explicit in the types that zero is not a valid value.
    cnt: NonZeroU32,
    /// Last maximum `snd_cwnd`, i.e., `W_max`.
    last_max_cwnd: u32,
    /// The last `snd_cwnd`.
    last_cwnd: u32,
    /// Time when `last_cwnd` was updated.
    last_time: time::Nsecs,
    /// Records the beginning of an epoch.
    epoch_start: time::Nsecs,
    /// Estimates the ratio of `packets/ACK << 4`. This allows us to adjust
    /// cwnd per packet when a receiver is sending a single ACK for multiple
    /// received packets.
    delayed_ack: u32,
    /// Time when the algorithm was initialised.
    // TODO: remove
    start_time: time::Nsecs,
}

impl Default for BicState {
    fn default() -> Self {
        Self {
            // NOTE: Initializing this to 1 deviates from the C code. It does
            // not change the behavior.
            cnt: NonZeroU32::MIN,
            last_max_cwnd: 0,
            last_cwnd: 0,
            last_time: 0,
            epoch_start: 0,
            // Initial estimate of two packets per ACK (2 << 4 = 32 in the
            // scaled representation).
            delayed_ack: 2 << ACK_RATIO_SHIFT,
            // TODO: remove
            start_time: time::ktime_get_boot_fast_ns(),
        }
    }
}

impl BicState {
    /// Compute the congestion window to use. Returns the new `cnt`.
    ///
    /// This governs the behavior of the algorithm during congestion avoidance.
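    ///
    /// A worked example (for illustration, not from the original sources):
    /// with `last_max_cwnd = 100` and `cwnd = 80`, the binary-search branch
    /// below computes `dist = (100 - 80) / BICTCP_B = 5` and returns
    /// `cnt = 80 / 5 = 16`, so cwnd grows by roughly `80 / 16 = 5` segments
    /// per RTT, i.e., a quarter of the remaining distance to `W_max`
    /// (before the delayed-ACK correction at the end of the function).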
    fn update(&mut self, cwnd: u32) -> NonZeroU32 {
        let timestamp = time::ktime_get_boot_fast_ns();

        // Do nothing if we are invoked too frequently.
        if self.last_cwnd == cwnd && (timestamp - self.last_time) <= MIN_UPDATE_INTERVAL {
            return self.cnt;
        }

        self.last_cwnd = cwnd;
        self.last_time = timestamp;

        // Record the beginning of an epoch.
        if self.epoch_start == 0 {
            self.epoch_start = timestamp;
        }

        // Start off like normal TCP.
        if cwnd <= LOW_WINDOW {
            self.cnt = NonZeroU32::new(cwnd).unwrap_or(NonZeroU32::MIN);
            return self.cnt;
        }

        let mut new_cnt = if cwnd < self.last_max_cwnd {
            // binary increase
            let dist: u32 = (self.last_max_cwnd - cwnd) / BICTCP_B;

            if dist > MAX_INCREMENT.get() {
                cwnd / MAX_INCREMENT // additive increase
            } else if dist <= 1 {
                (cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
            } else {
                cwnd / dist // binary search
            }
        } else {
            if cwnd < self.last_max_cwnd + BICTCP_B.get() {
                (cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
            } else if cwnd < self.last_max_cwnd + MAX_INCREMENT.get() * (BICTCP_B.get() - 1) {
                (cwnd * (BICTCP_B.get() - 1)) / (cwnd - self.last_max_cwnd) // slow start
            } else {
                cwnd / MAX_INCREMENT // linear increase
            }
        };

        // If in initial slow start or link utilization is very low.
        if self.last_max_cwnd == 0 {
            new_cnt = min(new_cnt, 20);
        }

        // Account for estimated packets/ACK to ensure that we increase per
        // packet.
        new_cnt = (new_cnt << ACK_RATIO_SHIFT) / self.delayed_ack;

        self.cnt = NonZeroU32::new(new_cnt).unwrap_or(NonZeroU32::MIN);

        self.cnt
    }

    fn reset(&mut self) {
        // TODO: remove
        let tmp = self.start_time;

        *self = Self::default();

        // TODO: remove
        self.start_time = tmp;
    }
}