Skip to content

Commit 92252f5

Browse files
author
Valentin Obst
committed
net/tcp: add Rust implementation of BIC
Reimplement the Binary Increase Congestion (BIC) control algorithm in Rust. BIC is one of the smallest CCAs in the kernel and this mainly serves as a minimal example for a real-world algorithm.
1 parent 23b6819 commit 92252f5

File tree

3 files changed

+314
-0
lines changed

3 files changed

+314
-0
lines changed

net/ipv4/Kconfig

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,15 @@ config TCP_CONG_BIC
510510
increase provides TCP friendliness.
511511
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
512512

513+
config TCP_CONG_BIC_RUST
	tristate "Binary Increase Congestion (BIC) control (Rust rewrite)"
	depends on RUST_TCP_ABSTRACTIONS
	help
	  Rust rewrite of the original implementation of Binary Increase
	  Congestion (BIC) control.

	  If unsure, say N.
513522
config TCP_CONG_CUBIC
514523
tristate "CUBIC TCP"
515524
default y
@@ -705,6 +714,9 @@ choice
705714
config DEFAULT_BIC
706715
bool "Bic" if TCP_CONG_BIC=y
707716

717+
config DEFAULT_BIC_RUST
718+
bool "Bic (Rust)" if TCP_CONG_BIC_RUST=y
719+
708720
config DEFAULT_CUBIC
709721
bool "Cubic" if TCP_CONG_CUBIC=y
710722

@@ -746,6 +758,7 @@ config TCP_CONG_CUBIC
746758
config DEFAULT_TCP_CONG
747759
string
748760
default "bic" if DEFAULT_BIC
761+
default "bic_rust" if DEFAULT_BIC_RUST
749762
default "cubic" if DEFAULT_CUBIC
750763
default "htcp" if DEFAULT_HTCP
751764
default "hybla" if DEFAULT_HYBLA

net/ipv4/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
4646
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
4747
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
4848
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
49+
obj-$(CONFIG_TCP_CONG_BIC_RUST) += tcp_bic_rust.o
4950
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
5051
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
5152
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o

net/ipv4/tcp_bic_rust.rs

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
//! SPDX-License-Identifier: GPL-2.0
2+
//!
3+
//! Binary Increase Congestion control (BIC). Based on:
4+
//! Binary Increase Congestion Control (BIC) for Fast Long-Distance
5+
//! Networks - Lisong Xu, Khaled Harfoush, and Injong Rhee
6+
//! IEEE INFOCOM 2004, Hong Kong, China, 2004, pp. 2514-2524 vol.4
7+
//! doi: 10.1109/INFCOM.2004.1354672
8+
//! Link: https://doi.org/10.1109/INFCOM.2004.1354672
9+
//! Link: https://web.archive.org/web/20160417213452/http://netsrv.csc.ncsu.edu/export/bitcp.pdf
10+
11+
use core::cmp::{max, min};
12+
use core::num::NonZeroU32;
13+
use kernel::net::tcp::cong;
14+
use kernel::prelude::*;
15+
use kernel::time;
16+
use kernel::{c_str, module_cca};
17+
18+
/// Shift used by the fixed-point `packets/ACK` estimator (`delayed_ack`),
/// i.e., the ratio is stored as `ratio << ACK_RATIO_SHIFT`.
const ACK_RATIO_SHIFT: u32 = 4;

// TODO: Convert to module parameters once they are available.
/// Value of ssthresh for new connections.
const INITIAL_SSTHRESH: Option<u32> = None;
/// If cwnd is larger than this threshold, BIC engages; otherwise normal TCP
/// increase/decrease will be performed.
// NOTE: cwnd is expressed in units of full-sized segments.
const LOW_WINDOW: u32 = 14;
/// In binary search, go to point: `cwnd + (W_max - cwnd) / BICTCP_B`.
// SAFETY: The argument is a non-zero literal, so the `NonZeroU32` invariant
// holds.
// TODO: Convert to `new::(x).unwrap()` once 'const_option' is stabilised.
const BICTCP_B: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(4) };
/// The maximum increment, i.e., `S_max`. This is used during additive increase.
/// After crossing `W_max`, slow start is performed until passing
/// `MAX_INCREMENT * (BICTCP_B - 1)`.
// SAFETY: The argument is a non-zero literal, so the `NonZeroU32` invariant
// holds.
const MAX_INCREMENT: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(16) };
/// The number of RTT it takes to get from `W_max - BICTCP_B` to `W_max` (and
/// from `W_max` to `W_max + BICTCP_B`). This is not part of the original paper
/// and results in a slow additive increase across `W_max`.
const SMOOTH_PART: u32 = 20;
/// Enable or disable fast convergence.
const FAST_CONVERGENCE: bool = true;
/// Factor for multiplicative decrease. In fast retransmit we have:
/// `cwnd = cwnd * BETA/BETA_SCALE`
/// and if fast convergence is active:
/// `W_max = cwnd * (1 + BETA/BETA_SCALE)/2`
/// instead of `W_max = cwnd`.
const BETA: u32 = 819;
/// Used to calculate beta in [0, 1] with integer arithmetics.
// SAFETY: The argument is a non-zero literal, so the `NonZeroU32` invariant
// holds.
const BETA_SCALE: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1024) };
/// The minimum amount of time that has to pass between two updates of the
/// cwnd (31.25 ms, i.e., 1/32 s).
const MIN_UPDATE_INTERVAL: time::Nsecs = 31250000;
54+
// Registers this congestion control algorithm with the kernel as a module.
module_cca! {
    type: Bic,
    name: "tcp_bic_rust",
    author: "Rust for Linux Contributors",
    description: "Binary Increase Congestion (BIC) control algorithm, Rust implementation",
    license: "GPL v2",
}

/// Marker type implementing the algorithm; all per-connection state lives in
/// `BicState` (the associated `Data` type).
struct Bic {}
63+
64+
#[vtable]
impl cong::Algorithm for Bic {
    type Data = BicState;

    const NAME: &'static CStr = c_str!("bic_rust");

    /// Updates the `packets/ACK` ratio estimator upon reception of an ACK.
    /// Only done while the connection is in the `Open` state.
    fn pkts_acked(sk: &mut cong::Sock<'_, Self>, sample: &cong::AckSample) {
        if let Ok(cong::State::Open) = sk.inet_csk().ca_state() {
            let ca = sk.inet_csk_ca_mut();

            // This is supposed to wrap.
            ca.delayed_ack = ca.delayed_ack.wrapping_add(
                sample
                    .pkts_acked()
                    .wrapping_sub(ca.delayed_ack >> ACK_RATIO_SHIFT),
            );
        }
    }

    /// Recalculates the slow start threshold after a loss event: records the
    /// new `W_max` (halfway point if fast convergence applies) and returns the
    /// multiplicatively decreased cwnd, never below 2 segments.
    fn ssthresh(sk: &mut cong::Sock<'_, Self>) -> u32 {
        let cwnd = sk.tcp_sk().snd_cwnd();
        let ca = sk.inet_csk_ca_mut();

        pr_info!(
            // TODO: remove
            "Enter fast retransmit: time {}, start {}",
            time::ktime_get_boot_fast_ns(),
            ca.start_time
        );

        // Epoch has ended.
        ca.epoch_start = 0;
        ca.last_max_cwnd = if cwnd < ca.last_max_cwnd && FAST_CONVERGENCE {
            (cwnd * (BETA_SCALE.get() + BETA)) / (2 * BETA_SCALE.get())
        } else {
            cwnd
        };

        if cwnd <= LOW_WINDOW {
            // Behave like normal TCP: halve the window.
            max(cwnd >> 1, 2)
        } else {
            max((cwnd * BETA) / BETA_SCALE, 2)
        }
    }

    /// Grows the congestion window: standard slow start below ssthresh,
    /// otherwise BIC's binary-search increase via [`BicState::update`].
    fn cong_avoid(sk: &mut cong::Sock<'_, Self>, _ack: u32, mut acked: u32) {
        if !sk.tcp_is_cwnd_limited() {
            return;
        }

        let tp = sk.tcp_sk_mut();

        if tp.in_slow_start() {
            acked = tp.slow_start(acked);
            // All ACKed packets were consumed by slow start; nothing left
            // for congestion avoidance this round.
            if acked == 0 {
                pr_info!(
                    // TODO: remove
                    "New cwnd {}, time {}, ssthresh {}, start {}, ss 1",
                    sk.tcp_sk().snd_cwnd(),
                    time::ktime_get_boot_fast_ns(),
                    sk.tcp_sk().snd_ssthresh(),
                    sk.inet_csk_ca().start_time
                );
                return;
            }
        }

        let cwnd = tp.snd_cwnd();
        let cnt = sk.inet_csk_ca_mut().update(cwnd);
        sk.tcp_sk_mut().cong_avoid_ai(cnt, acked);

        pr_info!(
            // TODO: remove
            "New cwnd {}, time {}, ssthresh {}, start {}, ss 0",
            sk.tcp_sk().snd_cwnd(),
            time::ktime_get_boot_fast_ns(),
            sk.tcp_sk().snd_ssthresh(),
            sk.inet_csk_ca().start_time
        );
    }

    /// Resets the per-connection state when entering the `Loss` state (i.e.,
    /// after a retransmission timeout). Other state transitions are ignored.
    fn set_state(sk: &mut cong::Sock<'_, Self>, new_state: cong::State) {
        if matches!(new_state, cong::State::Loss) {
            pr_info!(
                // TODO: remove
                "Retransmission timeout fired: time {}, start {}",
                time::ktime_get_boot_fast_ns(),
                sk.inet_csk_ca().start_time
            );
            sk.inet_csk_ca_mut().reset()
        }
    }

    /// Undoes a cwnd reduction after a loss turned out to be spurious;
    /// delegates to the shared Reno implementation.
    fn undo_cwnd(sk: &mut cong::Sock<'_, Self>) -> u32 {
        pr_info!(
            // TODO: remove
            "Undo cwnd reduction: time {}, start {}",
            time::ktime_get_boot_fast_ns(),
            sk.inet_csk_ca().start_time
        );

        cong::reno::undo_cwnd(sk)
    }

    /// Initialises a new connection, optionally pinning the initial ssthresh
    /// (see [`INITIAL_SSTHRESH`]).
    fn init(sk: &mut cong::Sock<'_, Self>) {
        if let Some(ssthresh) = INITIAL_SSTHRESH {
            sk.tcp_sk_mut().set_snd_ssthresh(ssthresh);
        }

        // TODO: remove
        pr_info!("Socket created: start {}", sk.inet_csk_ca().start_time);
    }

    // TODO: remove
    fn release(sk: &mut cong::Sock<'_, Self>) {
        pr_info!(
            "Socket destroyed: start {}, end {}",
            sk.inet_csk_ca().start_time,
            time::ktime_get_boot_fast_ns()
        );
    }
}
186+
187+
/// Internal state of each instance of the algorithm.
struct BicState {
    /// During congestion avoidance, cwnd is increased at most every `cnt`
    /// acknowledged packets, i.e., the average increase per acknowledged packet
    /// is proportional to `1 / cnt`.
    // NOTE: The C impl initialises this to zero. It then ensures that zero is
    // never passed to `cong_avoid_ai`, which could divide by it. Make it
    // explicit in the types that zero is not a valid value.
    cnt: NonZeroU32,
    /// Last maximum `snd_cwnd`, i.e, `W_max`.
    last_max_cwnd: u32,
    /// The last `snd_cwnd`. Used together with `last_time` to rate-limit
    /// `update`.
    last_cwnd: u32,
    /// Time when `last_cwnd` was updated.
    last_time: time::Nsecs,
    /// Records the beginning of an epoch. Zero means "no epoch in progress".
    epoch_start: time::Nsecs,
    /// Estimates the ratio of `packets/ACK << 4`. This allows us to adjust cwnd
    /// per packet when a receiver is sending a single ACK for multiple received
    /// packets.
    delayed_ack: u32,
    /// Time when algorithm was initialised.
    // TODO: remove
    start_time: time::Nsecs,
}
212+
213+
impl Default for BicState {
214+
fn default() -> Self {
215+
Self {
216+
// NOTE: Initializing this to 1 deviates from the C code. It does
217+
// not change the behavior.
218+
cnt: NonZeroU32::MIN,
219+
last_max_cwnd: 0,
220+
last_cwnd: 0,
221+
last_time: 0,
222+
epoch_start: 0,
223+
delayed_ack: 2 << ACK_RATIO_SHIFT,
224+
// TODO: remove
225+
start_time: time::ktime_get_boot_fast_ns(),
226+
}
227+
}
228+
}
229+
230+
impl BicState {
231+
/// Compute congestion window to use. Returns the new `cnt`.
232+
///
233+
/// This governs the behavior of the algorithm during congestion avoidance.
234+
fn update(&mut self, cwnd: u32) -> NonZeroU32 {
235+
let timestamp = time::ktime_get_boot_fast_ns();
236+
237+
// Do nothing if we are invoked too frequently.
238+
if self.last_cwnd == cwnd && (timestamp - self.last_time) <= MIN_UPDATE_INTERVAL {
239+
return self.cnt;
240+
}
241+
242+
self.last_cwnd = cwnd;
243+
self.last_time = timestamp;
244+
245+
// Record the beginning of an epoch.
246+
if self.epoch_start == 0 {
247+
self.epoch_start = timestamp;
248+
}
249+
250+
// Start off like normal TCP.
251+
if cwnd <= LOW_WINDOW {
252+
self.cnt = NonZeroU32::new(cwnd).unwrap_or(NonZeroU32::MIN);
253+
return self.cnt;
254+
}
255+
256+
let mut new_cnt = if cwnd < self.last_max_cwnd {
257+
// binary increase
258+
let dist: u32 = (self.last_max_cwnd - cwnd) / BICTCP_B;
259+
260+
if dist > MAX_INCREMENT.get() {
261+
cwnd / MAX_INCREMENT // additive increase
262+
} else if dist <= 1 {
263+
(cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
264+
} else {
265+
cwnd / dist // binary search
266+
}
267+
} else {
268+
if cwnd < self.last_max_cwnd + BICTCP_B.get() {
269+
(cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
270+
} else if cwnd < self.last_max_cwnd + MAX_INCREMENT.get() * (BICTCP_B.get() - 1) {
271+
(cwnd * (BICTCP_B.get() - 1)) / (cwnd - self.last_max_cwnd) // slow start
272+
} else {
273+
cwnd / MAX_INCREMENT // linear increase
274+
}
275+
};
276+
277+
// If in initial slow start or link utilization is very low.
278+
if self.last_max_cwnd == 0 {
279+
new_cnt = min(new_cnt, 20);
280+
}
281+
282+
// Account for estimated packets/ACK to ensure that we increase per
283+
// packet.
284+
new_cnt = (new_cnt << ACK_RATIO_SHIFT) / self.delayed_ack;
285+
286+
self.cnt = NonZeroU32::new(new_cnt).unwrap_or(NonZeroU32::MIN);
287+
288+
self.cnt
289+
}
290+
291+
fn reset(&mut self) {
292+
// TODO: remove
293+
let tmp = self.start_time;
294+
295+
*self = Self::default();
296+
297+
// TODO: remove
298+
self.start_time = tmp;
299+
}
300+
}

0 commit comments

Comments
 (0)