Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e87d4bb
client/daemon: route liveness / bfd
snormore Nov 8, 2025
79fd7d0
client/daemon: route liveness / bfd / passive-mode
snormore Nov 8, 2025
e1a126e
e2e: multi client test start devices in parallel
snormore Nov 8, 2025
e2e294d
client/daemon: route liveness / bfd / e2e testing
snormore Nov 8, 2025
640a9af
client/daemon: route liveness / bfd / docs
snormore Nov 8, 2025
946b92e
client/daemon: route liveness / bfd / ipv4
snormore Nov 8, 2025
4aedd27
client/daemon: route liveness / bfd / throttled log write error
snormore Nov 8, 2025
c66bbab
client/daemon: route liveness / bfd / tests
snormore Nov 8, 2025
4a9a60e
tools/uping: remove unused package
snormore Nov 8, 2025
fc37437
client/daemon: route liveness / bfd / changelog
snormore Nov 8, 2025
3b2368d
client/daemon: route liveness / bfd / e2e testing
snormore Nov 9, 2025
6fe9b19
client/daemon: route liveness / bfd / udp cleanup
snormore Nov 9, 2025
ecb19cd
client/daemon: route liveness / bfd / run error
snormore Nov 9, 2025
501489d
client/daemon: route liveness / bfd / e2e testing
snormore Nov 10, 2025
58e3d6c
client/daemon: route liveness / bfd / clearer event/type var names
snormore Nov 10, 2025
1834370
client/daemon: route liveness / s/remote/peer
snormore Nov 11, 2025
9dd149a
client/daemon: route liveness / logging
snormore Nov 11, 2025
cb85988
client/daemon: route liveness / better var names
snormore Nov 11, 2025
527e2c5
client/daemon: route liveness / remove unused AdminDownAll
snormore Nov 11, 2025
fb9fcb1
client/daemon: route liveness / dedup scheduler events
snormore Nov 12, 2025
6ccbd4f
client/daemon: route liveness / bounded scheduler queue
snormore Nov 12, 2025
a7e4a38
client/daemon: liveness subsystem disabled by default for initial rol…
snormore Nov 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,17 @@ All notable changes to this project will be documented in this file.

### Added

- Validate AccessPass before client connection (CLI) ([#1356](https://github.com/malbeclabs/doublezero/issues/1356))

- Onchain programs
- Check if `accesspass.owner` is equal to system program ([malbeclabs/doublezero#2088](https://github.com/malbeclabs/doublezero/pull/2088))
- CLI
- Add a new sub-type field to interface definitions to support CYOA and DIA interfaces. This sub-type allows the system to distinguish between standard Physical/Loopback interfaces and specialized CYOA/DIA interfaces, enabling proper classification, validation, and configuration handling across the DZD.
- Improve error message when connecting to a device that is at capacity or has max_users=0. Users now receive "Device is not accepting more users (at capacity or max_users=0)" instead of the confusing "Device not found" error when explicitly specifying an ineligible device.
- Add `link latency` command to display latency statistics from the telemetry program. Supports filtering by percentile (p50, p90, p95, p99, mean, min, max, stddev, all), querying by link code or all links, and filtering by epoch. Resolves: [#1942](https://github.com/malbeclabs/doublezero/issues/1942)

- Added `--contributor | -c` filter to `device list`, `interface list`, and `link list` commands. (#1274)
- Added `--contributor | -c` filter to `device list`, `interface list`, and `link list` commands. (#1274)
- Validate AccessPass before client connection ([#1356](https://github.com/malbeclabs/doublezero/issues/1356))
- Client
- Add route liveness probing (disabled by default for the initial rollout)

### Breaking

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ nocontainertest:
.PHONY: go-fuzz
go-fuzz:
cd tools/twamp && $(MAKE) fuzz
cd tools/uping && $(MAKE) fuzz
cd client/doublezerod && $(MAKE) fuzz

.PHONY: go-container-test
go-container-test:
Expand Down
8 changes: 8 additions & 0 deletions client/doublezerod/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,11 @@ lint:
.PHONY: build
build:
CGO_ENABLED=0 go build -v $(LDFLAGS) -o bin/doublezerod cmd/doublezerod/main.go

# Fuzz the liveness package: list every test whose name starts with "Fuzz" in
# ./internal/liveness and run each one for FUZZTIME (default 10s; override with
# e.g. `make fuzz FUZZTIME=1m`). `-run=^$$` matches no regular tests, so only
# the selected fuzz target runs. The loop exits non-zero on the first failure.
FUZZTIME ?= 10s
.PHONY: fuzz
fuzz:
@for f in $$(go test ./internal/liveness -list=Fuzz | grep '^Fuzz'); do \
echo "==> Fuzzing $$f"; \
go test ./internal/liveness -run=^$$ -fuzz=$$f -fuzztime=$(FUZZTIME) || exit 1; \
done
53 changes: 52 additions & 1 deletion client/doublezerod/cmd/doublezerod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import (
"os"
"os/signal"
"syscall"
"time"

"github.com/malbeclabs/doublezero/client/doublezerod/internal/liveness"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/runtime"
"github.com/malbeclabs/doublezero/config"
"github.com/prometheus/client_golang/prometheus"
Expand All @@ -34,12 +36,38 @@ var (
metricsAddr = flag.String("metrics-addr", "localhost:0", "Address to listen on for prometheus metrics")
routeConfigPath = flag.String("route-config", "/var/lib/doublezerod/route-config.json", "path to route config file (unstable)")

// Route liveness configuration flags.
routeLivenessTxMin = flag.Duration("route-liveness-tx-min", defaultRouteLivenessTxMin, "route liveness tx min")
routeLivenessRxMin = flag.Duration("route-liveness-rx-min", defaultRouteLivenessRxMin, "route liveness rx min")
routeLivenessDetectMult = flag.Uint("route-liveness-detect-mult", defaultRouteLivenessDetectMult, "route liveness detect mult")
routeLivenessMinTxFloor = flag.Duration("route-liveness-min-tx-floor", defaultRouteLivenessMinTxFloor, "route liveness min tx floor")
routeLivenessMaxTxCeil = flag.Duration("route-liveness-max-tx-ceil", defaultRouteLivenessMaxTxCeil, "route liveness max tx ceil")

// TODO(snormore): These flags are temporary for initial rollout testing.
// They will be superseded by a single `route-liveness-enable` flag, where false means
// passive-mode and true means active-mode.
routeLivenessEnablePassive = flag.Bool("route-liveness-enable-passive", false, "enables route liveness in passive mode (experimental)")
routeLivenessEnableActive = flag.Bool("route-liveness-enable-active", false, "enables route liveness in active mode (experimental)")

// set by LDFLAGS
version = "dev"
commit = "none"
date = "unknown"
)

// Defaults for the route liveness subsystem.
const (
// Default values for the corresponding -route-liveness-* command-line flags.
defaultRouteLivenessTxMin = 300 * time.Millisecond
defaultRouteLivenessRxMin = 300 * time.Millisecond
defaultRouteLivenessDetectMult = 3
defaultRouteLivenessMinTxFloor = 50 * time.Millisecond
defaultRouteLivenessMaxTxCeil = 1 * time.Second

// The liveness port is not configurable since clients need to use the same one so they know
// how to connect to each other.
defaultRouteLivenessPort = 44880
// Bind address passed as ManagerConfig.BindIP when main builds the liveness manager config.
defaultRouteLivenessBindIP = "0.0.0.0"
)

func main() {
flag.Parse()

Expand Down Expand Up @@ -112,10 +140,33 @@ func main() {
}()
}

// If either passive or active mode is enabled, create a manager config.
// If neither is enabled, completely disable the liveness subsystem.
// TODO(snormore): The scenario where the liveness subsystem is completely disabled is
// temporary for initial rollout testing.
var lmc *liveness.ManagerConfig
if *routeLivenessEnablePassive || *routeLivenessEnableActive {
lmc = &liveness.ManagerConfig{
Logger: slog.Default(),
BindIP: defaultRouteLivenessBindIP,
Port: defaultRouteLivenessPort,

// If active mode is enabled, set passive mode to false.
// The manager only knows about passive mode, with the negation of it being active mode.
PassiveMode: !*routeLivenessEnableActive,

TxMin: *routeLivenessTxMin,
RxMin: *routeLivenessRxMin,
DetectMult: uint8(*routeLivenessDetectMult),
MinTxFloor: *routeLivenessMinTxFloor,
MaxTxCeil: *routeLivenessMaxTxCeil,
}
}

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()

if err := runtime.Run(ctx, *sockFile, *routeConfigPath, *enableLatencyProbing, *enableLatencyMetrics, *programId, *rpcEndpoint, *probeInterval, *cacheUpdateInterval); err != nil {
if err := runtime.Run(ctx, *sockFile, *routeConfigPath, *enableLatencyProbing, *enableLatencyMetrics, *programId, *rpcEndpoint, *probeInterval, *cacheUpdateInterval, lmc); err != nil {
slog.Error("runtime error", "error", err)
os.Exit(1)
}
Expand Down
31 changes: 20 additions & 11 deletions client/doublezerod/internal/bgp/bgp.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"sync"

"github.com/jwhited/corebgp"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/liveness"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/routing"
)

Expand Down Expand Up @@ -90,15 +91,17 @@ type RouteReaderWriter interface {
}

type PeerConfig struct {
LocalAddress net.IP
RemoteAddress net.IP
LocalAs uint32
RemoteAs uint32
Port int
RouteSrc net.IP
RouteTable int
FlushRoutes bool
NoInstall bool
LocalAddress net.IP
RemoteAddress net.IP
LocalAs uint32
RemoteAs uint32
Port int
RouteSrc net.IP
RouteTable int
FlushRoutes bool
NoInstall bool
Interface string
LivenessEnabled bool
}

type BgpServer struct {
Expand All @@ -107,9 +110,10 @@ type BgpServer struct {
peerStatus map[string]Session
peerStatusLock sync.Mutex
routeReaderWriter RouteReaderWriter
livenessManager *liveness.Manager
}

func NewBgpServer(routerID net.IP, r RouteReaderWriter) (*BgpServer, error) {
func NewBgpServer(routerID net.IP, r RouteReaderWriter, lm *liveness.Manager) (*BgpServer, error) {
corebgp.SetLogger(log.Print)
srv, err := corebgp.NewServer(netip.MustParseAddr(routerID.String()))
if err != nil {
Expand All @@ -121,6 +125,7 @@ func NewBgpServer(routerID net.IP, r RouteReaderWriter) (*BgpServer, error) {
peerStatus: make(map[string]Session),
peerStatusLock: sync.Mutex{},
routeReaderWriter: r,
livenessManager: lm,
}, nil
}

Expand All @@ -142,7 +147,11 @@ func (b *BgpServer) AddPeer(p *PeerConfig, advertised []NLRI) error {
if p.Port != 0 {
peerOpts = append(peerOpts, corebgp.WithPort(p.Port))
}
plugin := NewBgpPlugin(advertised, p.RouteSrc, p.RouteTable, b.peerStatusChan, p.FlushRoutes, p.NoInstall, b.routeReaderWriter)
rrw := b.routeReaderWriter
if p.LivenessEnabled && b.livenessManager != nil {
rrw = liveness.NewRouteReaderWriter(b.livenessManager, b.routeReaderWriter, p.Interface)
}
plugin := NewBgpPlugin(advertised, p.RouteSrc, p.RouteTable, b.peerStatusChan, p.FlushRoutes, p.NoInstall, rrw)
err := b.server.AddPeer(corebgp.PeerConfig{
RemoteAddress: netip.MustParseAddr(p.RemoteAddress.String()),
LocalAS: p.LocalAs,
Expand Down
18 changes: 17 additions & 1 deletion client/doublezerod/internal/bgp/bgp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"log"
"log/slog"
"net"
"net/netip"
"strings"
Expand All @@ -15,9 +16,11 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/jwhited/corebgp"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/bgp"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/liveness"
"github.com/malbeclabs/doublezero/client/doublezerod/internal/routing"
gobgp "github.com/osrg/gobgp/pkg/packet/bgp"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"golang.org/x/sys/unix"
)

Expand Down Expand Up @@ -114,7 +117,20 @@ func (p *dummyPlugin) handleUpdate(peer corebgp.PeerConfig, u []byte) *corebgp.N

func TestBgpServer(t *testing.T) {
nlr := &mockRouteReaderWriter{}
b, err := bgp.NewBgpServer(net.IP{1, 1, 1, 1}, nlr)
lm, err := liveness.NewManager(t.Context(), &liveness.ManagerConfig{
Logger: slog.Default(),
Netlinker: nlr,
BindIP: "127.0.0.1",
Port: 0,
TxMin: 100 * time.Millisecond,
RxMin: 100 * time.Millisecond,
DetectMult: 3,
MinTxFloor: 50 * time.Millisecond,
MaxTxCeil: 1 * time.Second,
})
require.NoError(t, err)
t.Cleanup(func() { _ = lm.Close() })
b, err := bgp.NewBgpServer(net.IP{1, 1, 1, 1}, nlr, lm)
if err != nil {
t.Fatalf("error creating bgp server: %v", err)
}
Expand Down
8 changes: 4 additions & 4 deletions client/doublezerod/internal/bgp/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ func (p *Plugin) OnClose(peer corebgp.PeerConfig) {
protocol := unix.RTPROT_BGP // 186
routes, err := p.RouteReaderWriter.RouteByProtocol(protocol)
if err != nil {
slog.Error("routes: error getting routes by protocol", "protocol", protocol)
slog.Error("routes: error getting routes by protocol on peer close", "protocol", protocol, "error", err)
}
for _, route := range routes {
if err := p.RouteReaderWriter.RouteDelete(route); err != nil {
slog.Error("Error deleting route", "route", route)
slog.Error("routes: error deleting route on peer close", "route", route.String(), "error", err)
continue
}
}
Expand Down Expand Up @@ -126,7 +126,7 @@ func (p *Plugin) handleUpdate(peer corebgp.PeerConfig, u []byte) *corebgp.Notifi
slog.Info("routes: removing route from table", "table", p.RouteTable, "dz route", route.String())
err := p.RouteReaderWriter.RouteDelete(route)
if err != nil {
slog.Error("routes: error removing route from table", "table", p.RouteTable, "error", err)
slog.Error("routes: error removing route from table", "table", p.RouteTable, "error", err, "route", route.String())
}
}

Expand All @@ -152,7 +152,7 @@ func (p *Plugin) handleUpdate(peer corebgp.PeerConfig, u []byte) *corebgp.Notifi
Protocol: unix.RTPROT_BGP}
slog.Info("routes: writing route", "table", p.RouteTable, "dz route", route.String())
if err := p.RouteReaderWriter.RouteAdd(route); err != nil {
slog.Error("routes: error writing route", "table", p.RouteTable, "error", err)
slog.Error("routes: error writing route", "table", p.RouteTable, "error", err, "route", route.String())
}
}
return nil
Expand Down
Loading
Loading