make reconstruction timeout dynamic + prevent reconstruction in nonSupernodes #7628
beacon_chain/nimbus_beacon_node.nim:

```diff
@@ -1713,6 +1713,16 @@ proc reconstructDataColumns(node: BeaconNode, slot: Slot) =
     warn "Failed to get the current slot head"
     return
 
+  let
+    currentCgc = node.dataColumnQuarantine.custodyColumns.lenu64
+    nonSupernode =
+      currentCgc > node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS div 2 and
+      currentCgc < node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS
+
+  if not(node.config.peerdasSupernode) or
+      nonSupernode:
```
**Review comment:**

So, not really sure why this condition is two lines. But, beyond the most superficial cosmetics, earlier in the function there's already a custody-column check (nimbus-eth2/beacon_chain/nimbus_beacon_node.nim, lines 1705 to 1707 at d71ea30) which is effectively part of this logic too. I'm not sure why these two chunks of custody-column checking should exist separately, and it's simpler if they're combined: to achieve a similar effect, check once for `not(node.config.peerdasSupernode) or node.dataColumnQuarantine.custodyColumns.lenu64 < node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS`. Nothing about this function's current behavior changes between a custody count of 63 and 65 (why 64? is that intentional?); it's just ruling everything under 128 out.

I'm not really sure in what circumstances […]
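A minimal sketch of the single combined check described above, written against the names used in the diff (`node.config.peerdasSupernode`, `node.dataColumnQuarantine.custodyColumns`, `node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS`); this illustrates the suggestion and is not code from the PR:

```nim
# Sketch: one early exit instead of two separate custody-column chunks.
# Reconstruction proceeds only when the node is configured as a supernode
# and actually custodies all NUMBER_OF_CUSTODY_GROUPS custody groups.
if not(node.config.peerdasSupernode) or
    node.dataColumnQuarantine.custodyColumns.lenu64 <
      node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS:
  return
```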
```diff
+    return
+
   withBlck(blck):
     when consensusFork >= ConsensusFork.Fulu:
       let maxColCount = node.dag.cfg.NUMBER_OF_COLUMNS
```
```diff
@@ -1741,7 +1751,8 @@ proc reconstructDataColumns(node: BeaconNode, slot: Slot) =
 
       # Reconstruct columns
       let recovered = recover_cells_and_proofs_parallel(
-        node.batchVerifier[].taskpool, columns).valueOr:
+        node.batchVerifier[].taskpool, columns,
+        node.dag.cfg.time.SECONDS_PER_SLOT.int64).valueOr:
         error "Data column reconstruction incomplete"
         return
       let rowCount = recovered.len
```
Second changed file (the data-column helper procs):
```diff
@@ -9,7 +9,7 @@
 
 # Uncategorized helper functions from the spec
 import
-  chronos, chronicles, results, taskpools,
+  chronos, chronicles, results, taskpools, times,
```
**Review comment:**

The standard-library module should be imported with its explicit `std/` prefix:

```nim
import
  std/times,
  chronos, chronicles, results, taskpools,
```
```diff
   eth/p2p/discoveryv5/node,
   kzg4844/kzg,
   ssz_serialization/[
```
```diff
@@ -151,7 +151,8 @@ proc recoverCellsAndKzgProofsTask(cellIndices: seq[CellIndex],
 
 proc recover_cells_and_proofs_parallel*(
     tp: Taskpool,
-    dataColumns: seq[ref fulu.DataColumnSidecar]):
+    dataColumns: seq[ref fulu.DataColumnSidecar],
+    slotDuration: int64):
     Result[seq[CellsAndProofs], cstring] =
   ## This helper recovers blobs from the data column sidecars parallelly
   if dataColumns.len == 0:
```
```diff
@@ -170,13 +171,15 @@ proc recover_cells_and_proofs_parallel*(
     res = newSeq[CellsAndProofs](blobCount)
 
   let startTime = Moment.now()
-  const reconstructionTimeout = 2.seconds
+  let reconstructionTimeout =
+    (initDuration(nanoseconds = slotDuration * 100_000_000)).inNanoseconds()
```
**Review comment:**

More generally, one should be able to do something like `let reconstructionTimeout = slotDuration * 1000` or similar, rather than round-tripping through these `initDuration`/`inNanoseconds` conversions. While the two `if (now - startTime).nanoseconds > reconstructionTimeout` lines use […]
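A minimal sketch of keeping the timeout entirely in chronos types, assuming `slotDuration` is a count of seconds (the call site passes `SECONDS_PER_SLOT`); the proc name is hypothetical and this is illustrative only, not the PR's code:

```nim
import chronos, chronicles

proc reconstructWithDeadline(slotDuration: int64) =
  # Sketch: build the timeout as a chronos Duration and compare Durations
  # directly, with no std/times round-trip through nanoseconds.
  let
    startTime = Moment.now()
    reconstructionTimeout = seconds(slotDuration)  # chronos timer helper

  # ... spawn/sync work would happen here ...
  if Moment.now() - startTime > reconstructionTimeout:
    debug "PeerDAS column reconstruction timed out"
```

The original `const reconstructionTimeout = 2.seconds` already used this chronos helper, so the dynamic version can stay in the same type.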
```diff
 
   # ---- Spawn phase with time limit ----
   for blobIdx in 0 ..< blobCount:
     let now = Moment.now()
-    if (now - startTime) > reconstructionTimeout:
-      debug "PeerDAS reconstruction timed out while preparing columns",
+    if (now - startTime).nanoseconds > reconstructionTimeout:
+      debug "PeerDAS column reconstruction timed out while preparing columns",
         spawned = pendingFuts.len, total = blobCount
       break # Stop spawning new tasks
```
```diff
@@ -191,8 +194,8 @@ proc recover_cells_and_proofs_parallel*(
   # ---- Sync phase ----
   for i in 0 ..< pendingFuts.len:
     let now = Moment.now()
-    if (now - startTime) > reconstructionTimeout:
-      debug "PeerDAS reconstruction timed out",
+    if (now - startTime).nanoseconds > reconstructionTimeout:
+      debug "PeerDAS column reconstruction timed out while preparing columns",
         completed = i, totalSpawned = pendingFuts.len
       return err("Data column reconstruction timed out")
```
```diff
@@ -207,6 +210,11 @@ proc recover_cells_and_proofs_parallel*(
 
   ok(res)
 
+proc recover_cells_and_proofs_parallel*(
+    tp: Taskpool,
+    dataColumns: seq[ref fulu.DataColumnSidecar]):
+    Result[seq[CellsAndProofs], cstring] =
+  recover_cells_and_proofs_parallel(tp, dataColumns, 10000'i64)
+
 proc assemble_data_column_sidecars*(
     signed_beacon_block: fulu.SignedBeaconBlock | gloas.SignedBeaconBlock,
```
**Review comment:**

This boundary condition seems to create an odd hole for `node.dataColumnQuarantine.custodyColumns.lenu64 == node.dag.cfg.NUMBER_OF_CUSTODY_GROUPS div 2`: it won't be rejected by the earlier custody-column check, and it won't trigger `nonSupernode` status here either (64 > 64 and 64 < 128 == false). Maybe `>=` was meant? But see the comment above about combining the two custody checks for a slightly broader, related point.
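To make the boundary concrete, here is a small self-contained sketch (the helper name is hypothetical, and `NUMBER_OF_CUSTODY_GROUPS` is hard-coded to 128 so that `div 2` is 64) reproducing the diff's `nonSupernode` comparison:

```nim
const numberOfCustodyGroups = 128'u64

proc isNonSupernode(currentCgc: uint64): bool =
  # Same comparison as the diff's `nonSupernode`, with the config value inlined.
  currentCgc > numberOfCustodyGroups div 2 and
    currentCgc < numberOfCustodyGroups

when isMainModule:
  for cgc in [63'u64, 64, 65, 127, 128]:
    echo cgc, " -> nonSupernode = ", isNonSupernode(cgc)
  # 63 -> false, 64 -> false (the hole), 65 -> true, 127 -> true, 128 -> false
```

With the comparison as written, a custody count of exactly 64 is neither rejected by the earlier check nor flagged as `nonSupernode`, which is the hole described above; `>=` (or the single combined `< NUMBER_OF_CUSTODY_GROUPS` check from the earlier comment) would close it.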