Skip to content

Commit 13fb3d2

Browse files
macfarla and claude committed
Increase snap task timeouts to handle cold-cache server latency
The Besu snap server performs synchronous disk I/O (trie proof generation) on the Netty event loop, which can exceed 5s on a cold RocksDB cache. This caused clients to time out before receiving a response, accumulating 5 timeout counts and triggering a TIMEOUT disconnect. Fix: raise GetAccountRangeFromPeerTask timeout to 20s for sync tasks, 10s for the snap server probe, and widen the EthPeers probe wait to 12s. Also fix SnapServerChecker to complete the future on probe failure so isServingSnap is not incorrectly set to false. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent e1e5751 commit 13fb3d2

File tree

3 files changed

+18
-5
lines changed

3 files changed

+18
-5
lines changed

ethereum/eth/src/main/java/org/hyperledger/besu/ethereum/eth/manager/EthPeers.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ private void checkIsSnapServer(final EthPeer peer, final BlockHeader peersHeadBl
654654
peer.setIsServingSnap(true);
655655
Boolean isServer;
656656
try {
657-
isServer = snapServerChecker.check(peer, peersHeadBlockHeader).get(6L, TimeUnit.SECONDS);
657+
isServer = snapServerChecker.check(peer, peersHeadBlockHeader).get(12L, TimeUnit.SECONDS);
658658
} catch (Exception e) {
659659
LOG.atTrace()
660660
.setMessage("Error checking if peer {} is a snap server. Setting to false.")

ethereum/eth/src/main/java/org/hyperledger/besu/ethereum/eth/manager/snap/RetryingGetAccountRangeFromPeerTask.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ protected CompletableFuture<AccountRangeMessage.AccountRangeData> executeTaskOnC
7272
final GetAccountRangeFromPeerTask task =
7373
GetAccountRangeFromPeerTask.forAccountRange(
7474
ethContext, startKeyHash, endKeyHash, blockHeader, metricsSystem);
75+
// Use a longer timeout than the default 5s: the snap server does synchronous disk I/O
76+
// (trie proof generation) on the Netty event loop, which can exceed 5s on cold cache.
77+
task.setTimeout(java.time.Duration.ofSeconds(20));
7578
return executeSubTask(task::run)
7679
.thenApply(
7780
peerResult -> {

ethereum/eth/src/main/java/org/hyperledger/besu/ethereum/eth/sync/SnapServerChecker.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,30 @@ public CompletableFuture<Boolean> check(final EthPeer peer, final BlockHeader pe
7272
.log();
7373
future.complete(false);
7474
}
75+
} else {
76+
LOG.atTrace()
77+
.setMessage("Snap server probe for peer {} failed ({}), marking as not a snap server.")
78+
.addArgument(peer::getLoggableId)
79+
.addArgument(error != null ? error.getMessage() : "cancelled")
80+
.log();
81+
future.complete(false);
7582
}
7683
});
7784
return future;
7885
}
7986

8087
public CompletableFuture<AbstractPeerTask.PeerTaskResult<AccountRangeMessage.AccountRangeData>>
8188
getAccountRangeFromPeer(final EthPeer peer, final BlockHeader header) {
82-
return GetAccountRangeFromPeerTask.forAccountRange(
89+
final GetAccountRangeFromPeerTask task =
90+
GetAccountRangeFromPeerTask.forAccountRange(
8391
ethContext,
8492
Bytes32.wrap(Hash.ZERO.getBytes()),
8593
Bytes32.wrap(Hash.ZERO.getBytes()),
8694
header,
87-
metricsSystem)
88-
.assignPeer(peer)
89-
.run();
95+
metricsSystem);
96+
// Use a longer timeout than the default 5s: the snap server does synchronous disk I/O
97+
// (trie proof generation) on the Netty event loop, which can exceed 5s on cold cache.
98+
task.setTimeout(java.time.Duration.ofSeconds(10));
99+
return task.assignPeer(peer).run();
90100
}
91101
}

0 commit comments

Comments
 (0)