mirror of
https://github.com/qwibitai/nanoclaw.git
synced 2026-06-04 10:14:47 +08:00
fix(host-sweep): skip ceiling check when heartbeat file is absent
decideStuckAction treated a missing heartbeat file as heartbeatAge = Infinity, which always exceeded the 30-minute ceiling. Result: every freshly-spawned container got killed within seconds of spawn on the first sweep pass because it hadn't produced an SDK event yet (the heartbeat is only touched on SDK events inside processQuery, not on boot). Skip the ceiling branch when heartbeatMtimeMs === 0. Containers that genuinely never wrote a heartbeat because they died are caught by the separate "container process not running" cleanup path. Containers that boot, claim a message, but hang at the gate are caught by the claim-stuck check that follows the ceiling check — which correctly fires regardless of heartbeat presence once claimAge exceeds tolerance. Updates the "absent heartbeat → kill-ceiling" test (which was encoding the bug) and adds a companion test asserting that the claim-stuck path still fires for absent-heartbeat containers with aged claims. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+21
-3
@@ -39,14 +39,32 @@ describe('decideStuckAction', () => {
|
||||
expect(res.heartbeatAgeMs).toBeGreaterThan(ABSOLUTE_CEILING_MS);
|
||||
});
|
||||
|
||||
it('treats an absent heartbeat file as infinitely stale', () => {
|
||||
it('skips the ceiling check when no heartbeat file exists (fresh container not yet ticked)', () => {
|
||||
// A freshly-spawned container hasn't produced any SDK events yet, so no
|
||||
// heartbeat. Prior behavior treated this as infinitely stale and killed
|
||||
// every container within seconds of spawn. With no claims either, we
|
||||
// should conclude everything is fine.
|
||||
const res = decideStuckAction({
|
||||
now: BASE,
|
||||
heartbeatMtimeMs: 0,
|
||||
containerState: null,
|
||||
claims: [],
|
||||
});
|
||||
expect(res.action).toBe('kill-ceiling');
|
||||
expect(res.action).toBe('ok');
|
||||
});
|
||||
|
||||
it('kills on claim-stuck when heartbeat is absent AND a claim has aged past tolerance', () => {
|
||||
// Hanging fresh container: spawned, picked up a message (claim recorded
|
||||
// in processing_ack), but never wrote a heartbeat. Falls through the
|
||||
// skipped ceiling check into claim-stuck — which correctly fires.
|
||||
const claimedAgeMs = CLAIM_STUCK_MS + 5_000;
|
||||
const res = decideStuckAction({
|
||||
now: BASE,
|
||||
heartbeatMtimeMs: 0,
|
||||
containerState: null,
|
||||
claims: [claim('msg-1', claimedAgeMs)],
|
||||
});
|
||||
expect(res.action).toBe('kill-claim');
|
||||
});
|
||||
|
||||
it('extends the ceiling when Bash has a declared timeout longer than 30 min', () => {
|
||||
@@ -105,7 +123,7 @@ describe('decideStuckAction', () => {
|
||||
const res = decideStuckAction({
|
||||
now: BASE,
|
||||
// 5 min since claim, over the 60s default but under the declared Bash timeout
|
||||
heartbeatMtimeMs: BASE - (5 * 60 * 1000) - 5_000,
|
||||
heartbeatMtimeMs: BASE - 5 * 60 * 1000 - 5_000,
|
||||
containerState: {
|
||||
current_tool: 'Bash',
|
||||
tool_declared_timeout_ms: tenMinMs,
|
||||
|
||||
+14
-4
@@ -75,11 +75,21 @@ export function decideStuckAction(args: {
|
||||
}): StuckDecision {
|
||||
const { now, heartbeatMtimeMs, containerState, claims } = args;
|
||||
const declaredBashMs = bashTimeoutMs(containerState);
|
||||
const heartbeatAge = heartbeatMtimeMs === 0 ? Infinity : now - heartbeatMtimeMs;
|
||||
|
||||
const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0);
|
||||
if (heartbeatAge > ceiling) {
|
||||
return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling };
|
||||
// Ceiling check only applies when we have an actual heartbeat timestamp.
|
||||
// A freshly-spawned container hasn't had any SDK activity yet so no
|
||||
// heartbeat file exists — if we treated that as infinitely stale we'd
|
||||
// kill every container within seconds of spawn. Genuinely-dead containers
|
||||
// that never wrote a heartbeat are caught by the separate "container
|
||||
// process not running" cleanup path, not here. If a fresh container is
|
||||
// hanging at the gate (claimed a message but never did anything) the
|
||||
// claim-stuck check below handles it.
|
||||
if (heartbeatMtimeMs !== 0) {
|
||||
const heartbeatAge = now - heartbeatMtimeMs;
|
||||
const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0);
|
||||
if (heartbeatAge > ceiling) {
|
||||
return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling };
|
||||
}
|
||||
}
|
||||
|
||||
const tolerance = Math.max(CLAIM_STUCK_MS, declaredBashMs ?? 0);
|
||||
|
||||
Reference in New Issue
Block a user