diff --git a/src/host-sweep.test.ts b/src/host-sweep.test.ts index d9505a4ae..eefcc8af0 100644 --- a/src/host-sweep.test.ts +++ b/src/host-sweep.test.ts @@ -39,14 +39,32 @@ describe('decideStuckAction', () => { expect(res.heartbeatAgeMs).toBeGreaterThan(ABSOLUTE_CEILING_MS); }); - it('treats an absent heartbeat file as infinitely stale', () => { + it('skips the ceiling check when no heartbeat file exists (fresh container not yet ticked)', () => { + // A freshly-spawned container hasn't produced any SDK events yet, so no + // heartbeat. Prior behavior treated this as infinitely stale and killed + // every container within seconds of spawn. With no claims either, we + // should conclude everything is fine. const res = decideStuckAction({ now: BASE, heartbeatMtimeMs: 0, containerState: null, claims: [], }); - expect(res.action).toBe('kill-ceiling'); + expect(res.action).toBe('ok'); + }); + + it('kills on claim-stuck when heartbeat is absent AND a claim has aged past tolerance', () => { + // Hanging fresh container: spawned, picked up a message (claim recorded + // in processing_ack), but never wrote a heartbeat. Falls through the + // skipped ceiling check into claim-stuck — which correctly fires. + const claimedAgeMs = CLAIM_STUCK_MS + 5_000; + const res = decideStuckAction({ + now: BASE, + heartbeatMtimeMs: 0, + containerState: null, + claims: [claim('msg-1', claimedAgeMs)], + }); + expect(res.action).toBe('kill-claim'); }); it('extends the ceiling when Bash has a declared timeout longer than 30 min', () => { @@ -105,7 +123,7 @@ describe('decideStuckAction', () => { const res = decideStuckAction({ now: BASE, // 5 min since claim, over the 60s default but under the declared Bash timeout - heartbeatMtimeMs: BASE - (5 * 60 * 1000) - 5_000, + heartbeatMtimeMs: BASE - 5 * 60 * 1000 - 5_000, containerState: { current_tool: 'Bash', tool_declared_timeout_ms: tenMinMs, diff --git a/src/host-sweep.ts b/src/host-sweep.ts index 0f8365c80..1a2901ccc 100644 --- a/src/host-sweep.ts +++ b/src/host-sweep.ts @@ -75,11 +75,21 @@ export function decideStuckAction(args: { }): StuckDecision { const { now, heartbeatMtimeMs, containerState, claims } = args; const declaredBashMs = bashTimeoutMs(containerState); - const heartbeatAge = heartbeatMtimeMs === 0 ? Infinity : now - heartbeatMtimeMs; - const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0); - if (heartbeatAge > ceiling) { - return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling }; + // Ceiling check only applies when we have an actual heartbeat timestamp. + // A freshly-spawned container hasn't had any SDK activity yet so no + // heartbeat file exists — if we treated that as infinitely stale we'd + // kill every container within seconds of spawn. Genuinely-dead containers + // that never wrote a heartbeat are caught by the separate "container + // process not running" cleanup path, not here. If a fresh container is + // hanging at the gate (claimed a message but never did anything) the + // claim-stuck check below handles it. + if (heartbeatMtimeMs !== 0) { + const heartbeatAge = now - heartbeatMtimeMs; + const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0); + if (heartbeatAge > ceiling) { + return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling }; + } } const tolerance = Math.max(CLAIM_STUCK_MS, declaredBashMs ?? 0);