fix(host-sweep): skip ceiling check when heartbeat file is absent

decideStuckAction treated a missing heartbeat file as heartbeatAge =
Infinity, which always exceeded the 30-minute ceiling. Result: every
freshly-spawned container got killed within seconds of spawn on the
first sweep pass because it hadn't produced an SDK event yet (heartbeat
is only touched on SDK events inside processQuery, not on boot).

Skip the ceiling branch when heartbeatMtimeMs === 0. Containers that
genuinely never wrote a heartbeat because they died are caught by the
separate "container process not running" cleanup path. Containers that
boot, claim a message, but hang at the gate are caught by the
claim-stuck check below — which correctly fires regardless of heartbeat
presence once claimAge exceeds tolerance.

Updates the "absent heartbeat → kill-ceiling" test (which was encoding
the bug) and adds a companion test asserting that the claim-stuck path
still fires for absent-heartbeat containers with aged claims.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-04-20 12:15:52 +03:00
parent 31f2da9585
commit 0105de0257
2 changed files with 35 additions and 7 deletions
+21 -3
View File
@@ -39,14 +39,32 @@ describe('decideStuckAction', () => {
expect(res.heartbeatAgeMs).toBeGreaterThan(ABSOLUTE_CEILING_MS);
});
it('treats an absent heartbeat file as infinitely stale', () => {
it('skips the ceiling check when no heartbeat file exists (fresh container not yet ticked)', () => {
// A freshly-spawned container hasn't produced any SDK events yet, so no
// heartbeat. Prior behavior treated this as infinitely stale and killed
// every container within seconds of spawn. With no claims either, we
// should conclude everything is fine.
const res = decideStuckAction({
now: BASE,
heartbeatMtimeMs: 0,
containerState: null,
claims: [],
});
expect(res.action).toBe('kill-ceiling');
expect(res.action).toBe('ok');
});
it('kills on claim-stuck when heartbeat is absent AND a claim has aged past tolerance', () => {
// Hanging fresh container: spawned, picked up a message (claim recorded
// in processing_ack), but never wrote a heartbeat. Falls through the
// skipped ceiling check into claim-stuck — which correctly fires.
const claimedAgeMs = CLAIM_STUCK_MS + 5_000;
const res = decideStuckAction({
now: BASE,
heartbeatMtimeMs: 0,
containerState: null,
claims: [claim('msg-1', claimedAgeMs)],
});
expect(res.action).toBe('kill-claim');
});
it('extends the ceiling when Bash has a declared timeout longer than 30 min', () => {
@@ -105,7 +123,7 @@ describe('decideStuckAction', () => {
const res = decideStuckAction({
now: BASE,
// 5 min since claim, over the 60s default but under the declared Bash timeout
heartbeatMtimeMs: BASE - (5 * 60 * 1000) - 5_000,
heartbeatMtimeMs: BASE - 5 * 60 * 1000 - 5_000,
containerState: {
current_tool: 'Bash',
tool_declared_timeout_ms: tenMinMs,
+14 -4
View File
@@ -75,11 +75,21 @@ export function decideStuckAction(args: {
}): StuckDecision {
const { now, heartbeatMtimeMs, containerState, claims } = args;
const declaredBashMs = bashTimeoutMs(containerState);
const heartbeatAge = heartbeatMtimeMs === 0 ? Infinity : now - heartbeatMtimeMs;
const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0);
if (heartbeatAge > ceiling) {
return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling };
// Ceiling check only applies when we have an actual heartbeat timestamp.
// A freshly-spawned container hasn't had any SDK activity yet so no
// heartbeat file exists — if we treated that as infinitely stale we'd
// kill every container within seconds of spawn. Genuinely-dead containers
// that never wrote a heartbeat are caught by the separate "container
// process not running" cleanup path, not here. If a fresh container is
// hanging at the gate (claimed a message but never did anything) the
// claim-stuck check below handles it.
if (heartbeatMtimeMs !== 0) {
const heartbeatAge = now - heartbeatMtimeMs;
const ceiling = Math.max(ABSOLUTE_CEILING_MS, declaredBashMs ?? 0);
if (heartbeatAge > ceiling) {
return { action: 'kill-ceiling', heartbeatAgeMs: heartbeatAge, ceilingMs: ceiling };
}
}
const tolerance = Math.max(CLAIM_STUCK_MS, declaredBashMs ?? 0);