diff --git a/container/agent-runner/src/poll-loop.test.ts b/container/agent-runner/src/poll-loop.test.ts index 085cc61df..7b85faaaa 100644 --- a/container/agent-runner/src/poll-loop.test.ts +++ b/container/agent-runner/src/poll-loop.test.ts @@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from ' import { getPendingMessages, markCompleted } from './db/messages-in.js'; import { getUndeliveredMessages } from './db/messages-out.js'; import { formatMessages, extractRouting } from './formatter.js'; +import { isCorruptionError } from './poll-loop.js'; import { MockProvider } from './providers/mock.js'; beforeEach(() => { @@ -377,3 +378,20 @@ describe('end-to-end with mock provider', () => { expect(outMessages[0].in_reply_to).toBe('m1'); }); }); + +describe('isCorruptionError', () => { + it('matches the Docker Desktop macOS torn-read symptom', () => { + expect(isCorruptionError('database disk image is malformed')).toBe(true); + }); + + it('matches wrapped SQLite corruption codes', () => { + expect(isCorruptionError('SqliteError: SQLITE_CORRUPT_VTAB: ...')).toBe(true); + expect(isCorruptionError('file is not a database')).toBe(true); + }); + + it('returns false for unrelated errors', () => { + expect(isCorruptionError('database is locked')).toBe(false); + expect(isCorruptionError('no such table: messages_in')).toBe(false); + expect(isCorruptionError('')).toBe(false); + }); +}); diff --git a/container/agent-runner/src/poll-loop.ts b/container/agent-runner/src/poll-loop.ts index ca081f065..1b7d181a3 100644 --- a/container/agent-runner/src/poll-loop.ts +++ b/container/agent-runner/src/poll-loop.ts @@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types const POLL_INTERVAL_MS = 1000; const ACTIVE_POLL_INTERVAL_MS = 500; +/** + * Number of consecutive `database disk image is malformed` errors after which + * the follow-up poll gives up and exits the process. At ACTIVE_POLL_INTERVAL_MS + * = 500ms this is roughly 5 seconds — long enough to dodge a transient torn + * read during a host write, short enough to recover quickly from a poisoned + * page cache (host-sweep then respawns with a fresh mount). + */ +const CORRUPTION_STREAK_EXIT = 10; + +/** + * True for SQLite errors that indicate a corrupt READ view — almost always a + * cross-mount page-cache coherency issue on Docker Desktop macOS rather than + * actual file damage (host-side integrity_check passes). Reopening the DB + * handle inside this process does NOT recover; only a fresh container mount + * does. Caller's job is to exit so host-sweep respawns the container. + */ +export function isCorruptionError(msg: string): boolean { + return ( + msg.includes('database disk image is malformed') || + msg.includes('SQLITE_CORRUPT') || + msg.includes('file is not a database') + ); +} + function log(msg: string): void { console.error(`[poll-loop] ${msg}`); } @@ -291,6 +315,7 @@ async function processQuery( // will kill the container and messages get reset to pending. let pollInFlight = false; let endedForCommand = false; + let corruptionStreak = 0; const pollHandle = setInterval(() => { if (done || pollInFlight || endedForCommand) return; pollInFlight = true; @@ -362,6 +387,31 @@ async function processQuery( // path is not, so it needs its own. const errMsg = err instanceof Error ? err.message : String(err); log(`Follow-up poll error: ${errMsg}`); + + // Detect SQLite cross-mount corruption (Docker Desktop macOS virtiofs / + // gRPC-FUSE coherency bug — the kernel page cache for the inbound.db + // bind mount can latch a torn snapshot mid-host-write, after which + // every fresh openInboundDb() in this process sees the same broken + // view. Reopening inside the container does NOT recover; only a fresh + // container mount does. Exit so the host sweep respawns us. + if (isCorruptionError(errMsg)) { + corruptionStreak += 1; + if (corruptionStreak >= CORRUPTION_STREAK_EXIT) { + log( + `Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` + + `inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`, + ); + // Stop touching the heartbeat so host-sweep stale detection fires + // promptly even if exit() races with in-flight async work. + done = true; + clearInterval(pollHandle); + // Defer exit one tick so this log line flushes through Docker's + // log driver before the process dies. + setTimeout(() => process.exit(75), 100); + } + } else { + corruptionStreak = 0; + } } finally { pollInFlight = false; }