Merge pull request #2597 from kartast/fix/db-malformed-self-restart

fix(agent-runner): exit on persistent inbound.db corruption errors
This commit is contained in:
gavrielc
2026-05-23 20:06:33 +03:00
committed by GitHub
2 changed files with 68 additions and 0 deletions
@@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from '
import { getPendingMessages, markCompleted } from './db/messages-in.js';
import { getUndeliveredMessages } from './db/messages-out.js';
import { formatMessages, extractRouting } from './formatter.js';
import { isCorruptionError } from './poll-loop.js';
import { MockProvider } from './providers/mock.js';
beforeEach(() => {
@@ -377,3 +378,20 @@ describe('end-to-end with mock provider', () => {
expect(outMessages[0].in_reply_to).toBe('m1');
});
});
// Unit tests for the corruption-message classifier imported from poll-loop.
describe('isCorruptionError', () => {
  it('matches the Docker Desktop macOS torn-read symptom', () => {
    const tornReadMessage = 'database disk image is malformed';
    expect(isCorruptionError(tornReadMessage)).toBe(true);
  });

  it('matches wrapped SQLite corruption codes', () => {
    // Each of these must be classified as corruption.
    const corruptMessages = ['SqliteError: SQLITE_CORRUPT_VTAB: ...', 'file is not a database'];
    for (const message of corruptMessages) {
      expect(isCorruptionError(message)).toBe(true);
    }
  });

  it('returns false for unrelated errors', () => {
    // Ordinary SQLite failures (and the empty string) must not count.
    const unrelatedMessages = ['database is locked', 'no such table: messages_in', ''];
    for (const message of unrelatedMessages) {
      expect(isCorruptionError(message)).toBe(false);
    }
  });
});
+50
View File
@@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
// Poll cadence in milliseconds.
// NOTE(review): the loop that consumes this value is outside this view — confirm which poll it drives.
const POLL_INTERVAL_MS = 1000;
// Faster cadence in milliseconds; the streak documentation below treats it as
// the follow-up poll interval.
const ACTIVE_POLL_INTERVAL_MS = 500;
/**
 * Number of consecutive corruption errors (any message accepted by
 * isCorruptionError, not only `database disk image is malformed`) after which
 * the follow-up poll gives up and exits the process. A follow-up poll error
 * that is not a corruption error resets the streak to zero. At
 * ACTIVE_POLL_INTERVAL_MS = 500ms this is roughly 5 seconds — long enough to
 * dodge a transient torn read during a host write, short enough to recover
 * quickly from a poisoned page cache (host-sweep then respawns with a fresh
 * mount).
 */
const CORRUPTION_STREAK_EXIT = 10;
/**
 * True for SQLite errors that indicate a corrupt READ view — almost always a
 * cross-mount page-cache coherency issue on Docker Desktop macOS rather than
 * actual file damage (host-side integrity_check passes). Reopening the DB
 * handle inside this process does NOT recover; only a fresh container mount
 * does. Caller's job is to exit so host-sweep respawns the container.
 *
 * Matching is a case-sensitive substring test, so it works on raw SQLite
 * messages as well as on messages wrapped by a driver or logger. Both the
 * human-readable text and the result-code token are recognised for each
 * failure class:
 *   - `database disk image is malformed` / `SQLITE_CORRUPT` (also covers
 *     extended codes such as `SQLITE_CORRUPT_VTAB`)
 *   - `file is not a database` / `SQLITE_NOTADB`
 *
 * @param msg Error message text to classify; the empty string is never a match.
 * @returns `true` when the message names one of the corruption failures above.
 */
export function isCorruptionError(msg: string): boolean {
  return (
    msg.includes('database disk image is malformed') ||
    msg.includes('SQLITE_CORRUPT') ||
    msg.includes('file is not a database') ||
    // Result-code token for "file is not a database"; without it a wrapped
    // error that carries only the code name would slip through.
    msg.includes('SQLITE_NOTADB')
  );
}
/**
 * Writes a single diagnostic line to stderr, tagged with the poll-loop prefix.
 *
 * @param msg Text to emit after the `[poll-loop] ` tag.
 */
function log(msg: string): void {
  const taggedLine = '[poll-loop] ' + msg;
  console.error(taggedLine);
}
@@ -291,6 +315,7 @@ async function processQuery(
// will kill the container and messages get reset to pending.
let pollInFlight = false;
let endedForCommand = false;
let corruptionStreak = 0;
const pollHandle = setInterval(() => {
if (done || pollInFlight || endedForCommand) return;
pollInFlight = true;
@@ -362,6 +387,31 @@ async function processQuery(
// path is not, so it needs its own.
const errMsg = err instanceof Error ? err.message : String(err);
log(`Follow-up poll error: ${errMsg}`);
// Detect SQLite cross-mount corruption (a Docker Desktop macOS virtiofs /
// gRPC-FUSE coherency bug): the kernel page cache for the inbound.db
// bind mount can latch a torn snapshot mid-host-write, after which
// every fresh openInboundDb() in this process sees the same broken
// view. Reopening inside the container does NOT recover; only a fresh
// container mount does. Once the streak reaches CORRUPTION_STREAK_EXIT,
// exit so the host sweep respawns us.
if (isCorruptionError(errMsg)) {
corruptionStreak += 1;
if (corruptionStreak >= CORRUPTION_STREAK_EXIT) {
log(
`Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` +
`inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`,
);
// Stop touching the heartbeat so host-sweep stale detection fires
// promptly even if exit() races with in-flight async work.
done = true;
clearInterval(pollHandle);
// Defer exit by 100 ms so this log line can flush through Docker's
// log driver before the process dies.
setTimeout(() => process.exit(75), 100);
}
} else {
corruptionStreak = 0;
}
} finally {
pollInFlight = false;
}