mirror of
https://github.com/qwibitai/nanoclaw.git
synced 2026-06-04 10:14:47 +08:00
fix(agent-runner): exit on persistent inbound.db corruption errors
The follow-up poll catches and logs SQLite errors but never recovers from them. On Docker Desktop macOS, the kernel page cache for the inbound.db bind mount can latch a torn snapshot mid-host-write (a known virtiofs / gRPC-FUSE coherency issue), after which every fresh openInboundDb() in the same process sees the same broken view and emits 'database disk image is malformed' at the poll rate (2/sec). Reopening the DB handle inside the container does not recover — only a fresh container mount does.

The fix: after CORRUPTION_STREAK_EXIT consecutive corruption errors (~5s), log a clear message and call process.exit(75) so host-sweep respawns the container with a fresh mount. Transient single torn reads are still tolerated.

- Add isCorruptionError() helper covering the three SQLite read-side corruption symptoms (disk image malformed, SQLITE_CORRUPT, file is not a database).
- Add streak counter scoped to processQuery's pollHandle so it resets on any successful poll or non-corruption error.
- Add unit tests for the matcher.

Refs the cross-mount invariants documented in db/connection.ts:11-18.
This commit is contained in:
@@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from '
|
||||
import { getPendingMessages, markCompleted } from './db/messages-in.js';
|
||||
import { getUndeliveredMessages } from './db/messages-out.js';
|
||||
import { formatMessages, extractRouting } from './formatter.js';
|
||||
import { isCorruptionError } from './poll-loop.js';
|
||||
import { MockProvider } from './providers/mock.js';
|
||||
|
||||
beforeEach(() => {
|
||||
@@ -375,3 +376,20 @@ describe('end-to-end with mock provider', () => {
|
||||
expect(outMessages[0].in_reply_to).toBe('m1');
|
||||
});
|
||||
});
|
||||
|
||||
// Unit tests for the corruption-message matcher exported by poll-loop.
describe('isCorruptionError', () => {
  it('matches the Docker Desktop macOS torn-read symptom', () => {
    const tornReadMessage = 'database disk image is malformed';
    expect(isCorruptionError(tornReadMessage)).toBe(true);
  });

  it('matches wrapped SQLite corruption codes', () => {
    // An extended result-code name embedded in a wrapper, then the plain
    // "not a database" message.
    const corruptMessages = ['SqliteError: SQLITE_CORRUPT_VTAB: ...', 'file is not a database'];
    for (const message of corruptMessages) {
      expect(isCorruptionError(message)).toBe(true);
    }
  });

  it('returns false for unrelated errors', () => {
    // Lock contention, schema errors and the empty string must not match.
    const unrelatedMessages = ['database is locked', 'no such table: messages_in', ''];
    for (const message of unrelatedMessages) {
      expect(isCorruptionError(message)).toBe(false);
    }
  });
});
|
||||
|
||||
@@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
|
||||
const POLL_INTERVAL_MS = 1000;
|
||||
const ACTIVE_POLL_INTERVAL_MS = 500;
|
||||
|
||||
/**
|
||||
* Number of consecutive SQLite corruption errors (as matched by isCorruptionError) after which
|
||||
* the follow-up poll gives up and exits the process. At ACTIVE_POLL_INTERVAL_MS
|
||||
* = 500ms this is roughly 5 seconds — long enough to dodge a transient torn
|
||||
* read during a host write, short enough to recover quickly from a poisoned
|
||||
* page cache (host-sweep then respawns with a fresh mount).
|
||||
*/
|
||||
const CORRUPTION_STREAK_EXIT = 10;
|
||||
|
||||
/**
 * Substrings that identify a SQLite error message as a corrupt-read symptom.
 * Both the standard English messages and the result-code names are listed so
 * that wrapped errors carrying only the code name are recognised too.
 * `SQLITE_CORRUPT` also matches extended code names such as
 * `SQLITE_CORRUPT_VTAB` because matching is by substring.
 */
const CORRUPTION_ERROR_MARKERS: readonly string[] = [
  'database disk image is malformed',
  'SQLITE_CORRUPT',
  'file is not a database',
  'SQLITE_NOTADB',
];

/**
 * True for SQLite errors that indicate a corrupt READ view — almost always a
 * cross-mount page-cache coherency issue on Docker Desktop macOS rather than
 * actual file damage (host-side integrity_check passes). Reopening the DB
 * handle inside this process does NOT recover; only a fresh container mount
 * does. Caller's job is to exit so host-sweep respawns the container.
 *
 * @param msg - Error message text to inspect (typically `err.message`).
 * @returns `true` when `msg` contains any known corruption marker; `false`
 *   otherwise, including for the empty string. Matching is case-sensitive.
 */
export function isCorruptionError(msg: string): boolean {
  return CORRUPTION_ERROR_MARKERS.some((marker) => msg.includes(marker));
}
|
||||
|
||||
/**
 * Emits a diagnostic line through `console.error`, tagged with the
 * `[poll-loop]` prefix.
 *
 * @param msg - Text to append after the prefix.
 */
function log(msg: string): void {
  const line = '[poll-loop] ' + msg;
  console.error(line);
}
|
||||
@@ -291,6 +315,7 @@ async function processQuery(
|
||||
// will kill the container and messages get reset to pending.
|
||||
let pollInFlight = false;
|
||||
let endedForCommand = false;
|
||||
let corruptionStreak = 0;
|
||||
const pollHandle = setInterval(() => {
|
||||
if (done || pollInFlight || endedForCommand) return;
|
||||
pollInFlight = true;
|
||||
@@ -362,6 +387,31 @@ async function processQuery(
|
||||
// path is not, so it needs its own.
|
||||
const errMsg = err instanceof Error ? err.message : String(err);
|
||||
log(`Follow-up poll error: ${errMsg}`);
|
||||
|
||||
// Detect SQLite cross-mount corruption (Docker Desktop macOS virtiofs /
|
||||
// gRPC-FUSE coherency bug — the kernel page cache for the inbound.db
|
||||
// bind mount can latch a torn snapshot mid-host-write, after which
|
||||
// every fresh openInboundDb() in this process sees the same broken
|
||||
// view). Reopening inside the container does NOT recover; only a fresh
|
||||
// container mount does. Exit so the host sweep respawns us.
|
||||
if (isCorruptionError(errMsg)) {
|
||||
corruptionStreak += 1;
|
||||
if (corruptionStreak >= CORRUPTION_STREAK_EXIT) {
|
||||
log(
|
||||
`Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` +
|
||||
`inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`,
|
||||
);
|
||||
// Stop touching the heartbeat so host-sweep stale detection fires
|
||||
// promptly even if exit() races with in-flight async work.
|
||||
done = true;
|
||||
clearInterval(pollHandle);
|
||||
// Defer exit briefly (100 ms) so this log line flushes through Docker's
|
||||
// log driver before the process dies.
|
||||
setTimeout(() => process.exit(75), 100);
|
||||
}
|
||||
} else {
|
||||
corruptionStreak = 0;
|
||||
}
|
||||
} finally {
|
||||
pollInFlight = false;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user