mirror of
https://github.com/qwibitai/nanoclaw.git
synced 2026-06-04 10:14:47 +08:00
fix(agent-runner): exit on persistent inbound.db corruption errors
The follow-up poll catches and logs SQLite errors but never recovers from them. On Docker Desktop macOS, the kernel page cache for the inbound.db bind mount can latch a torn snapshot mid-host-write (a known virtiofs / gRPC-FUSE coherency issue), after which every fresh openInboundDb() in the same process sees the same broken view and emits 'database disk image is malformed' at the poll rate (2/sec). Reopening the DB handle inside the container does not recover — only a fresh container mount does.

The fix: after CORRUPTION_STREAK_EXIT consecutive corruption errors (~5s), log a clear message and call process.exit(75) so host-sweep respawns the container with a fresh mount. Transient single torn reads are still tolerated.

- Add isCorruptionError() helper covering the three SQLite read-side corruption symptoms (disk image malformed, SQLITE_CORRUPT, file is not a database).
- Add streak counter scoped to processQuery's pollHandle so it resets on any successful poll or non-corruption error.
- Add unit tests for the matcher.

Refs the cross-mount invariants documented in db/connection.ts:11-18.
This commit is contained in:
@@ -4,6 +4,7 @@ import { initTestSessionDb, closeSessionDb, getInboundDb, getOutboundDb } from '
|
|||||||
import { getPendingMessages, markCompleted } from './db/messages-in.js';
|
import { getPendingMessages, markCompleted } from './db/messages-in.js';
|
||||||
import { getUndeliveredMessages } from './db/messages-out.js';
|
import { getUndeliveredMessages } from './db/messages-out.js';
|
||||||
import { formatMessages, extractRouting } from './formatter.js';
|
import { formatMessages, extractRouting } from './formatter.js';
|
||||||
|
import { isCorruptionError } from './poll-loop.js';
|
||||||
import { MockProvider } from './providers/mock.js';
|
import { MockProvider } from './providers/mock.js';
|
||||||
|
|
||||||
beforeEach(() => {
|
beforeEach(() => {
|
||||||
@@ -375,3 +376,20 @@ describe('end-to-end with mock provider', () => {
|
|||||||
expect(outMessages[0].in_reply_to).toBe('m1');
|
expect(outMessages[0].in_reply_to).toBe('m1');
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Unit tests for the isCorruptionError message matcher.
describe('isCorruptionError', () => {
  // Asserts the matcher's verdict for a single raw error message.
  const expectVerdict = (message: string, verdict: boolean): void => {
    expect(isCorruptionError(message)).toBe(verdict);
  };

  it('matches the Docker Desktop macOS torn-read symptom', () => {
    expectVerdict('database disk image is malformed', true);
  });

  it('matches wrapped SQLite corruption codes', () => {
    const corruptMessages = ['SqliteError: SQLITE_CORRUPT_VTAB: ...', 'file is not a database'];
    for (const message of corruptMessages) {
      expectVerdict(message, true);
    }
  });

  it('returns false for unrelated errors', () => {
    const unrelatedMessages = ['database is locked', 'no such table: messages_in', ''];
    for (const message of unrelatedMessages) {
      expectVerdict(message, false);
    }
  });
});
|
||||||
|
|||||||
@@ -18,6 +18,30 @@ import type { AgentProvider, AgentQuery, ProviderEvent } from './providers/types
|
|||||||
const POLL_INTERVAL_MS = 1000;
|
const POLL_INTERVAL_MS = 1000;
|
||||||
const ACTIVE_POLL_INTERVAL_MS = 500;
|
const ACTIVE_POLL_INTERVAL_MS = 500;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of consecutive SQLite corruption errors (as classified by isCorruptionError) after which
|
||||||
|
* the follow-up poll gives up and exits the process. At ACTIVE_POLL_INTERVAL_MS
|
||||||
|
* = 500ms this is roughly 5 seconds — long enough to dodge a transient torn
|
||||||
|
* read during a host write, short enough to recover quickly from a poisoned
|
||||||
|
* page cache (host-sweep then respawns with a fresh mount).
|
||||||
|
*/
|
||||||
|
const CORRUPTION_STREAK_EXIT = 10;
|
||||||
|
|
||||||
|
/**
 * Substrings that mark a SQLite error message as a corrupt READ view.
 *
 * Each symptom is listed by both its human-readable message text and its
 * result-code name, because wrappers may surface either one. Matching on
 * the bare `SQLITE_CORRUPT` prefix also covers extended codes such as
 * `SQLITE_CORRUPT_VTAB`.
 */
const CORRUPTION_ERROR_MARKERS: readonly string[] = [
  'database disk image is malformed', // message text for SQLITE_CORRUPT
  'SQLITE_CORRUPT', // result-code name (and its extended variants)
  'file is not a database', // message text for SQLITE_NOTADB
  'SQLITE_NOTADB', // result-code name for the message above
];

/**
 * True for SQLite errors that indicate a corrupt READ view — almost always a
 * cross-mount page-cache coherency issue on Docker Desktop macOS rather than
 * actual file damage (host-side integrity_check passes). Reopening the DB
 * handle inside this process does NOT recover; only a fresh container mount
 * does. Caller's job is to exit so host-sweep respawns the container.
 *
 * Matching is a case-sensitive substring test, so the markers are found even
 * when the message is wrapped (e.g. `SqliteError: SQLITE_CORRUPT_VTAB: ...`).
 *
 * @param msg - Error message text to classify.
 * @returns `true` when `msg` contains any known corruption marker; `false`
 *   otherwise, including for the empty string.
 */
export function isCorruptionError(msg: string): boolean {
  return CORRUPTION_ERROR_MARKERS.some((marker) => msg.includes(marker));
}
|
||||||
|
|
||||||
/**
 * Writes one diagnostic line to stderr via console.error, prefixed with the
 * `[poll-loop]` tag.
 *
 * @param msg - Text to emit after the tag.
 */
function log(msg: string): void {
|
function log(msg: string): void {
|
||||||
console.error(`[poll-loop] ${msg}`);
|
console.error(`[poll-loop] ${msg}`);
|
||||||
}
|
}
|
||||||
@@ -291,6 +315,7 @@ async function processQuery(
|
|||||||
// will kill the container and messages get reset to pending.
|
// will kill the container and messages get reset to pending.
|
||||||
let pollInFlight = false;
|
let pollInFlight = false;
|
||||||
let endedForCommand = false;
|
let endedForCommand = false;
|
||||||
|
let corruptionStreak = 0;
|
||||||
const pollHandle = setInterval(() => {
|
const pollHandle = setInterval(() => {
|
||||||
if (done || pollInFlight || endedForCommand) return;
|
if (done || pollInFlight || endedForCommand) return;
|
||||||
pollInFlight = true;
|
pollInFlight = true;
|
||||||
@@ -362,6 +387,31 @@ async function processQuery(
|
|||||||
// path is not, so it needs its own.
|
// path is not, so it needs its own.
|
||||||
const errMsg = err instanceof Error ? err.message : String(err);
|
const errMsg = err instanceof Error ? err.message : String(err);
|
||||||
log(`Follow-up poll error: ${errMsg}`);
|
log(`Follow-up poll error: ${errMsg}`);
|
||||||
|
|
||||||
|
// Detect SQLite cross-mount corruption (Docker Desktop macOS virtiofs /
|
||||||
|
// gRPC-FUSE coherency bug — the kernel page cache for the inbound.db
|
||||||
|
// bind mount can latch a torn snapshot mid-host-write, after which
|
||||||
|
// every fresh openInboundDb() in this process sees the same broken
|
||||||
|
// view. Reopening inside the container does NOT recover; only a fresh
|
||||||
|
// container mount does. Exit so the host sweep respawns us.
|
||||||
|
if (isCorruptionError(errMsg)) {
|
||||||
|
corruptionStreak += 1;
|
||||||
|
if (corruptionStreak >= CORRUPTION_STREAK_EXIT) {
|
||||||
|
log(
|
||||||
|
`Follow-up poll: ${corruptionStreak} consecutive '${errMsg}' errors — ` +
|
||||||
|
`inbound.db page cache is poisoned. Exiting so host respawns with a fresh mount.`,
|
||||||
|
);
|
||||||
|
// Stop touching the heartbeat so host-sweep stale detection fires
|
||||||
|
// promptly even if exit() races with in-flight async work.
|
||||||
|
done = true;
|
||||||
|
clearInterval(pollHandle);
|
||||||
|
// Defer exit by 100ms so this log line flushes through Docker's
|
||||||
|
// log driver before the process dies.
|
||||||
|
setTimeout(() => process.exit(75), 100);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
corruptionStreak = 0;
|
||||||
|
}
|
||||||
} finally {
|
} finally {
|
||||||
pollInFlight = false;
|
pollInFlight = false;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user