fix(v2/delivery): prevent double-send from overlapping delivery polls

The active poll (1s, running sessions) and sweep poll (60s, all active
sessions) both call deliverSessionMessages, and a running session is in
both result sets. Without locking they race on the same outbound row:
both read it as undelivered, both call the channel adapter, both
markDelivered. INSERT OR IGNORE hides the DB collision but the user has
already received the message twice.

Adds a per-session inflight guard; the second concurrent caller skips
and picks up any leftover work on the next poll tick.

Also makes outbox cleanup in deliverMessage best-effort: the message is
already on the user's screen, so a cleanup throw must not propagate to
the retry path (which would resend).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-04-17 15:16:18 +03:00
parent bfc626be82
commit ac9535baa8
2 changed files with 193 additions and 3 deletions
+155
View File
@@ -0,0 +1,155 @@
/**
* Delivery race tests.
*
* The active poll (1s, running sessions) and the sweep poll (60s, all
* active sessions) both call deliverSessionMessages. A running session
* sits in both result sets, so the two timer chains can race on the same
* outbound row — read-undelivered → call channel API → markDelivered. The
* INSERT OR IGNORE in markDelivered makes the DB write idempotent, but
* the channel API has already fired twice → user sees the message twice.
*/
import Database from 'better-sqlite3';
import fs from 'fs';
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
vi.mock('./container-runner.js', () => ({
wakeContainer: vi.fn().mockResolvedValue(undefined),
resetContainerIdleTimer: vi.fn(),
isContainerRunning: vi.fn().mockReturnValue(false),
killContainer: vi.fn(),
buildAgentGroupImage: vi.fn().mockResolvedValue(undefined),
}));
vi.mock('./config.js', async () => {
const actual = await vi.importActual<typeof import('./config.js')>('./config.js');
return { ...actual, DATA_DIR: '/tmp/nanoclaw-test-delivery' };
});
const TEST_DIR = '/tmp/nanoclaw-test-delivery';
import {
initTestDb,
closeDb,
runMigrations,
createAgentGroup,
createMessagingGroup,
} from './db/index.js';
import { resolveSession, outboundDbPath } from './session-manager.js';
import { deliverSessionMessages, setDeliveryAdapter } from './delivery.js';
function now(): string {
return new Date().toISOString();
}
function seedAgentAndChannel(): void {
createAgentGroup({
id: 'ag-1',
name: 'Test Agent',
folder: 'test-agent',
agent_provider: null,
created_at: now(),
});
createMessagingGroup({
id: 'mg-1',
channel_type: 'telegram',
platform_id: 'telegram:123',
name: 'Test Chat',
is_group: 0,
unknown_sender_policy: 'public',
created_at: now(),
});
}
function insertOutbound(agentGroupId: string, sessionId: string, msgId: string): void {
const db = new Database(outboundDbPath(agentGroupId, sessionId));
db.prepare(
`INSERT INTO messages_out (id, timestamp, kind, platform_id, channel_type, content)
VALUES (?, datetime('now'), 'chat', 'telegram:123', 'telegram', ?)`,
).run(msgId, JSON.stringify({ text: 'hello' }));
db.close();
}
beforeEach(() => {
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
fs.mkdirSync(TEST_DIR, { recursive: true });
const db = initTestDb();
runMigrations(db);
});
afterEach(() => {
closeDb();
if (fs.existsSync(TEST_DIR)) fs.rmSync(TEST_DIR, { recursive: true });
});
describe('deliverSessionMessages — concurrent invocations', () => {
it('delivers a message exactly once when active and sweep polls overlap', async () => {
seedAgentAndChannel();
const { session } = resolveSession('ag-1', 'mg-1', null, 'shared');
insertOutbound('ag-1', session.id, 'out-1');
const calls: string[] = [];
setDeliveryAdapter({
async deliver(_channelType, _platformId, _threadId, _kind, content) {
calls.push(content);
// Hold long enough that the second concurrent caller can race the
// read-undelivered → markDelivered window.
await new Promise((r) => setTimeout(r, 100));
return 'plat-msg-1';
},
});
// Two concurrent calls — simulating active (1s) and sweep (60s) polls
// hitting the same running session at the same moment.
await Promise.all([deliverSessionMessages(session), deliverSessionMessages(session)]);
expect(calls).toHaveLength(1);
});
it('still delivers on a subsequent call after the first finishes', async () => {
seedAgentAndChannel();
const { session } = resolveSession('ag-1', 'mg-1', null, 'shared');
insertOutbound('ag-1', session.id, 'out-first');
const calls: string[] = [];
setDeliveryAdapter({
async deliver(_channelType, _platformId, _threadId, _kind, content) {
calls.push(content);
return 'plat-msg-id';
},
});
await deliverSessionMessages(session);
expect(calls).toHaveLength(1);
// Insert a second outbound message and deliver again — the lock from
// the first call must have been released.
insertOutbound('ag-1', session.id, 'out-second');
await deliverSessionMessages(session);
expect(calls).toHaveLength(2);
});
it('does not re-deliver when retried after a successful send (cleanup-after-send safety)', async () => {
// If something post-send throws (e.g. outbox cleanup), the message has
// still landed on the user's screen — the catch path must not trigger
// a re-send. We simulate by having the adapter succeed on the first
// call and recording how many times it's invoked across two attempts.
seedAgentAndChannel();
const { session } = resolveSession('ag-1', 'mg-1', null, 'shared');
insertOutbound('ag-1', session.id, 'out-once');
let callCount = 0;
setDeliveryAdapter({
async deliver() {
callCount++;
return 'plat-msg-id';
},
});
await deliverSessionMessages(session);
// Re-invoke — should be idempotent because the message is now in the
// delivered table; the channel adapter must not be called again.
await deliverSessionMessages(session);
expect(callCount).toBe(1);
});
});
+38 -3
View File
@@ -60,6 +60,21 @@ const MAX_DELIVERY_ATTEMPTS = 3;
/** Track delivery attempt counts. Resets on process restart (gives failed messages a fresh chance). */
const deliveryAttempts = new Map<string, number>();
/**
* Sessions whose outbound queue is currently being drained.
*
* The active poll (1s, running sessions) and the sweep poll (60s, all
* active sessions) both call deliverSessionMessages, and a running session
* is in *both* result sets. Without this guard, the two timer chains can
* race on the same outbound row: both read it as undelivered, both call
* the channel adapter, both markDelivered (idempotent in the DB via
* INSERT OR IGNORE — but the user has already seen the message twice).
*
* Skipping (vs. queueing) is correct: any message left over when the
* second caller skips will be picked up on the next poll tick (~1s).
*/
const inflightDeliveries = new Set<string>();
export interface ChannelDeliveryAdapter {
deliver(
channelType: string,
@@ -365,7 +380,20 @@ async function pollSweep(): Promise<void> {
setTimeout(pollSweep, SWEEP_POLL_MS);
}
async function deliverSessionMessages(session: Session): Promise<void> {
export async function deliverSessionMessages(session: Session): Promise<void> {
// Reject re-entry from a concurrent poll on the same session — see the
// comment on inflightDeliveries above.
if (inflightDeliveries.has(session.id)) return;
inflightDeliveries.add(session.id);
try {
await drainSession(session);
} finally {
inflightDeliveries.delete(session.id);
}
}
async function drainSession(session: Session): Promise<void> {
const agentGroup = getAgentGroup(session.agent_group_id);
if (!agentGroup) return;
@@ -594,9 +622,16 @@ async function deliverMessage(
fileCount: files?.length,
});
// Clean up outbox directory after successful delivery
// Clean up outbox best-effort — the message is already on the user's
// screen, so a cleanup failure must NOT propagate. If it did, the
// caller would treat the whole delivery as failed, retry on the next
// poll, and the user would see the message twice.
if (fs.existsSync(outboxDir)) {
fs.rmSync(outboxDir, { recursive: true, force: true });
try {
fs.rmSync(outboxDir, { recursive: true, force: true });
} catch (err) {
log.warn('Outbox cleanup failed (message already delivered)', { messageId: msg.id, err });
}
}
return platformMsgId;