add startup circuit breaker and troubleshooting docs

Backs off on rapid restarts to avoid exhausting Discord gateway identify
limits and triggering Cloudflare IP bans. Resets on clean shutdown so only
crashes accumulate the counter. Also adds a troubleshooting section to
CLAUDE.md with the most useful diagnostic locations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Daniel Milliner
2026-04-28 14:01:32 +00:00
parent ae9bcb7c33
commit 2bf296b04a
3 changed files with 95 additions and 1 deletions
+11 -1
View File
@@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart
systemctl --user start|stop|restart nanoclaw
```
Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here).
## Troubleshooting
Check these first when something goes wrong:
| What | Where |
|------|-------|
| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain |
| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) |
| Session DBs | `data/v2-sessions/<agent-group>/<session>/` — `inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) |
Note: container logs are lost after the container exits (`--rm` flag). If the agent silently failed inside the container, there's no persistent log to inspect.
## Supply Chain Security (pnpm)
+79
View File
@@ -0,0 +1,79 @@
import fs from 'fs';
import path from 'path';
import { DATA_DIR } from './config.js';
import { log } from './log.js';
/** Location of the persisted circuit-breaker state file, directly under DATA_DIR. */
const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json');

/** Window (1 hour, in milliseconds) within which a previous startup still counts towards the attempt counter. */
const RESET_WINDOW_MS = 1000 * 60 * 60;

/**
 * Startup delay in seconds, indexed by attempt number.
 * Attempts past the last index reuse the final entry (900 s = 15 min).
 */
const BACKOFF_SCHEDULE_S = [0, 0, 10, 30, 120, 300, 900];
/** Shape of the JSON document persisted in the circuit-breaker state file. */
type CircuitBreakerState = {
  /** Startup attempt counter; 1 for the first start in a window. */
  attempt: number;
  /** Time of the startup that wrote this record, as produced by `Date#toISOString()`. */
  timestamp: string;
};
/**
 * Loads the persisted circuit-breaker state from CB_PATH.
 *
 * @returns The stored state, or `null` when the file is missing, unreadable,
 *   not valid JSON, or does not have the expected shape (a non-negative integer
 *   `attempt` and a `timestamp` string that parses as a date).
 */
function read(): CircuitBreakerState | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(CB_PATH, 'utf-8'));
  } catch {
    // Missing or unparsable file: treated the same as "no previous startup recorded".
    return null;
  }

  // JSON.parse can return any JSON value; validate instead of asserting the type,
  // otherwise a malformed file would leak NaN into the attempt arithmetic.
  if (typeof parsed !== 'object' || parsed === null) {
    return null;
  }
  const { attempt, timestamp } = parsed as Record<string, unknown>;
  if (typeof attempt !== 'number' || !Number.isInteger(attempt) || attempt < 0) {
    return null;
  }
  if (typeof timestamp !== 'string' || Number.isNaN(Date.parse(timestamp))) {
    return null;
  }
  return { attempt, timestamp };
}
/**
 * Persists the circuit-breaker state to CB_PATH as pretty-printed JSON with a
 * trailing newline.
 *
 * The parent directory is created first if needed.
 * NOTE(review): presumably nothing else has created the data directory yet on a
 * fresh install when this runs early in startup — confirm against the caller.
 *
 * @param state State to store.
 * @throws Propagates any filesystem error from creating the directory or writing the file.
 */
function write(state: CircuitBreakerState): void {
  fs.mkdirSync(path.dirname(CB_PATH), { recursive: true });
  fs.writeFileSync(CB_PATH, JSON.stringify(state, null, 2) + '\n');
}
/**
 * Looks up the startup delay for a given attempt number.
 *
 * The attempt is truncated to an integer and clamped to the bounds of the
 * schedule, so attempts past the end reuse the last (largest) entry and
 * negative attempts use the first.
 *
 * @param attempt Startup attempt number, used as an index into the schedule.
 * @param schedule Delays in seconds indexed by attempt; defaults to BACKOFF_SCHEDULE_S.
 * @returns Delay in seconds; 0 when `attempt` is NaN or the schedule is empty.
 */
function getDelay(attempt: number, schedule: readonly number[] = BACKOFF_SCHEDULE_S): number {
  if (schedule.length === 0 || Number.isNaN(attempt)) {
    return 0;
  }
  const lastIdx = schedule.length - 1;
  // Clamp both ends: the original only capped the upper bound, so a negative or
  // fractional attempt indexed outside the array and returned undefined.
  const idx = Math.min(Math.max(Math.trunc(attempt), 0), lastIdx);
  return schedule[idx] ?? 0;
}
/**
 * Clears the persisted circuit-breaker state by deleting the file at CB_PATH, so
 * the next startup begins again at attempt 1.
 *
 * Best-effort: this never throws. A missing file is ignored silently; any other
 * failure is logged as a warning because the stale counter will then still be
 * picked up by the next startup.
 */
export function resetCircuitBreaker(): void {
  try {
    fs.unlinkSync(CB_PATH);
    log.info('Circuit breaker reset on clean shutdown');
  } catch (err: unknown) {
    const code = (err as { code?: unknown } | null)?.code;
    if (code === 'ENOENT') {
      // No state file means there is nothing to reset.
      return;
    }
    log.warn('Circuit breaker: failed to remove state file on shutdown', {
      path: CB_PATH,
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
/**
 * Startup circuit breaker: records this startup and, when previous startups
 * within RESET_WINDOW_MS were not cleared by resetCircuitBreaker(), sleeps for
 * the delay that the backoff schedule assigns to the current attempt.
 *
 * Steps:
 *  1. Read the previously persisted state, if any.
 *  2. Derive the attempt number: previous attempt + 1 when the previous startup
 *     is inside the reset window, otherwise 1.
 *  3. Persist the new state (best-effort — a failure is logged, not thrown).
 *  4. Wait for the scheduled delay, if it is greater than zero.
 *
 * @returns A promise that resolves once any required backoff delay has elapsed.
 */
export async function enforceStartupBackoff(): Promise<void> {
  const now = new Date();
  const prev = read();
  let attempt = 1;

  if (prev) {
    // An unparsable timestamp yields NaN here; NaN fails the comparison below,
    // so such a record takes the reset branch.
    const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime();
    if (elapsedMs < RESET_WINDOW_MS) {
      // Guard against a malformed state file: a non-integer or non-positive
      // attempt would otherwise turn the counter into NaN and disable backoff.
      const prevAttempt = Number.isInteger(prev.attempt) && prev.attempt > 0 ? prev.attempt : 0;
      attempt = prevAttempt + 1;
      log.warn('Previous startup was not a clean shutdown', {
        previousAttempt: prev.attempt,
        previousTimestamp: prev.timestamp,
        elapsedSec: Math.round(elapsedMs / 1000),
      });
    } else {
      log.info('Circuit breaker reset — last startup was over 1h ago', {
        previousAttempt: prev.attempt,
        previousTimestamp: prev.timestamp,
      });
    }
  }

  try {
    write({ attempt, timestamp: now.toISOString() });
  } catch (err: unknown) {
    // The breaker is a safeguard; failing to persist its state should not itself
    // prevent the service from starting.
    log.warn('Circuit breaker: failed to persist startup state', {
      attempt,
      error: err instanceof Error ? err.message : String(err),
    });
  }

  const delaySec = getDelay(attempt);
  if (delaySec > 0) {
    const delayMs = delaySec * 1000;
    const resumeAt = new Date(now.getTime() + delayMs).toISOString();
    log.warn('Circuit breaker: delaying startup due to repeated crashes', {
      attempt,
      delaySec,
      resumeAt,
    });
    await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
    log.info('Circuit breaker: backoff complete, resuming startup', { attempt });
  }
}
+5
View File
@@ -7,6 +7,7 @@
import path from 'path';
import { DATA_DIR } from './config.js';
import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js';
import { migrateGroupsToClaudeLocal } from './claude-md-compose.js';
import { initDb } from './db/connection.js';
import { runMigrations } from './db/migrations/index.js';
@@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from
async function main(): Promise<void> {
log.info('NanoClaw starting');
// 0. Circuit breaker — backoff on rapid restarts
await enforceStartupBackoff();
// 1. Init central DB
const dbPath = path.join(DATA_DIR, 'v2.db');
const db = initDb(dbPath);
@@ -175,6 +179,7 @@ async function shutdown(signal: string): Promise<void> {
stopDeliveryPolls();
stopHostSweep();
await teardownChannelAdapters();
resetCircuitBreaker();
process.exit(0);
}