mirror of
https://github.com/qwibitai/nanoclaw.git
synced 2026-06-04 10:14:47 +08:00
add startup circuit breaker and troubleshooting docs
Backs off on rapid restarts to avoid exhausting Discord gateway identify limits and triggering Cloudflare IP bans. Resets on clean shutdown so only crashes accumulate the counter. Also adds a troubleshooting section to CLAUDE.md with the most useful diagnostic locations. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -186,7 +186,17 @@ launchctl kickstart -k gui/$(id -u)/com.nanoclaw # restart
|
||||
systemctl --user start|stop|restart nanoclaw
|
||||
```
|
||||
|
||||
Host logs: `logs/nanoclaw.log` (normal) and `logs/nanoclaw.error.log` (errors only — some delivery/approval failures only show up here).
|
||||
## Troubleshooting
|
||||
|
||||
Check these first when something goes wrong:
|
||||
|
||||
| What | Where |
|
||||
|------|-------|
|
||||
| Host logs | `logs/nanoclaw.error.log` first (delivery failures, crash-loop backoff, warnings), then `logs/nanoclaw.log` for the full routing chain |
|
||||
| Setup logs | `logs/setup.log` (overall), `logs/setup-steps/*.log` (per-step: bootstrap, environment, container, onecli, mounts, service, etc.) |
|
||||
| Session DBs | `data/v2-sessions/<agent-group>/<session>/` — `inbound.db` (`messages_in`: did the message reach the container?), `outbound.db` (`messages_out`: did the agent produce a response?) |
|
||||
|
||||
Note: containers run with the `--rm` flag, so their logs are lost as soon as the container exits. If the agent silently failed inside the container, there's no persistent log to inspect.
|
||||
|
||||
## Supply Chain Security (pnpm)
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
import { DATA_DIR } from './config.js';
|
||||
import { log } from './log.js';
|
||||
|
||||
/** Location of the persisted circuit-breaker state file inside DATA_DIR. */
const CB_PATH = path.join(DATA_DIR, 'circuit-breaker.json');

/**
 * One hour, in milliseconds. A previous startup older than this no longer
 * counts toward the attempt counter.
 */
const RESET_WINDOW_MS = 1000 * 60 * 60;

/**
 * Startup delay in seconds, indexed by attempt number. Attempts beyond the
 * last index reuse the final entry, so attempt 6 and above wait 900 s (15 min).
 */
const BACKOFF_SCHEDULE_S = [
  0,
  0,
  10,
  30,
  120,
  300,
  900,
];
|
||||
|
||||
/** Shape of the JSON record persisted between process starts. */
interface CircuitBreakerState {
  /** Startup attempt counter; 1 for the first start in a window. */
  attempt: number;

  /** ISO-8601 timestamp (as produced by `Date#toISOString`) of the startup that wrote this record. */
  timestamp: string;
}
|
||||
|
||||
/**
 * Loads the persisted circuit-breaker state.
 *
 * The file content is treated as untrusted: it may be missing, truncated by a
 * crash mid-write, or hand-edited. Anything that is not an object with a
 * non-negative integer `attempt` and a parseable `timestamp` string is
 * reported as "no state" rather than being passed on to arithmetic.
 *
 * @param filePath - File to read; defaults to the module's state file.
 * @returns The validated state, or `null` when the file is missing,
 *   unreadable, not valid JSON, or not of the expected shape.
 */
function read(filePath: string = CB_PATH): CircuitBreakerState | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  } catch {
    // Missing file or corrupt JSON: there is no usable history.
    return null;
  }

  if (typeof parsed !== 'object' || parsed === null) {
    return null;
  }

  const { attempt, timestamp } = parsed as Record<string, unknown>;

  // A string attempt such as "3" would otherwise concatenate ("3" + 1 === "31").
  if (typeof attempt !== 'number' || !Number.isInteger(attempt) || attempt < 0) {
    return null;
  }
  if (typeof timestamp !== 'string' || Number.isNaN(Date.parse(timestamp))) {
    return null;
  }

  return { attempt, timestamp };
}
|
||||
|
||||
/**
 * Persists the circuit-breaker state as pretty-printed JSON followed by a
 * newline.
 *
 * The parent directory is created first if it does not exist, so a fresh
 * install without a data directory does not fail with ENOENT.
 *
 * @param state - State to store.
 * @param filePath - Destination file; defaults to the module's state file.
 * @throws If the directory cannot be created or the file cannot be written.
 */
function write(state: CircuitBreakerState, filePath: string = CB_PATH): void {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  fs.writeFileSync(filePath, JSON.stringify(state, null, 2) + '\n');
}
|
||||
|
||||
/**
 * Looks up the startup delay for an attempt number.
 *
 * The attempt is floored and clamped into the schedule's index range, so
 * negative, fractional or NaN inputs never index outside the array. Attempts
 * beyond the end of the schedule reuse its last entry.
 *
 * @param attempt - Startup attempt number.
 * @param schedule - Delays in seconds indexed by attempt; defaults to the
 *   module's backoff schedule.
 * @returns The delay in seconds; 0 when the schedule is empty.
 */
function getDelay(attempt: number, schedule: readonly number[] = BACKOFF_SCHEDULE_S): number {
  if (schedule.length === 0) {
    return 0;
  }

  // NaN would poison Math.min/Math.max, so treat it as "no history".
  const wholeAttempt = Number.isNaN(attempt) ? 0 : Math.floor(attempt);
  const idx = Math.min(Math.max(wholeAttempt, 0), schedule.length - 1);
  return schedule[idx] ?? 0;
}
|
||||
|
||||
/**
 * Clears the persisted circuit-breaker state by deleting the state file.
 *
 * A missing file is not an error: there is simply nothing to clear. Any other
 * failure (for example a permissions problem) is logged as a warning instead
 * of being swallowed, because a state file that cannot be removed makes the
 * next startup count this run as an unclean shutdown.
 */
export function resetCircuitBreaker(): void {
  try {
    fs.unlinkSync(CB_PATH);
  } catch (err: unknown) {
    const code =
      typeof err === 'object' && err !== null && 'code' in err ? (err as { code?: unknown }).code : undefined;
    if (code !== 'ENOENT') {
      log.warn('Circuit breaker: failed to clear state on clean shutdown', {
        error: err instanceof Error ? err.message : String(err),
      });
    }
    return;
  }

  log.info('Circuit breaker reset on clean shutdown');
}
|
||||
|
||||
/**
 * Delays startup when the process has been restarted repeatedly.
 *
 * Reads the state left by the previous startup. If that state exists and is
 * younger than RESET_WINDOW_MS, the attempt counter is incremented; otherwise
 * it starts again at 1. The new state is persisted, and the function then
 * sleeps for the delay that getDelay returns for the attempt.
 *
 * Persisting the state is best-effort: a write failure is logged and startup
 * continues, so a broken state file location cannot itself cause a crash loop.
 *
 * @returns A promise that resolves once any required backoff has elapsed.
 */
export async function enforceStartupBackoff(): Promise<void> {
  const now = new Date();
  const prev = read();

  let attempt = 1;
  if (prev) {
    const elapsedMs = now.getTime() - new Date(prev.timestamp).getTime();
    if (elapsedMs < RESET_WINDOW_MS) {
      // Guard against a non-numeric stored attempt so the counter stays a number.
      attempt = Number.isFinite(prev.attempt) ? prev.attempt + 1 : 1;
      log.warn('Previous startup was not a clean shutdown', {
        previousAttempt: prev.attempt,
        previousTimestamp: prev.timestamp,
        elapsedSec: Math.round(elapsedMs / 1000),
      });
    } else {
      // Also reached when the stored timestamp is unparseable (elapsedMs is NaN).
      log.info('Circuit breaker reset — last startup was over 1h ago', {
        previousAttempt: prev.attempt,
        previousTimestamp: prev.timestamp,
      });
    }
  }

  try {
    write({ attempt, timestamp: now.toISOString() });
  } catch (err: unknown) {
    log.warn('Circuit breaker: could not persist state, continuing startup', {
      attempt,
      error: err instanceof Error ? err.message : String(err),
    });
  }

  const delaySec = getDelay(attempt);
  if (delaySec > 0) {
    const delayMs = delaySec * 1000;
    const resumeAt = new Date(now.getTime() + delayMs).toISOString();
    log.warn('Circuit breaker: delaying startup due to repeated crashes', {
      attempt,
      delaySec,
      resumeAt,
    });
    await new Promise<void>((resolve) => setTimeout(resolve, delayMs));
    log.info('Circuit breaker: backoff complete, resuming startup', { attempt });
  }
}
|
||||
@@ -7,6 +7,7 @@
|
||||
import path from 'path';
|
||||
|
||||
import { DATA_DIR } from './config.js';
|
||||
import { enforceStartupBackoff, resetCircuitBreaker } from './circuit-breaker.js';
|
||||
import { migrateGroupsToClaudeLocal } from './claude-md-compose.js';
|
||||
import { initDb } from './db/connection.js';
|
||||
import { runMigrations } from './db/migrations/index.js';
|
||||
@@ -58,6 +59,9 @@ import { initChannelAdapters, teardownChannelAdapters, getChannelAdapter } from
|
||||
async function main(): Promise<void> {
|
||||
log.info('NanoClaw starting');
|
||||
|
||||
// 0. Circuit breaker — backoff on rapid restarts
|
||||
await enforceStartupBackoff();
|
||||
|
||||
// 1. Init central DB
|
||||
const dbPath = path.join(DATA_DIR, 'v2.db');
|
||||
const db = initDb(dbPath);
|
||||
@@ -175,6 +179,7 @@ async function shutdown(signal: string): Promise<void> {
|
||||
stopDeliveryPolls();
|
||||
stopHostSweep();
|
||||
await teardownChannelAdapters();
|
||||
resetCircuitBreaker();
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user