/**
 * Detect and clean up unhealthy NanoClaw peer services.
 *
 * Runs as a setup preflight before we install our own service. A crash-looping
 * peer install (typically the legacy v1 `com.nanoclaw` plist) silently trashes
 * this install's containers on every respawn because its `cleanupOrphans()`
 * reaps anything matching `nanoclaw-`. We scope our reaper by label now, but
 * we still need to stop the peer from killing us on its way down.
 *
 * A peer is "unhealthy" when:
 *   - launchd: `state != running` AND `runs > UNHEALTHY_RUNS_THRESHOLD`
 *   - systemd: `ActiveState` is `failed`, OR `ActiveState` is anything other
 *     than `active` AND `NRestarts > UNHEALTHY_RUNS_THRESHOLD`
 *
 * Healthy peers are left alone — multiple installs can coexist fine now that
 * container-reaper is label-scoped.
 */
import { execFileSync } from 'child_process';
import fs from 'fs';
import os from 'os';
import path from 'path';

import { getLaunchdLabel, getSystemdUnit } from '../src/install-slug.js';
import { log } from '../src/log.js';

/** A peer that is not running must exceed this many runs/restarts to count as crash-looping. */
const UNHEALTHY_RUNS_THRESHOLD = 10;

// Patterns are hoisted so they are compiled once. None carries the `g` flag,
// so `test`/`exec` keep no state between calls.
const LAUNCHD_PLIST_FILE = /^com\.nanoclaw.*\.plist$/;
const LAUNCHD_STATE = /^\s*state\s*=\s*(.+?)\s*$/m;
const LAUNCHD_RUNS = /^\s*runs\s*=\s*(\d+)/m;
const SYSTEMD_UNIT_FILE = /^nanoclaw.*\.service$/;
const SYSTEMD_ACTIVE_STATE = /^ActiveState=(.+)$/m;
const SYSTEMD_RESTARTS = /^NRestarts=(\d+)/m;
const SERVICE_SUFFIX = /\.service$/;

/** Snapshot of one peer service at probe time. */
export interface PeerStatus {
  /** launchd label (plist basename) or systemd unit name without `.service`. */
  label: string;
  /** Path of the plist / unit file the peer was discovered from. */
  configPath: string;
  /** Raw launchd `state` or systemd `ActiveState`; `'unknown'` when it could not be parsed. */
  state: string;
  /** launchd `runs` counter or systemd `NRestarts`; `0` when it could not be parsed. */
  runs: number;
  /** Whether the peer met the crash-loop criteria described in the file header. */
  unhealthy: boolean;
}

/** Outcome of one preflight pass. */
export interface PeerCleanupResult {
  /** Every peer that was probed successfully (own service and unprobeable peers excluded). */
  checked: PeerStatus[];
  /** Unhealthy peers that were unloaded (launchd) or disabled (systemd). */
  unloaded: PeerStatus[];
  /** Unhealthy peers whose unload/disable command threw, with the error message. */
  failures: Array<{ label: string; err: string }>;
}

/** The part of a PeerStatus that is derived purely from command output. */
type PeerHealth = Pick<PeerStatus, 'state' | 'runs' | 'unhealthy'>;

/**
 * Scan for peer NanoClaw services and unload any that are crash-looping.
 * Returns a summary suitable for emitStatus / setup-log reporting.
 *
 * @param projectRoot Root of this install; used to derive our own service
 *   name so it is never treated as a peer. Defaults to `process.cwd()`.
 * @returns What was probed, what was stopped and what failed. All three lists
 *   are empty on platforms other than macOS and Linux.
 */
export function cleanupUnhealthyPeers(projectRoot: string = process.cwd()): PeerCleanupResult {
  const platform = os.platform();
  if (platform === 'darwin') {
    return cleanupLaunchdPeers(projectRoot);
  }
  if (platform === 'linux') {
    return cleanupSystemdPeers(projectRoot);
  }
  return emptyResult();
}

// ---- shared helpers ---------------------------------------------------------

/** Fresh result with nothing checked, unloaded or failed. */
function emptyResult(): PeerCleanupResult {
  return { checked: [], unloaded: [], failures: [] };
}

/** Render a caught value as a message string. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Parse a captured run of digits as base-10; a missing capture counts as 0. */
function parseCount(digits: string | undefined): number {
  return digits ? parseInt(digits, 10) : 0;
}

/** Entry names in `dir` that match `pattern`, or `null` when `dir` cannot be read. */
function listDirMatching(dir: string, pattern: RegExp): string[] | null {
  try {
    return fs.readdirSync(dir).filter((name) => pattern.test(name));
  } catch {
    return null;
  }
}

// ---- launchd (macOS) --------------------------------------------------------

/** Derive state, run count and health from `launchctl print` output. */
function parseLaunchdPrint(output: string): PeerHealth {
  const state = LAUNCHD_STATE.exec(output)?.[1] ?? 'unknown';
  const runs = parseCount(LAUNCHD_RUNS.exec(output)?.[1]);
  return { state, runs, unhealthy: state !== 'running' && runs > UNHEALTHY_RUNS_THRESHOLD };
}

/**
 * Probe every `com.nanoclaw*.plist` in ~/Library/LaunchAgents except our own
 * and `launchctl unload` the ones that look crash-looped.
 */
function cleanupLaunchdPeers(projectRoot: string): PeerCleanupResult {
  const ownLabel = getLaunchdLabel(projectRoot);
  const agentsDir = path.join(os.homedir(), 'Library', 'LaunchAgents');
  const result = emptyResult();

  const plistNames = listDirMatching(agentsDir, LAUNCHD_PLIST_FILE);
  // Unreadable or missing LaunchAgents directory → nothing to inspect.
  if (plistNames === null) return result;

  // `process.getuid` is typed as optional; this function is only reached when
  // os.platform() is 'darwin', so the 0 fallback exists for the type checker.
  const uid = process.getuid?.() ?? 0;

  for (const plistName of plistNames) {
    const plistPath = path.join(agentsDir, plistName);
    // The label is taken from the file name, not read from the plist contents.
    const label = path.basename(plistPath, '.plist');
    if (label === ownLabel) continue;

    const status = probeLaunchdPeer(label, plistPath, uid);
    if (!status) continue;
    result.checked.push(status);

    if (!status.unhealthy) continue;

    try {
      // NOTE(review): `unload` without `-w` presumably only lasts until the
      // plist is loaded again, whereas the systemd path uses `disable` —
      // confirm the asymmetry is intended.
      execFileSync('launchctl', ['unload', plistPath], { stdio: 'pipe' });
      log.info('Unloaded unhealthy peer launchd service', {
        label,
        state: status.state,
        runs: status.runs,
        plistPath,
      });
      result.unloaded.push(status);
    } catch (err) {
      const message = errorMessage(err);
      log.warn('Failed to unload peer launchd service', { label, err: message });
      result.failures.push({ label, err: message });
    }
  }

  return result;
}

/**
 * Query launchd for one peer in the caller's GUI domain.
 *
 * @returns The peer's status, or `null` when `launchctl print` fails.
 */
function probeLaunchdPeer(label: string, plistPath: string, uid: number): PeerStatus | null {
  let output: string;
  try {
    output = execFileSync('launchctl', ['print', `gui/${uid}/${label}`], {
      stdio: ['ignore', 'pipe', 'pipe'],
      encoding: 'utf-8',
    });
  } catch {
    // Not loaded → not currently a threat. Skip silently.
    return null;
  }

  return { label, configPath: plistPath, ...parseLaunchdPrint(output) };
}

// ---- systemd (Linux) --------------------------------------------------------

/** Derive state, restart count and health from `systemctl show` output. */
function parseSystemdShow(output: string): PeerHealth {
  const state = SYSTEMD_ACTIVE_STATE.exec(output)?.[1]?.trim() ?? 'unknown';
  const runs = parseCount(SYSTEMD_RESTARTS.exec(output)?.[1]);
  const unhealthy = state === 'failed' || (state !== 'active' && runs > UNHEALTHY_RUNS_THRESHOLD);
  return { state, runs, unhealthy };
}

/**
 * Probe every `nanoclaw*.service` in ~/.config/systemd/user except our own
 * and `systemctl --user disable --now` the ones that look crash-looped.
 */
function cleanupSystemdPeers(projectRoot: string): PeerCleanupResult {
  // getSystemdUnit's exact return format is not visible from this file; strip
  // a trailing `.service` so the own-unit comparison below works either way.
  const ownUnit = getSystemdUnit(projectRoot).replace(SERVICE_SUFFIX, '');
  const unitDir = path.join(os.homedir(), '.config', 'systemd', 'user');
  const result = emptyResult();

  const unitFiles = listDirMatching(unitDir, SYSTEMD_UNIT_FILE);
  // Unreadable or missing user unit directory → nothing to inspect.
  if (unitFiles === null) return result;

  for (const unitFile of unitFiles) {
    const unit = unitFile.replace(SERVICE_SUFFIX, '');
    if (unit === ownUnit) continue;

    const status = probeSystemdPeer(unit);
    if (!status) continue;
    result.checked.push(status);

    if (!status.unhealthy) continue;

    try {
      execFileSync('systemctl', ['--user', 'disable', '--now', `${unit}.service`], { stdio: 'pipe' });
      log.info('Disabled unhealthy peer systemd unit', {
        unit,
        state: status.state,
        runs: status.runs,
      });
      result.unloaded.push(status);
    } catch (err) {
      const message = errorMessage(err);
      log.warn('Failed to disable peer systemd unit', { unit, err: message });
      result.failures.push({ label: unit, err: message });
    }
  }

  return result;
}

/**
 * Query the user systemd instance for one peer unit.
 *
 * @param unit Unit name without the `.service` suffix.
 * @returns The peer's status, or `null` when `systemctl --user show` fails.
 */
function probeSystemdPeer(unit: string): PeerStatus | null {
  const unitPath = path.join(os.homedir(), '.config', 'systemd', 'user', `${unit}.service`);
  let output: string;
  try {
    output = execFileSync(
      'systemctl',
      ['--user', 'show', '--property=ActiveState,NRestarts', `${unit}.service`],
      { stdio: ['ignore', 'pipe', 'pipe'], encoding: 'utf-8' },
    );
  } catch {
    return null;
  }

  return { label: unit, configPath: unitPath, ...parseSystemdShow(output) };
}
+ const peerReport = cleanupUnhealthyPeers(projectRoot); + if (peerReport.unloaded.length > 0) { + log.warn('Unloaded unhealthy peer NanoClaw services', { + count: peerReport.unloaded.length, + labels: peerReport.unloaded.map((p) => p.label), + }); + } + if (platform === 'macos') { setupLaunchd(projectRoot, nodePath, homeDir); } else if (platform === 'linux') { diff --git a/src/config.ts b/src/config.ts index 79a1ce9df..a82d4f5c9 100644 --- a/src/config.ts +++ b/src/config.ts @@ -2,7 +2,7 @@ import os from 'os'; import path from 'path'; import { readEnvFile } from './env.js'; -import { getContainerImageBase, getDefaultContainerImage } from './install-slug.js'; +import { getContainerImageBase, getDefaultContainerImage, getInstallSlug } from './install-slug.js'; import { isValidTimezone } from './timezone.js'; // Read config values from .env (falls back to process.env). @@ -27,6 +27,10 @@ export const DATA_DIR = path.resolve(PROJECT_ROOT, 'data'); // `nanoclaw-agent:latest` and clobber each other on rebuild. export const CONTAINER_IMAGE_BASE = process.env.CONTAINER_IMAGE_BASE || getContainerImageBase(PROJECT_ROOT); export const CONTAINER_IMAGE = process.env.CONTAINER_IMAGE || getDefaultContainerImage(PROJECT_ROOT); +// Install slug — stamped onto every spawned container via --label so +// cleanupOrphans only reaps containers from this install, not peers. 
+export const INSTALL_SLUG = getInstallSlug(PROJECT_ROOT); +export const CONTAINER_INSTALL_LABEL = `nanoclaw-install=${INSTALL_SLUG}`; export const CONTAINER_TIMEOUT = parseInt(process.env.CONTAINER_TIMEOUT || '1800000', 10); export const CONTAINER_MAX_OUTPUT_SIZE = parseInt(process.env.CONTAINER_MAX_OUTPUT_SIZE || '10485760', 10); // 10MB default export const ONECLI_URL = process.env.ONECLI_URL || envConfig.ONECLI_URL; diff --git a/src/container-runner.ts b/src/container-runner.ts index 646b11811..71e2064f3 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -12,6 +12,7 @@ import { OneCLI } from '@onecli-sh/sdk'; import { CONTAINER_IMAGE, CONTAINER_IMAGE_BASE, + CONTAINER_INSTALL_LABEL, DATA_DIR, GROUPS_DIR, ONECLI_API_KEY, @@ -389,7 +390,7 @@ async function buildContainerArgs( providerContribution: ProviderContainerContribution, agentIdentifier?: string, ): Promise { - const args: string[] = ['run', '--rm', '--name', containerName]; + const args: string[] = ['run', '--rm', '--name', containerName, '--label', CONTAINER_INSTALL_LABEL]; // Environment — only vars read by code we don't own. // Everything NanoClaw-specific is in container.json (read by runner at startup). 
diff --git a/src/container-runtime.test.ts b/src/container-runtime.test.ts index 47d97448e..f6f6e8a82 100644 --- a/src/container-runtime.test.ts +++ b/src/container-runtime.test.ts @@ -24,6 +24,7 @@ import { ensureContainerRuntimeRunning, cleanupOrphans, } from './container-runtime.js'; +import { CONTAINER_INSTALL_LABEL } from './config.js'; import { log } from './log.js'; beforeEach(() => { @@ -84,6 +85,17 @@ describe('ensureContainerRuntimeRunning', () => { // --- cleanupOrphans --- describe('cleanupOrphans', () => { + it('filters ps by the install label so peers are not reaped', () => { + mockExecSync.mockReturnValueOnce(''); + + cleanupOrphans(); + + expect(mockExecSync).toHaveBeenCalledWith( + `${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`, + expect.any(Object), + ); + }); + it('stops orphaned nanoclaw containers', () => { // docker ps returns container names, one per line mockExecSync.mockReturnValueOnce('nanoclaw-group1-111\nnanoclaw-group2-222\n'); diff --git a/src/container-runtime.ts b/src/container-runtime.ts index 5e684269a..82ddb5eca 100644 --- a/src/container-runtime.ts +++ b/src/container-runtime.ts @@ -5,6 +5,7 @@ import { execSync } from 'child_process'; import os from 'os'; +import { CONTAINER_INSTALL_LABEL } from './config.js'; import { log } from './log.js'; /** The container runtime binary name. */ @@ -56,13 +57,22 @@ export function ensureContainerRuntimeRunning(): void { } } -/** Kill orphaned NanoClaw containers from previous runs. */ +/** + * Kill orphaned NanoClaw containers from THIS install's previous runs. + * + * Scoped by label `nanoclaw-install=` so a crash-looping peer install + * cannot reap our containers, and we cannot reap theirs. The label is + * stamped onto every container at spawn time — see container-runner.ts. 
+ */ export function cleanupOrphans(): void { try { - const output = execSync(`${CONTAINER_RUNTIME_BIN} ps --filter name=nanoclaw- --format '{{.Names}}'`, { - stdio: ['pipe', 'pipe', 'pipe'], - encoding: 'utf-8', - }); + const output = execSync( + `${CONTAINER_RUNTIME_BIN} ps --filter label=${CONTAINER_INSTALL_LABEL} --format '{{.Names}}'`, + { + stdio: ['pipe', 'pipe', 'pipe'], + encoding: 'utf-8', + }, + ); const orphans = output.trim().split('\n').filter(Boolean); for (const name of orphans) { try {