Files
nanoclaw/src/container-runner.ts
T
gavrielc 3db66c0ced fix: forward ONECLI_API_KEY to OneCLI SDK for authenticated container config
Ports the v1 fix from PR #1777 (originally 8b5b581 by @johnnyfish).
Cherry-pick did not apply cleanly because v2 reformatted the surrounding
code and split OneCLI usage into two sites — manual port was needed.

v2-specific adaptations:
- Also forward apiKey at the second OneCLI call site in
  src/modules/approvals/onecli-approvals.ts (v2 split the approvals
  module out of container-runner).
- Skipped the companion test-mock commit (38163bc) — it patches
  src/container-runner.test.ts, which no longer exists in v2 (tests
  consolidated into host-core.test.ts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-Authored-By: johnnyfish <jonathanfishner11@gmail.com>
2026-04-22 15:16:59 +03:00

479 lines
18 KiB
TypeScript

/**
* Container Runner v2
* Spawns agent containers with session folder + agent group folder mounts.
* The container runs the v2 agent-runner which polls the session DB.
*/
import { ChildProcess, execFileSync, execSync, spawn } from 'child_process';
import fs from 'fs';
import path from 'path';
import { OneCLI } from '@onecli-sh/sdk';
import { CONTAINER_IMAGE, DATA_DIR, GROUPS_DIR, ONECLI_API_KEY, ONECLI_URL, TIMEZONE } from './config.js';
import { readContainerConfig, writeContainerConfig } from './container-config.js';
import { CONTAINER_RUNTIME_BIN, hostGatewayArgs, readonlyMountArgs, stopContainer } from './container-runtime.js';
import { composeGroupClaudeMd } from './claude-md-compose.js';
import { getAgentGroup } from './db/agent-groups.js';
import { getDb, hasTable } from './db/connection.js';
import { initGroupFilesystem } from './group-init.js';
import { stopTypingRefresh } from './modules/typing/index.js';
import { log } from './log.js';
import { validateAdditionalMounts } from './modules/mount-security/index.js';
// Provider host-side config barrel — each provider that needs host-side
// container setup self-registers on import.
import './providers/index.js';
import {
getProviderContainerConfig,
type ProviderContainerContribution,
type VolumeMount,
} from './providers/provider-container-registry.js';
import { markContainerRunning, markContainerStopped, sessionDir, writeSessionRouting } from './session-manager.js';
import type { AgentGroup, Session } from './types.js';
/**
 * Shared OneCLI SDK client for this module, configured from `ONECLI_URL`
 * and `ONECLI_API_KEY`. Used when building container args to register the
 * agent (`ensureAgent`) and apply the gateway config (`applyContainerConfig`).
 */
const onecli = new OneCLI({ url: ONECLI_URL, apiKey: ONECLI_API_KEY });
/**
 * Active containers tracked by session ID. An entry is added right after the
 * container process is spawned and removed by that process's `close` / `error`
 * handlers; `containerName` is the name passed to the runtime's `--name` flag.
 */
const activeContainers = new Map<string, { process: ChildProcess; containerName: string }>();
/**
 * In-flight wake promises, keyed by session id. Deduplicates concurrent
 * `wakeContainer` calls while the first spawn is still mid-setup (async
 * buildContainerArgs, OneCLI gateway apply, etc.) — otherwise a second
 * wake in that window passes the `activeContainers.has` check and spawns
 * a duplicate container against the same session directory, producing
 * racy double-replies.
 */
const wakePromises = new Map<string, Promise<void>>();
/** Number of sessions that currently have a tracked container process. */
export function getActiveContainerCount(): number {
  const { size } = activeContainers;
  return size;
}
/**
 * Whether a container process is currently tracked for the given session.
 *
 * @param sessionId - Session id used as the key in the active-container map.
 */
export function isContainerRunning(sessionId: string): boolean {
  const tracked = activeContainers.has(sessionId);
  return tracked;
}
/**
 * Ensure a container is (being) started for `session`.
 *
 * - Already tracked as running: resolves immediately.
 * - Spawn already in progress: returns that same in-flight promise.
 * - Otherwise: starts a spawn and registers its promise until it settles.
 *
 * Deliberately not `async`: the in-flight promise must be registered
 * synchronously, before control returns to the caller, so a second call in
 * the same tick joins it instead of spawning a duplicate.
 *
 * The container runs the v2 agent-runner which polls the session DB.
 */
export function wakeContainer(session: Session): Promise<void> {
  const sessionId = session.id;

  if (activeContainers.has(sessionId)) {
    log.debug('Container already running', { sessionId: session.id });
    return Promise.resolve();
  }

  const inFlight = wakePromises.get(sessionId);
  if (inFlight) {
    log.debug('Container wake already in-flight — joining existing promise', { sessionId: session.id });
    return inFlight;
  }

  // Drop the registration once the spawn settles, whether it succeeded or failed.
  const forget = (): void => {
    wakePromises.delete(sessionId);
  };
  const pending = spawnContainer(session).finally(forget);
  wakePromises.set(sessionId, pending);
  return pending;
}
/**
 * Prepare and spawn the container for a session, then track it in
 * `activeContainers` until the process goes away.
 *
 * Resolves once the process has been spawned and its event handlers are
 * attached — it does not wait for the container to exit. If the session's
 * agent group cannot be found, logs an error and resolves without spawning.
 *
 * @param session - Session to start a container for.
 */
async function spawnContainer(session: Session): Promise<void> {
  const agentGroup = getAgentGroup(session.agent_group_id);
  if (!agentGroup) {
    log.error('Agent group not found', { agentGroupId: session.agent_group_id });
    return;
  }

  // Refresh the destination map and default reply routing so any admin
  // changes take effect on wake. Destinations come from the agent-to-agent
  // module — skip when the module isn't installed (table absent).
  if (hasTable(getDb(), 'agent_destinations')) {
    const { writeDestinations } = await import('./modules/agent-to-agent/write-destinations.js');
    writeDestinations(agentGroup.id, session.id);
  }
  writeSessionRouting(agentGroup.id, session.id);

  // Read container config once — threaded through provider resolution,
  // buildMounts, and buildContainerArgs so we don't re-read the file.
  const containerConfig = readContainerConfig(agentGroup.folder);

  // Ensure container.json has the agent group identity fields the runner needs.
  // Written at spawn time so the runner can read them from the RO mount.
  ensureRuntimeFields(containerConfig, agentGroup);

  // Resolve the effective provider + any host-side contribution it declares
  // (extra mounts, env passthrough). Computed once and threaded through both
  // buildMounts and buildContainerArgs so side effects (mkdir, etc.) fire once.
  const { provider, contribution } = resolveProviderContribution(session, agentGroup, containerConfig);
  const mounts = buildMounts(agentGroup, session, containerConfig, contribution);
  const containerName = `nanoclaw-v2-${agentGroup.folder}-${Date.now()}`;

  // OneCLI agent identifier is always the agent group id — stable across
  // sessions and reversible via getAgentGroup() for approval routing.
  const agentIdentifier = agentGroup.id;
  const args = await buildContainerArgs(
    mounts,
    containerName,
    agentGroup,
    containerConfig,
    provider,
    contribution,
    agentIdentifier,
  );

  log.info('Spawning container', { sessionId: session.id, agentGroup: agentGroup.name, containerName });
  const container = spawn(CONTAINER_RUNTIME_BIN, args, { stdio: ['ignore', 'pipe', 'pipe'] });
  activeContainers.set(session.id, { process: container, containerName });
  markContainerRunning(session.id);

  // Release the session's bookkeeping exactly once, and only while THIS
  // process still owns the map entry. A failed spawn emits 'error' and then
  // 'close' for the same process; without the ownership check the second
  // event would repeat the cleanup and could wipe the entry of a newer
  // container that was started for the same session in between.
  const releaseSession = (): void => {
    if (activeContainers.get(session.id)?.process !== container) return;
    activeContainers.delete(session.id);
    markContainerStopped(session.id);
    stopTypingRefresh(session.id);
  };

  // Forward container stderr to the debug log, one entry per non-empty line.
  container.stderr?.on('data', (data) => {
    for (const line of data.toString().trim().split('\n')) {
      if (line) log.debug(line, { container: agentGroup.folder });
    }
  });

  // stdout is unused in v2 (all IO is via session DB); the no-op listener
  // keeps the pipe drained.
  container.stdout?.on('data', () => {});

  // No host-side idle timeout. Stale/stuck detection is driven by the host
  // sweep reading heartbeat mtime + processing_ack claim age + container_state
  // (see src/host-sweep.ts). This avoids killing long-running legitimate work
  // on a wall-clock timer.
  container.on('close', (code) => {
    releaseSession();
    log.info('Container exited', { sessionId: session.id, code, containerName });
  });
  container.on('error', (err) => {
    releaseSession();
    log.error('Container spawn error', { sessionId: session.id, err });
  });
}
/**
 * Kill the container tracked for a session, if any.
 *
 * Tries a runtime-level stop by container name first; if that throws, the
 * failure is logged and the spawned process is sent SIGKILL instead. The
 * `activeContainers` entry is not removed here — that happens in the
 * process's own `close` / `error` handlers.
 *
 * @param sessionId - Session whose container should be killed.
 * @param reason - Free-form reason, recorded in the log only.
 */
export function killContainer(sessionId: string, reason: string): void {
  const entry = activeContainers.get(sessionId);
  if (!entry) return;
  log.info('Killing container', { sessionId, reason, containerName: entry.containerName });
  try {
    stopContainer(entry.containerName);
  } catch (err) {
    // Keep the best-effort fallback, but surface why the clean stop failed
    // instead of discarding the error.
    log.warn('Container stop failed — falling back to SIGKILL', {
      sessionId,
      containerName: entry.containerName,
      err,
    });
    entry.process.kill('SIGKILL');
  }
}
/**
 * Work out which provider a container should use and collect whatever
 * host-side contribution that provider has registered.
 *
 * The provider name comes from `containerConfig.provider`, falling back to
 * `'claude'` when unset or empty, and is lower-cased before lookup. When no
 * host-side config function is registered for it, the contribution is `{}`.
 *
 * @returns The normalized provider name and its contribution.
 */
function resolveProviderContribution(
  session: Session,
  agentGroup: AgentGroup,
  containerConfig: import('./container-config.js').ContainerConfig,
): { provider: string; contribution: ProviderContainerContribution } {
  const configuredName = containerConfig.provider || 'claude';
  const provider = configuredName.toLowerCase();

  const contribute = getProviderContainerConfig(provider);
  if (!contribute) {
    return { provider, contribution: {} };
  }

  const contribution = contribute({
    sessionDir: sessionDir(agentGroup.id, session.id),
    agentGroupId: agentGroup.id,
    hostEnv: process.env,
  });
  return { provider, contribution };
}
/**
 * Assemble the ordered list of volume mounts for a session's container.
 *
 * Before building the list this also (in order) initializes the group's
 * filesystem, syncs the skill symlinks under `.claude-shared`, and
 * recomposes the group's CLAUDE.md. Order of the returned mounts is
 * significant: the read-only `container.json` mount is nested on top of the
 * read-write group directory, so it must come after it.
 *
 * @returns Mounts in the order they should be passed to the runtime.
 */
function buildMounts(
  agentGroup: AgentGroup,
  session: Session,
  containerConfig: import('./container-config.js').ContainerConfig,
  providerContribution: ProviderContainerContribution,
): VolumeMount[] {
  const projectRoot = process.cwd();

  // Per-group filesystem state lives forever after first creation. Init is
  // idempotent: it only writes paths that don't already exist, so this call
  // is a no-op for groups that have spawned before.
  initGroupFilesystem(agentGroup);

  // Sync skill symlinks based on container.json selection before mounting.
  const claudeDir = path.join(DATA_DIR, 'v2-sessions', agentGroup.id, '.claude-shared');
  syncSkillSymlinks(claudeDir, containerConfig);

  // Compose CLAUDE.md fresh every spawn from the shared base, enabled skill
  // fragments, and MCP server instructions. See `claude-md-compose.ts`.
  composeGroupClaudeMd(agentGroup);

  const mounts: VolumeMount[] = [];
  const addWritable = (hostPath: string, containerPath: string): void => {
    mounts.push({ hostPath, containerPath, readonly: false });
  };
  const addReadonlyIfPresent = (hostPath: string, containerPath: string): void => {
    if (fs.existsSync(hostPath)) {
      mounts.push({ hostPath, containerPath, readonly: true });
    }
  };

  const sessDir = sessionDir(agentGroup.id, session.id);
  const groupDir = path.resolve(GROUPS_DIR, agentGroup.folder);

  // Session folder at /workspace (contains inbound.db, outbound.db, outbox/, .claude/)
  addWritable(sessDir, '/workspace');

  // Agent group folder at /workspace/agent (RW for working files + CLAUDE.md)
  addWritable(groupDir, '/workspace/agent');

  // container.json — nested RO mount on top of RW group dir so the agent
  // can read its config but cannot modify it.
  addReadonlyIfPresent(path.join(groupDir, 'container.json'), '/workspace/agent/container.json');

  // Global memory directory — always read-only.
  addReadonlyIfPresent(path.join(GROUPS_DIR, 'global'), '/workspace/global');

  // Shared CLAUDE.md — read-only, imported by the composed entry point via
  // the `.claude-shared.md` symlink inside the group dir.
  addReadonlyIfPresent(path.join(projectRoot, 'container', 'CLAUDE.md'), '/app/CLAUDE.md');

  // Per-group .claude-shared at /home/node/.claude (Claude state, settings,
  // skill symlinks)
  addWritable(claudeDir, '/home/node/.claude');

  // Shared agent-runner source — read-only, same code for all groups.
  // Mounted unconditionally (no existence check).
  mounts.push({
    hostPath: path.join(projectRoot, 'container', 'agent-runner', 'src'),
    containerPath: '/app/src',
    readonly: true,
  });

  // Shared skills — read-only, symlinks in .claude-shared/skills/ point here.
  addReadonlyIfPresent(path.join(projectRoot, 'container', 'skills'), '/app/skills');

  // Additional mounts from container config
  const extraMounts = containerConfig.additionalMounts;
  if (extraMounts && extraMounts.length > 0) {
    mounts.push(...validateAdditionalMounts(extraMounts, agentGroup.name));
  }

  // Provider-contributed mounts (e.g. opencode-xdg)
  const providerMounts = providerContribution.mounts;
  if (providerMounts) {
    mounts.push(...providerMounts);
  }

  return mounts;
}
/**
 * Make the symlinks in `<claudeDir>/skills/` match the skill selection in
 * container.json.
 *
 * Every link targets a container-side path (`/app/skills/<name>`), so it is
 * dangling on the host and only resolves inside the container. Symlinks
 * that are no longer selected are removed; entries that are not symlinks
 * are never touched; an entry that already exists under a selected name is
 * left as it is.
 *
 * @param claudeDir - Host path of the group's `.claude-shared` directory.
 * @param containerConfig - Config whose `skills` is either `'all'` or a list of names.
 */
function syncSkillSymlinks(claudeDir: string, containerConfig: import('./container-config.js').ContainerConfig): void {
  const linkDir = path.join(claudeDir, 'skills');
  if (!fs.existsSync(linkDir)) {
    fs.mkdirSync(linkDir, { recursive: true });
  }

  // Work out which skills should be linked.
  const sharedSkillsDir = path.join(process.cwd(), 'container', 'skills');
  const isSkillDirectory = (name: string): boolean => {
    try {
      return fs.statSync(path.join(sharedSkillsDir, name)).isDirectory();
    } catch {
      return false;
    }
  };

  let wanted: string[];
  if (containerConfig.skills !== 'all') {
    wanted = containerConfig.skills;
  } else if (fs.existsSync(sharedSkillsDir)) {
    // Recompute from shared dir — newly-added upstream skills appear automatically
    wanted = fs.readdirSync(sharedSkillsDir).filter((name) => isSkillDirectory(name));
  } else {
    wanted = [];
  }
  const wantedNames = new Set(wanted);

  // Prune symlinks that are no longer selected.
  for (const name of fs.readdirSync(linkDir)) {
    const candidate = path.join(linkDir, name);
    let isLink: boolean;
    try {
      isLink = fs.lstatSync(candidate).isSymbolicLink();
    } catch {
      continue;
    }
    if (!isLink || wantedNames.has(name)) continue;
    fs.unlinkSync(candidate);
  }

  // Add a link for every selected skill that has no entry yet.
  for (const name of wanted) {
    const linkPath = path.join(linkDir, name);
    try {
      fs.lstatSync(linkPath);
      continue; // something already lives at this name
    } catch {
      /* missing */
    }
    fs.symlinkSync(`/app/skills/${name}`, linkPath);
  }
}
/**
 * Bring the runtime identity fields of container.json in line with the
 * agent group record: `agentGroupId` from the group's id, and both
 * `groupName` and `assistantName` from the group's name.
 *
 * Runs at spawn time so the values stay current even if the DB values
 * change (e.g. group rename). The config object is updated in place and
 * written back only when at least one field was out of date, to avoid
 * unnecessary file churn.
 */
function ensureRuntimeFields(
  containerConfig: import('./container-config.js').ContainerConfig,
  agentGroup: AgentGroup,
): void {
  const { id, name } = agentGroup;

  const upToDate =
    containerConfig.agentGroupId === id &&
    containerConfig.groupName === name &&
    containerConfig.assistantName === name;
  if (upToDate) return;

  // Fields that already matched are reassigned to the same value.
  containerConfig.agentGroupId = id;
  containerConfig.groupName = name;
  containerConfig.assistantName = name;
  writeContainerConfig(agentGroup.folder, containerConfig);
}
/**
 * Build the full argument vector for the container runtime's `run` command.
 *
 * Argument order: `run --rm --name`, env vars, whatever the OneCLI gateway
 * adds, host gateway args, optional user mapping, volume mounts, entrypoint
 * override, image, and finally the command handed to the entrypoint.
 *
 * OneCLI failures are not fatal: they are logged as warnings and the
 * arguments are still returned.
 *
 * @param agentIdentifier - OneCLI agent identifier; when provided, the agent
 *   is ensured in OneCLI before the gateway config is applied.
 * @returns Arguments to pass to the container runtime binary.
 */
async function buildContainerArgs(
  mounts: VolumeMount[],
  containerName: string,
  agentGroup: AgentGroup,
  containerConfig: import('./container-config.js').ContainerConfig,
  provider: string,
  providerContribution: ProviderContainerContribution,
  agentIdentifier?: string,
): Promise<string[]> {
  const args: string[] = ['run', '--rm', '--name', containerName];

  // Environment — only vars read by code we don't own.
  // Everything NanoClaw-specific is in container.json (read by runner at startup).
  args.push('-e', `TZ=${TIMEZONE}`);

  // Provider-contributed env vars (e.g. XDG_DATA_HOME, OPENCODE_*, NO_PROXY).
  const providerEnv = providerContribution.env;
  if (providerEnv) {
    Object.entries(providerEnv).forEach(([name, val]) => {
      args.push('-e', `${name}=${val}`);
    });
  }

  // OneCLI gateway — injects HTTPS_PROXY + certs so container API calls
  // are routed through the agent vault for credential injection.
  try {
    if (agentIdentifier) {
      await onecli.ensureAgent({ name: agentGroup.name, identifier: agentIdentifier });
    }
    const gatewayApplied = await onecli.applyContainerConfig(args, { addHostMapping: false, agent: agentIdentifier });
    if (!gatewayApplied) {
      log.warn('OneCLI gateway not applied — container will have no credentials', { containerName });
    } else {
      log.info('OneCLI gateway applied', { containerName });
    }
  } catch (err) {
    log.warn('OneCLI gateway error — container will have no credentials', { containerName, err });
  }

  // Host gateway
  args.push(...hostGatewayArgs());

  // User mapping — applied only when the host uid is known and is neither
  // 0 nor 1000.
  const uid = process.getuid?.();
  const gid = process.getgid?.();
  const needsUserMapping = uid != null && uid !== 0 && uid !== 1000;
  if (needsUserMapping) {
    args.push('--user', `${uid}:${gid}`);
    args.push('-e', 'HOME=/home/node');
  }

  // Volume mounts
  for (const { hostPath, containerPath, readonly: isReadonly } of mounts) {
    const mountArgs = isReadonly ? readonlyMountArgs(hostPath, containerPath) : ['-v', `${hostPath}:${containerPath}`];
    args.push(...mountArgs);
  }

  // Override entrypoint: run v2 entry point directly via Bun (no tsc, no stdin).
  args.push('--entrypoint', 'bash');

  // Use per-agent-group image if one has been built, otherwise base image
  args.push(containerConfig.imageTag || CONTAINER_IMAGE);
  args.push('-c', 'exec bun run /app/src/index.ts');

  return args;
}
/**
 * Build a per-agent-group image that layers the group's configured apt and
 * npm packages on top of the base image, then record the resulting tag in
 * the group's container.json.
 *
 * The Dockerfile is generated into a temporary file under `DATA_DIR`, which
 * is also used as the build working directory / context.
 *
 * NOTE: the build runs synchronously (up to the 5-minute timeout), so the
 * calling thread is blocked for its duration even though this function is
 * declared `async`.
 *
 * @param agentGroupId - Id of the agent group to build an image for.
 * @throws Error if the agent group does not exist, if it has no packages
 *   configured, or if the runtime build command fails or times out.
 */
export async function buildAgentGroupImage(agentGroupId: string): Promise<void> {
  const agentGroup = getAgentGroup(agentGroupId);
  if (!agentGroup) throw new Error('Agent group not found');

  const containerConfig = readContainerConfig(agentGroup.folder);
  const aptPackages = containerConfig.packages.apt;
  const npmPackages = containerConfig.packages.npm;
  if (aptPackages.length === 0 && npmPackages.length === 0) {
    throw new Error('No packages to install. Use install_packages first.');
  }

  let dockerfile = `FROM ${CONTAINER_IMAGE}\nUSER root\n`;
  if (aptPackages.length > 0) {
    dockerfile += `RUN apt-get update && apt-get install -y ${aptPackages.join(' ')} && rm -rf /var/lib/apt/lists/*\n`;
  }
  if (npmPackages.length > 0) {
    // pnpm skips build scripts unless packages are allowlisted. Append each
    // to /root/.npmrc (base image sets it up for agent-browser) so packages
    // with postinstall — e.g. playwright, puppeteer, native addons — don't
    // install silently broken.
    const allowlist = npmPackages.map((p) => `echo 'only-built-dependencies[]=${p}' >> /root/.npmrc`).join(' && ');
    dockerfile += `RUN ${allowlist} && pnpm install -g ${npmPackages.join(' ')}\n`;
  }
  dockerfile += 'USER node\n';

  const imageTag = `nanoclaw-agent:${agentGroupId}`;
  log.info('Building per-agent-group image', { agentGroupId, imageTag, apt: aptPackages, npm: npmPackages });

  // Write Dockerfile to temp file and build
  const tmpDockerfile = path.join(DATA_DIR, `Dockerfile.${agentGroupId}`);
  fs.writeFileSync(tmpDockerfile, dockerfile);
  try {
    // Pass arguments as an array with no shell in between: a DATA_DIR that
    // contains spaces or shell metacharacters can no longer split or alter
    // the command. This matches how the runtime binary is invoked when
    // spawning containers.
    execFileSync(CONTAINER_RUNTIME_BIN, ['build', '-t', imageTag, '-f', tmpDockerfile, '.'], {
      cwd: DATA_DIR,
      stdio: 'pipe',
      timeout: 300_000,
    });
  } finally {
    // Best-effort cleanup. A failure to remove the temp file must not
    // replace the build error (if any) that is propagating out of `try`.
    try {
      fs.unlinkSync(tmpDockerfile);
    } catch (err) {
      log.warn('Failed to remove temporary Dockerfile', { agentGroupId, tmpDockerfile, err });
    }
  }

  // Store the image tag in groups/<folder>/container.json
  containerConfig.imageTag = imageTag;
  writeContainerConfig(agentGroup.folder, containerConfig);
  log.info('Per-agent-group image built', { agentGroupId, imageTag });
}