diff --git a/setup/auto.ts b/setup/auto.ts index bbe6326e8..8ef87d804 100644 --- a/setup/auto.ts +++ b/setup/auto.ts @@ -71,6 +71,33 @@ function runStep(name: string, extra: string[] = []): Promise { }); } +/** + * After installing Docker, this process's supplementary groups are still + * frozen from login — subsequent steps that talk to /var/run/docker.sock + * (onecli install, service start, …) fail with EACCES even though the + * daemon is up. Detect that and re-exec the whole driver under `sg docker` + * so the rest of the run inherits the docker group without a re-login. + */ +function maybeReexecUnderSg(): void { + if (process.env.NANOCLAW_REEXEC_SG === '1') return; // already re-exec'd + if (process.platform !== 'linux') return; + const info = spawnSync('docker', ['info'], { encoding: 'utf-8' }); + if (info.status === 0) return; + const err = `${info.stderr ?? ''}\n${info.stdout ?? ''}`; + if (!/permission denied/i.test(err)) return; + if (spawnSync('which', ['sg'], { stdio: 'ignore' }).status !== 0) return; + + console.log( + '\n[setup:auto] Docker socket not accessible in current group — ' + + 're-executing under `sg docker` to pick up new group membership.', + ); + const res = spawnSync('sg', ['docker', '-c', 'pnpm run setup:auto'], { + stdio: 'inherit', + env: { ...process.env, NANOCLAW_REEXEC_SG: '1' }, + }); + process.exit(res.status ?? 1); +} + function anthropicSecretExists(): boolean { try { const res = spawnSync('onecli', ['secrets', 'list'], { @@ -132,11 +159,18 @@ async function main(): Promise { 'Install Docker Desktop or start it manually, then retry.', ); } + if (res.fields.ERROR === 'docker_group_not_active') { + fail( + 'Docker was just installed but your shell is not yet in the `docker` group.', + 'Log out and back in (or run `newgrp docker` in a new shell), then retry `pnpm run setup:auto`.', + ); + } fail( 'container build/test failed', 'For stale build cache: `docker builder prune -f`, then retry `pnpm run setup:auto`.', ); } + maybeReexecUnderSg(); } if (!skip.has('onecli')) { diff --git a/setup/container.ts b/setup/container.ts index aadd04c9f..a2e64333e 100644 --- a/setup/container.ts +++ b/setup/container.ts @@ -2,7 +2,7 @@ * Step: container — Build container image and verify with test run. * Replaces 03-setup-container.sh */ -import { execSync } from 'child_process'; +import { execSync, spawnSync } from 'child_process'; import path from 'path'; import { setTimeout as sleep } from 'timers/promises'; @@ -10,20 +10,28 @@ import { log } from '../src/log.js'; import { commandExists, getPlatform } from './platform.js'; import { emitStatus } from './status.js'; +type DockerStatus = 'ok' | 'no-permission' | 'no-daemon' | 'other'; + +function dockerStatus(): DockerStatus { + const res = spawnSync('docker', ['info'], { encoding: 'utf-8' }); + if (res.status === 0) return 'ok'; + const err = `${res.stderr ?? ''}\n${res.stdout ?? ''}`; + if (/permission denied/i.test(err)) return 'no-permission'; + if (/cannot connect|is the docker daemon running|no such file/i.test(err)) return 'no-daemon'; + return 'other'; +} + function dockerRunning(): boolean { - try { - execSync('docker info', { stdio: 'ignore' }); - return true; - } catch { - return false; - } + return dockerStatus() === 'ok'; } /** - * Try to start Docker if it's installed but idle. Poll for up to 60s. - * Returns true once `docker info` succeeds, false if we gave up. + * Try to start Docker if it's installed but idle. Poll up to 60s for the + * daemon to come up — but bail immediately if the socket is reachable and + * only blocked by a group-permission error, since that won't resolve by + * waiting (the caller handles the sg re-exec for that case). */ -async function tryStartDocker(): Promise { +async function tryStartDocker(): Promise { const platform = getPlatform(); log.info('Docker not running — attempting to start', { platform }); @@ -34,22 +42,27 @@ async function tryStartDocker(): Promise { // Inherit stdio so sudo can prompt for a password if needed. execSync('sudo systemctl start docker', { stdio: 'inherit' }); } else { - return false; + return 'other'; } } catch (err) { log.warn('Start command failed', { err }); - return false; + return 'other'; } for (let i = 0; i < 30; i++) { await sleep(2000); - if (dockerRunning()) { + const s = dockerStatus(); + if (s === 'ok') { log.info('Docker is up'); - return true; + return 'ok'; + } + if (s === 'no-permission') { + log.info('Docker daemon is up but socket is not accessible (group membership)'); + return 'no-permission'; } } log.warn('Docker did not become ready within 60s'); - return false; + return 'no-daemon'; } function parseArgs(args: string[]): { runtime: string } { @@ -84,6 +97,15 @@ export async function run(args: string[]): Promise { process.exit(4); } + if (!commandExists('docker')) { + log.info('Docker not found — running setup/install-docker.sh'); + try { + execSync('bash setup/install-docker.sh', { cwd: projectRoot, stdio: 'inherit' }); + } catch (err) { + log.warn('install-docker.sh failed', { err }); + } + } + if (!commandExists('docker')) { emitStatus('SETUP_CONTAINER', { RUNTIME: runtime, @@ -97,16 +119,37 @@ export async function run(args: string[]): Promise { process.exit(2); } - if (!dockerRunning()) { - const started = await tryStartDocker(); - if (!started) { + { + let status = dockerStatus(); + if (status !== 'ok') { + status = await tryStartDocker(); + } + + // Socket is unreachable due to group perms — current shell's supplementary + // groups are fixed at login, so `usermod -aG docker` (via install-docker.sh + // or a prior install) doesn't affect us until next login. Re-exec this + // step under `sg docker` so the child picks up docker as its primary + // group and can talk to /var/run/docker.sock without a logout. + if (status === 'no-permission' && getPlatform() === 'linux' && commandExists('sg')) { + log.info('Re-executing container step under `sg docker`'); + const res = spawnSync( + 'sg', + ['docker', '-c', 'pnpm exec tsx setup/index.ts --step container'], + { cwd: projectRoot, stdio: 'inherit' }, + ); + process.exit(res.status ?? 1); + } + + if (status !== 'ok') { + const error = + status === 'no-permission' ? 'docker_group_not_active' : 'runtime_not_available'; emitStatus('SETUP_CONTAINER', { RUNTIME: runtime, IMAGE: image, BUILD_OK: false, TEST_OK: false, STATUS: 'failed', - ERROR: 'runtime_not_available', + ERROR: error, LOG: 'logs/setup.log', }); process.exit(2); diff --git a/setup/service.ts b/setup/service.ts index bc85d1626..56bf3938d 100644 --- a/setup/service.ts +++ b/setup/service.ts @@ -11,6 +11,7 @@ import path from 'path'; import { log } from '../src/log.js'; import { + commandExists, getPlatform, getNodePath, getServiceManager, @@ -255,12 +256,34 @@ WantedBy=${runningAsRoot ? 'multi-user.target' : 'default.target'}`; fs.writeFileSync(unitPath, unit); log.info('Wrote systemd unit', { unitPath }); - // Detect stale docker group before starting (user systemd only) - const dockerGroupStale = !runningAsRoot && checkDockerGroupStale(); + // Detect stale docker group before starting (user systemd only). The user + // systemd manager is a long-running process whose group list is frozen at + // login, so `usermod -aG docker` mid-session doesn't reach it. Rather than + // require the user to log out + back in, punch a POSIX ACL onto the socket + // that grants the current user rw directly. This is temporary — the socket + // is recreated by dockerd on restart (and by then the user has relogged, so + // normal group perms apply again). + let dockerGroupStale = !runningAsRoot && checkDockerGroupStale(); if (dockerGroupStale) { log.warn( 'Docker group not active in systemd session — user was likely added to docker group mid-session', ); + if (commandExists('setfacl')) { + const user = execSync('whoami', { encoding: 'utf-8' }).trim(); + try { + execSync(`sudo setfacl -m u:${user}:rw /var/run/docker.sock`, { + stdio: 'inherit', + }); + log.info( + 'Applied temporary ACL to /var/run/docker.sock (resets on docker restart or reboot)', + ); + dockerGroupStale = false; + } catch (err) { + log.warn('Failed to apply setfacl workaround', { err }); + } + } else { + log.warn('setfacl not installed — cannot apply automatic workaround'); + } } // Kill orphaned nanoclaw processes to avoid channel connection conflicts