diff --git a/.github/workflows/frontend-release.yml b/.github/workflows/frontend-release.yml index cdd1610b..1ab4012f 100644 --- a/.github/workflows/frontend-release.yml +++ b/.github/workflows/frontend-release.yml @@ -4,6 +4,12 @@ name: Desktop release # Generates a GitHub Release (draft) with installers + update manifests. # Triggered by a `desktop-v*` tag or manually. # +# Each target OS builds on its own runner so the bundled `ao` daemon is compiled +# natively for that platform. build-daemon.mjs keys the binary off the build +# host's platform, so cross-OS packaging (e.g. building the Windows installer on +# macOS) would ship a non-Windows binary named `ao` and the app could not launch +# the daemon (issues #235/#256). The per-OS matrix keeps host == target. +# # ⚠️ Until macOS code signing + notarization secrets are configured (see # frontend/docs/desktop-release.md), published builds are UNSIGNED and will # NOT auto-update on macOS. The workflow still produces installable artifacts. @@ -16,7 +22,11 @@ on: jobs: release: - runs-on: macos-latest + strategy: + fail-fast: false + matrix: + os: [macos-latest, windows-latest] + runs-on: ${{ matrix.os }} permissions: contents: write defaults: @@ -29,6 +39,12 @@ jobs: node-version: 20 cache: npm cache-dependency-path: frontend/package-lock.json + # The daemon is compiled by build-daemon.mjs during prepackage/premake, so + # the Go toolchain must be present and pinned on every runner. + - uses: actions/setup-go@v5 + with: + go-version-file: backend/go.mod + cache-dependency-path: backend/go.sum - run: npm ci - name: Publish run: npm run publish diff --git a/backend/go.mod b/backend/go.mod index 76e1cbbe..cee5b42f 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,6 +3,7 @@ module github.com/aoagents/agent-orchestrator/backend go 1.25.7 require ( + github.com/aymanbagabas/go-pty v0.2.3 github.com/coder/websocket v1.8.14 github.com/creack/pty v1.1.24 github.com/go-chi/chi/v5 v5.1.0 @@ -12,7 +13,7 @@ require ( github.com/spf13/pflag v1.0.9 github.com/swaggest/jsonschema-go v0.3.79 github.com/swaggest/openapi-go v0.2.61 - golang.org/x/sys v0.43.0 + golang.org/x/sys v0.44.0 gopkg.in/yaml.v3 v3.0.1 modernc.org/sqlite v1.51.0 ) @@ -26,7 +27,9 @@ require ( github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/sethvargo/go-retry v0.3.0 // indirect github.com/swaggest/refl v1.4.0 // indirect + github.com/u-root/u-root v0.16.0 // indirect go.uber.org/multierr v1.11.0 // indirect + golang.org/x/crypto v0.51.0 // indirect golang.org/x/sync v0.20.0 // indirect golang.org/x/tools v0.43.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/backend/go.sum b/backend/go.sum index 83ac372d..56fcf87f 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,3 +1,5 @@ +github.com/aymanbagabas/go-pty v0.2.3 h1:hsqcTIUV8I4iTSh3HQl61CR2wh0YPS6gHOYLhAfWu/E= +github.com/aymanbagabas/go-pty v0.2.3/go.mod h1:GLkgQovzqN5A1xMB79yHWiG1rhcquZCjkwKQGKFPdPg= github.com/bool64/dev v0.2.43 h1:yQ7qiZVef6WtCl2vDYU0Y+qSq+0aBrQzY8KXkklk9cQ= github.com/bool64/dev v0.2.43/go.mod h1:iJbh1y/HkunEPhgebWRNcs8wfGq7sjvJ6W5iabL8ACg= github.com/bool64/shared v0.1.5 h1:fp3eUhBsrSjNCQPcSdQqZxxh9bBwrYiZ+zOKFkM0/2E= @@ -19,6 +21,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hugelgupf/vmtest v0.0.0-20240307030256-5d9f3d34a58d h1:nP8SfQJqruIVSWYJTuYc37jLHEY1Z0fF+zKSrs3K/C8= +github.com/hugelgupf/vmtest v0.0.0-20240307030256-5d9f3d34a58d/go.mod h1:B63hDJMhTupLWCHwopAyEo7wRFowx9kOc8m8j1sfOqE= github.com/iancoleman/orderedmap v0.3.0 h1:5cbR2grmZR/DiVt+VJopEhtVs9YGInGIxAoMJn+Ichc= github.com/iancoleman/orderedmap v0.3.0/go.mod h1:XuLcCUkdL5owUCQeF2Ue9uuw1EptkJDkXXS7VoV7XGE= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -54,18 +58,26 @@ github.com/swaggest/openapi-go v0.2.61 h1:psc+LE7pWhEjmJpmkti9tUmBPkkobdUNflBf5P github.com/swaggest/openapi-go v0.2.61/go.mod h1:786CwSwleh1IorB0nfwYGESWf83JgQh6fBc1PeJe4Iw= github.com/swaggest/refl v1.4.0 h1:CftOSdTqRqs100xpFOT/Rifss5xBV/CT0S/FN60Xe9k= github.com/swaggest/refl v1.4.0/go.mod h1:4uUVFVfPJ0NSX9FPwMPspeHos9wPFlCMGoPRllUbpvA= +github.com/u-root/gobusybox/src v0.0.0-20250101170133-2e884e4509c7 h1:dtiVT4SeBUc/vHtwI2HjDZN+FCKTstQBxugIxJEGo9g= +github.com/u-root/gobusybox/src v0.0.0-20250101170133-2e884e4509c7/go.mod h1:PW3wGFCHjdHxAhra5FKvcARbCGqGfentYuPKmuhv8DY= +github.com/u-root/u-root v0.16.0 h1:wY40O83MBVks97+Is0WlFlOPSwKQMIrWP9R1IsrExg8= +github.com/u-root/u-root v0.16.0/go.mod h1:yL/XdSSW27PdGLgUh4MNRBy54mKM+TBLzpwiB4nwj90= github.com/yudai/gojsondiff v1.0.0 h1:27cbfqXLVEJ1o8I6v3y9lg8Ydm53EKqHXAOMxEGlCOA= github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg= github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 h1:BHyfKlQyqbsFN5p3IfnEUduWvb9is428/nNb5L3U01M= github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDfVJdfcVVdX+jpBxNmX4rDAzaS45IcYoM= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= golang.org/x/mod v0.34.0 h1:xIHgNUUnW6sYkcM5Jleh05DvLOtwc6RitGHbDk4akRI= golang.org/x/mod v0.34.0/go.mod h1:ykgH52iCZe79kzLLMhyCUzhMci+nQj+0XkbXpNYtVjY= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= -golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= -golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/tools v0.43.0 h1:12BdW9CeB3Z+J/I/wj34VMl8X+fEXBxVR90JeMX5E7s= golang.org/x/tools v0.43.0/go.mod h1:uHkMso649BX2cZK6+RpuIPXS3ho2hZo4FVwfoy1vIk0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= diff --git a/backend/internal/adapters/agent/codex/codex.go b/backend/internal/adapters/agent/codex/codex.go index 68b33d7d..d8e45d86 100644 --- a/backend/internal/adapters/agent/codex/codex.go +++ b/backend/internal/adapters/agent/codex/codex.go @@ -73,12 +73,13 @@ func (p *Plugin) GetLaunchCommand(ctx context.Context, cfg ports.LaunchConfig) ( appendHookTrustBypassFlag(&cmd) appendApprovalFlags(&cmd, cfg.Permissions) appendSessionHookFlags(&cmd) + appendTerminalCompatibilityFlags(&cmd) appendWorkspaceTrustFlag(&cmd, cfg.WorkspacePath) if cfg.SystemPromptFile != "" { cmd = append(cmd, "-c", "model_instructions_file="+cfg.SystemPromptFile) } else if cfg.SystemPrompt != "" { - cmd = append(cmd, "-c", "developer_instructions="+cfg.SystemPrompt) + cmd = append(cmd, "-c", "developer_instructions="+codexTOMLConfigString(cfg.SystemPrompt)) } if cfg.Prompt != "" { @@ -122,6 +123,7 @@ func (p *Plugin) GetRestoreCommand(ctx context.Context, cfg ports.RestoreConfig) appendHookTrustBypassFlag(&cmd) appendApprovalFlags(&cmd, cfg.Permissions) appendSessionHookFlags(&cmd) + appendTerminalCompatibilityFlags(&cmd) appendWorkspaceTrustFlag(&cmd, cfg.Session.WorkspacePath) cmd = append(cmd, agentSessionID) return cmd, true, nil @@ -154,10 +156,10 @@ func ResolveCodexBinary(ctx context.Context) (string, error) { } if runtime.GOOS == "windows" { - for _, name := range []string{"codex.cmd", "codex.exe", "codex"} { + for _, name := range []string{"codex.exe", "codex.cmd", "codex"} { path, err := exec.LookPath(name) if err == nil && path != "" { - return path, nil + return resolveNativeWindowsCodex(path), nil } if err := ctx.Err(); err != nil { return "", err @@ -166,9 +168,11 @@ func ResolveCodexBinary(ctx context.Context) (string, error) { candidates := []string{} if appData := os.Getenv("APPDATA"); appData != "" { + shim := filepath.Join(appData, "npm", "codex.cmd") + candidates = append(candidates, windowsNativeCodexCandidatesForShim(shim)...) candidates = append(candidates, - filepath.Join(appData, "npm", "codex.cmd"), filepath.Join(appData, "npm", "codex.exe"), + shim, ) } if home, err := os.UserHomeDir(); err == nil { @@ -176,7 +180,7 @@ func ResolveCodexBinary(ctx context.Context) (string, error) { } for _, candidate := range candidates { if fileExists(candidate) { - return candidate, nil + return resolveNativeWindowsCodex(candidate), nil } if err := ctx.Err(); err != nil { return "", err @@ -213,6 +217,26 @@ func ResolveCodexBinary(ctx context.Context) (string, error) { return "", fmt.Errorf("codex: %w", ports.ErrAgentBinaryNotFound) } +func resolveNativeWindowsCodex(path string) string { + if runtime.GOOS != "windows" || !strings.EqualFold(filepath.Ext(path), ".cmd") { + return path + } + for _, candidate := range windowsNativeCodexCandidatesForShim(path) { + if fileExists(candidate) { + return candidate + } + } + return path +} + +func windowsNativeCodexCandidatesForShim(shim string) []string { + dir := filepath.Dir(shim) + return []string{ + filepath.Join(dir, "node_modules", "@openai", "codex", "node_modules", "@openai", "codex-win32-x64", "vendor", "x86_64-pc-windows-msvc", "bin", "codex.exe"), + filepath.Join(dir, "node_modules", "@openai", "codex", "bin", "codex.exe"), + } +} + func (p *Plugin) codexBinary(ctx context.Context) (string, error) { p.binaryMu.Lock() defer p.binaryMu.Unlock() @@ -270,6 +294,12 @@ func appendHookTrustBypassFlag(cmd *[]string) { *cmd = append(*cmd, "--dangerously-bypass-hook-trust") } +func appendTerminalCompatibilityFlags(cmd *[]string) { + if runtime.GOOS == "windows" { + *cmd = append(*cmd, "--no-alt-screen") + } +} + func appendApprovalFlags(cmd *[]string, permissions ports.PermissionMode) { switch normalizePermissionMode(permissions) { case ports.PermissionModeDefault: diff --git a/backend/internal/adapters/agent/codex/codex_test.go b/backend/internal/adapters/agent/codex/codex_test.go index dba7e2e2..c19d5a9d 100644 --- a/backend/internal/adapters/agent/codex/codex_test.go +++ b/backend/internal/adapters/agent/codex/codex_test.go @@ -60,8 +60,11 @@ func TestGetLaunchCommandBuildsCrossPlatformArgv(t *testing.T) { "--dangerously-bypass-approvals-and-sandbox", } want = append(want, sessionHookFlags()...) + if runtime.GOOS == "windows" { + want = append(want, "--no-alt-screen") + } want = append(want, - "-c", `projects={"`+workspace+`"={trust_level="trusted"}}`, + "-c", `projects={`+codexTOMLConfigString(workspace)+`={trust_level="trusted"}}`, "-c", "model_instructions_file="+filepath.Join("tmp", "prompt with spaces.md"), "--", "-fix this", ) @@ -158,7 +161,7 @@ func TestAppendWorkspaceTrustFlagCoversLiteralAndResolvedPaths(t *testing.T) { appendWorkspaceTrustFlag(&cmd, link) want := []string{ "-c", - `projects={"` + link + `"={trust_level="trusted"},"` + target + `"={trust_level="trusted"}}`, + `projects={'` + link + `'={trust_level="trusted"},'` + target + `'={trust_level="trusted"}}`, } if !reflect.DeepEqual(cmd, want) { t.Fatalf("trust flag\nwant: %#v\n got: %#v", want, cmd) @@ -166,7 +169,7 @@ func TestAppendWorkspaceTrustFlagCoversLiteralAndResolvedPaths(t *testing.T) { cmd = nil appendWorkspaceTrustFlag(&cmd, target) - want = []string{"-c", `projects={"` + target + `"={trust_level="trusted"}}`} + want = []string{"-c", `projects={'` + target + `'={trust_level="trusted"}}`} if !reflect.DeepEqual(cmd, want) { t.Fatalf("canonical-path trust flag\nwant: %#v\n got: %#v", want, cmd) } @@ -415,8 +418,11 @@ func TestGetRestoreCommandReadsAgentSessionID(t *testing.T) { "-c", `approvals_reviewer="auto_review"`, } want = append(want, sessionHookFlags()...) + if runtime.GOOS == "windows" { + want = append(want, "--no-alt-screen") + } want = append(want, - "-c", `projects={"`+workspace+`"={trust_level="trusted"}}`, + "-c", `projects={`+codexTOMLConfigString(workspace)+`={trust_level="trusted"}}`, "thread-123", ) if !reflect.DeepEqual(cmd, want) { @@ -566,7 +572,7 @@ func TestDoctorLaunchProbesMirrorLaunchFlags(t *testing.T) { for _, want := range []string{ "hooks.SessionStart=", "hooks.UserPromptSubmit=", "hooks.PermissionRequest=", "hooks.Stop=", "notice.hide_rate_limit_model_nudge=true", - `projects={"`, + `projects={`, } { if !strings.Contains(joined, want) { t.Fatalf("override probe missing %q in %s", want, joined) diff --git a/backend/internal/adapters/agent/codex/hooks.go b/backend/internal/adapters/agent/codex/hooks.go index 031d28ae..8092127b 100644 --- a/backend/internal/adapters/agent/codex/hooks.go +++ b/backend/internal/adapters/agent/codex/hooks.go @@ -105,11 +105,22 @@ func appendWorkspaceTrustFlag(cmd *[]string, workspacePath string) { } entries := make([]string, 0, len(keys)) for _, key := range keys { - entries = append(entries, codexTOMLBasicString(key)+`={trust_level="trusted"}`) + entries = append(entries, codexTOMLConfigString(key)+`={trust_level="trusted"}`) } *cmd = append(*cmd, "-c", "projects={"+strings.Join(entries, ",")+"}") } +func codexTOMLConfigString(s string) string { + if !containsTOMLControl(s) && !strings.Contains(s, "'") { + return codexTOMLLiteralString(s) + } + return codexTOMLBasicString(s) +} + +func codexTOMLLiteralString(s string) string { + return "'" + s + "'" +} + // codexTOMLBasicString renders s as a TOML basic string, escaping backslashes // and quotes (Windows paths) plus control characters so the value survives // Codex's TOML parse of the `-c` override. @@ -132,6 +143,15 @@ func codexTOMLBasicString(s string) string { return b.String() } +func containsTOMLControl(s string) bool { + for _, r := range s { + if r < 0x20 || r == 0x7f { + return true + } + } + return false +} + // GetAgentHooks no longer installs workspace files — Codex never loads them // from AO's worktrees (see the package comment above); the hooks ride the // launch command instead. It still strips hook entries that older AO versions diff --git a/backend/internal/adapters/runtime/zellij/commands.go b/backend/internal/adapters/runtime/zellij/commands.go index 5b035b69..3e834212 100644 --- a/backend/internal/adapters/runtime/zellij/commands.go +++ b/backend/internal/adapters/runtime/zellij/commands.go @@ -1,10 +1,14 @@ package zellij import ( + "encoding/base64" + "encoding/binary" "fmt" + "runtime" "sort" "strconv" "strings" + "unicode/utf16" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) @@ -35,6 +39,9 @@ func listPanesArgs(id string) []string { } func pasteArgs(id, paneID, chunk string) []string { + if runtime.GOOS == "windows" { + return []string{"--session", id, "action", "write-chars", "--pane-id", paneID, chunk} + } return []string{"--session", id, "action", "paste", "--pane-id", paneID, chunk} } @@ -77,11 +84,74 @@ func terminalPaneID(id int) string { } func buildLayout(cfg ports.RuntimeConfig, shellPath string) string { + if runtime.GOOS == "windows" { + return directLayoutString(cfg.WorkspacePath, cfg.Argv) + } spec := shellLaunchSpecFor(shellPath) shellCommand := shellLaunchCommand(cfg, shellPath, spec) return layoutString(cfg.WorkspacePath, shellPath, spec.args, shellCommand) } +// windowsLaunchArgv returns the argv zellij executes on Windows to start the +// agent. The trampoline reads the launch spec from AO_LAUNCH_SPEC, so KDL +// args quoting cannot mangle codex's `--config key=value` flags. +func windowsLaunchArgv(launcherBinary string) []string { + command := launcherBinary + if command == "" { + command = "ao" + } + return []string{command, "launch"} +} + +func windowsFallbackShellArgv(shellPath string) []string { + if strings.TrimSpace(shellPath) == "" { + shellPath = "powershell.exe" + } + base := strings.ToLower(filepathBase(shellPath)) + if strings.Contains(base, "cmd") { + return []string{shellPath, "/D", "/Q", "/K"} + } + if strings.Contains(base, "powershell") || strings.Contains(base, "pwsh") { + return []string{shellPath, "-NoLogo", "-NoProfile", "-NoExit"} + } + if strings.Contains(base, "sh") { + return []string{shellPath, "-i"} + } + return []string{shellPath} +} + +// directLayoutString builds a layout that runs argv[0] with argv[1:] as zellij +// `args`, with no intermediate shell. Used on Windows where wrapping the agent +// in powershell/cmd quoting is unsound for arbitrary argv (e.g. codex's +// `--config key="value with spaces"`). +func directLayoutString(workspacePath string, argv []string) string { + command := "" + args := []string{} + if len(argv) > 0 { + command = argv[0] + args = argv[1:] + } + + var b strings.Builder + b.WriteString("layout {\n") + b.WriteString(" cwd ") + b.WriteString(kdlQuote(workspacePath)) + b.WriteString("\n") + b.WriteString(" pane command=") + b.WriteString(kdlQuote(command)) + b.WriteString(" name=") + b.WriteString(kdlQuote(agentPaneName)) + b.WriteString(" borderless=true {\n") + if len(args) > 0 { + b.WriteString(" args ") + b.WriteString(kdlJoin(args)) + b.WriteString("\n") + } + b.WriteString(" }\n") + b.WriteString("}\n") + return b.String() +} + type shellLaunchSpec struct { args []string } @@ -92,7 +162,7 @@ func shellLaunchSpecFor(shellPath string) shellLaunchSpec { return shellLaunchSpec{args: []string{"/D", "/S", "/K"}} } if strings.Contains(base, "powershell") || strings.Contains(base, "pwsh") { - return shellLaunchSpec{args: []string{"-NoLogo", "-NoProfile", "-NoExit", "-Command"}} + return shellLaunchSpec{args: []string{"-NoLogo", "-NoProfile", "-NoExit", "-EncodedCommand"}} } return shellLaunchSpec{args: []string{"-lc"}} } @@ -168,7 +238,21 @@ func wrapLaunchCommandPowerShell(cfg ports.RuntimeConfig) string { b.WriteString("; ") } b.WriteString(quoteArgvPowerShell(cfg.Argv)) - return b.String() + return powerShellEncodedCommand(b.String()) +} + +// powerShellEncodedCommand returns the base64'd UTF-16-LE form of script, +// suitable for `powershell.exe -EncodedCommand`. zellij's KDL `args` quoting +// is not robust enough to round-trip arbitrary PowerShell script text through +// a plain `-Command` argv slot, so we hand PowerShell a single opaque base64 +// blob instead. +func powerShellEncodedCommand(script string) string { + words := utf16.Encode([]rune(script)) + buf := make([]byte, len(words)*2) + for i, word := range words { + binary.LittleEndian.PutUint16(buf[i*2:], word) + } + return base64.StdEncoding.EncodeToString(buf) } func wrapLaunchCommandCmd(cfg ports.RuntimeConfig) string { diff --git a/backend/internal/adapters/runtime/zellij/process_other.go b/backend/internal/adapters/runtime/zellij/process_other.go new file mode 100644 index 00000000..fa963c4c --- /dev/null +++ b/backend/internal/adapters/runtime/zellij/process_other.go @@ -0,0 +1,12 @@ +//go:build !windows + +package zellij + +import "errors" + +// startBackgroundProcess is a stub: the fire-and-forget path is only used by +// the Windows zellij codepath. Non-Windows builds create sessions +// synchronously via runner.Run. +func startBackgroundProcess(env []string, name string, args ...string) error { + return errors.New("zellij runtime: background spawn is windows-only") +} diff --git a/backend/internal/adapters/runtime/zellij/process_windows.go b/backend/internal/adapters/runtime/zellij/process_windows.go new file mode 100644 index 00000000..f58ea45e --- /dev/null +++ b/backend/internal/adapters/runtime/zellij/process_windows.go @@ -0,0 +1,68 @@ +//go:build windows + +package zellij + +import ( + "os/exec" + "strings" + "syscall" + + "golang.org/x/sys/windows" +) + +func startBackgroundProcess(env []string, name string, args ...string) error { + script := "Start-Process -FilePath " + psQuote(name) + " -ArgumentList " + psQuote(windowsCommandLine(args)) + " -WindowStyle Hidden" + cmd := exec.Command("powershell.exe", "-NoLogo", "-NoProfile", "-EncodedCommand", powerShellEncodedCommand(script)) + cmd.Env = env + cmd.SysProcAttr = &syscall.SysProcAttr{ + CreationFlags: windows.CREATE_NEW_CONSOLE, + HideWindow: true, + } + if err := cmd.Start(); err != nil { + return err + } + go func() { _ = cmd.Wait() }() + return nil +} + +func windowsCommandLine(args []string) string { + quoted := make([]string, len(args)) + for i, arg := range args { + quoted[i] = windowsQuoteArg(arg) + } + return strings.Join(quoted, " ") +} + +func windowsQuoteArg(arg string) string { + if arg == "" { + return `""` + } + if !strings.ContainsAny(arg, " \t\"") { + return arg + } + + var b strings.Builder + b.WriteByte('"') + backslashes := 0 + for _, r := range arg { + switch r { + case '\\': + backslashes++ + case '"': + b.WriteString(strings.Repeat(`\`, backslashes*2+1)) + b.WriteRune(r) + backslashes = 0 + default: + if backslashes > 0 { + b.WriteString(strings.Repeat(`\`, backslashes)) + backslashes = 0 + } + b.WriteRune(r) + } + } + if backslashes > 0 { + b.WriteString(strings.Repeat(`\`, backslashes*2)) + } + b.WriteByte('"') + return b.String() +} diff --git a/backend/internal/adapters/runtime/zellij/zellij.go b/backend/internal/adapters/runtime/zellij/zellij.go index 938815f6..f71f5ed2 100644 --- a/backend/internal/adapters/runtime/zellij/zellij.go +++ b/backend/internal/adapters/runtime/zellij/zellij.go @@ -10,6 +10,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "regexp" "runtime" "strconv" @@ -17,33 +18,41 @@ import ( "time" "unicode/utf8" + "github.com/aoagents/agent-orchestrator/backend/internal/agentlaunch" "github.com/aoagents/agent-orchestrator/backend/internal/domain" "github.com/aoagents/agent-orchestrator/backend/internal/ports" ) const ( - defaultTimeout = 5 * time.Second - defaultZellijTerm = "xterm-256color" - defaultZellijColor = "truecolor" - minMajor = 0 - minMinor = 44 - minPatch = 3 + defaultTimeout = 5 * time.Second + defaultWindowsTimeout = 30 * time.Second + defaultZellijTerm = "xterm-256color" + defaultZellijColor = "truecolor" + minMajor = 0 + minMinor = 44 + minPatch = 3 ) var sessionIDPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`) var paneIDPattern = regexp.MustCompile(`^terminal_\d+$`) var getenv = os.Getenv +var lookPath = exec.LookPath +var fileExists = func(path string) bool { + info, err := os.Stat(path) + return err == nil && !info.IsDir() +} // Options configures a zellij Runtime; every field has a sensible default // (see New), so the zero value is usable. type Options struct { - Binary string - Timeout time.Duration - Shell string - SocketDir string - ConfigDir string - ChunkSize int + Binary string + Timeout time.Duration + Shell string + SocketDir string + ConfigDir string + ChunkSize int + LauncherBinary string } // Runtime runs agent sessions inside zellij sessions, driving them via the @@ -55,6 +64,7 @@ type Runtime struct { socketDir string configDir string chunkSize int + launcher string runner runner } @@ -75,6 +85,7 @@ func DefaultSocketDir() string { type runner interface { Run(ctx context.Context, env []string, name string, args ...string) ([]byte, error) + Start(env []string, name string, args ...string) error } type execRunner struct{} @@ -85,17 +96,21 @@ func (execRunner) Run(ctx context.Context, env []string, name string, args ...st return cmd.CombinedOutput() } +func (execRunner) Start(env []string, name string, args ...string) error { + return startBackgroundProcess(zellijCommandEnv(os.Environ(), env), name, args...) +} + // New builds a zellij Runtime, filling unset Options with defaults: binary // "zellij", shell from $SHELL (else /bin/sh, or powershell.exe on Windows), and // the default timeout and output chunk size. func New(opts Options) *Runtime { binary := opts.Binary if binary == "" { - binary = "zellij" + binary = defaultBinary() } timeout := opts.Timeout if timeout == 0 { - timeout = defaultTimeout + timeout = defaultCommandTimeout() } shellPath := opts.Shell if shellPath == "" { @@ -112,7 +127,75 @@ func New(opts Options) *Runtime { if chunkSize <= 0 { chunkSize = defaultChunkBytes } - return &Runtime{binary: binary, timeout: timeout, shell: shellPath, socketDir: opts.SocketDir, configDir: opts.ConfigDir, chunkSize: chunkSize, runner: execRunner{}} + launcher := opts.LauncherBinary + if launcher == "" { + launcher = defaultLauncherBinary() + } + return &Runtime{binary: binary, timeout: timeout, shell: shellPath, socketDir: opts.SocketDir, configDir: opts.ConfigDir, chunkSize: chunkSize, launcher: launcher, runner: execRunner{}} +} + +// defaultLauncherBinary returns the path used by the zellij Windows codepath +// to invoke the `ao launch` trampoline. On Windows the agent's argv is +// persisted to a temp spec file (see agentlaunch); zellij then runs this +// binary with `launch` and it execs the real agent. Falls back to plain "ao" +// if the daemon binary path cannot be resolved (PATH lookup at runtime). +func defaultLauncherBinary() string { + path, err := os.Executable() + if err == nil && isLauncherBinary(path) { + return path + } + return "ao" +} + +func isLauncherBinary(path string) bool { + name := strings.ToLower(filepath.Base(path)) + if runtime.GOOS == "windows" { + name = strings.TrimSuffix(name, ".exe") + } + return name == "ao" +} + +func defaultCommandTimeout() time.Duration { + if runtime.GOOS == "windows" { + return defaultWindowsTimeout + } + return defaultTimeout +} + +func defaultBinary() string { + names := []string{"zellij"} + if runtime.GOOS == "windows" { + names = []string{"zellij.exe", "zellij"} + } + for _, name := range names { + if path, err := lookPath(name); err == nil && path != "" { + return path + } + } + if runtime.GOOS == "windows" { + for _, candidate := range windowsZellijCandidates() { + if fileExists(candidate) { + return candidate + } + } + } + return "zellij" +} + +func windowsZellijCandidates() []string { + candidates := []string{} + if localAppData := getenv("LOCALAPPDATA"); localAppData != "" { + candidates = append(candidates, filepath.Join(localAppData, "Programs", "zellij", "zellij.exe")) + } + for _, key := range []string{"ProgramFiles", "ProgramFiles(x86)"} { + if dir := getenv(key); dir != "" { + candidates = append(candidates, + filepath.Join(dir, "zellij", "zellij.exe"), + filepath.Join(dir, "Zellij", "zellij.exe"), + ) + } + } + return candidates } // Create starts a new zellij session in the workspace, running the agent's @@ -142,13 +225,19 @@ func (r *Runtime) Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.Ru return ports.RuntimeHandle{}, err } - layoutPath, err := r.writeLayout(cfg) + layoutPath, launchEnv, cleanupLaunchSpec, err := r.writeLayout(cfg) if err != nil { return ports.RuntimeHandle{}, err } defer func() { _ = os.Remove(layoutPath) }() + cleanupOnFailure := true + defer func() { + if cleanupOnFailure && cleanupLaunchSpec != nil { + cleanupLaunchSpec() + } + }() - if _, err := r.run(ctx, createSessionArgs(id, layoutPath)...); err != nil { + if err := r.createSession(ctx, id, layoutPath, launchEnv); err != nil { return ports.RuntimeHandle{}, fmt.Errorf("zellij runtime: create session %s: %w", id, err) } paneID, err := r.findAgentPane(ctx, id) @@ -156,7 +245,34 @@ func (r *Runtime) Create(ctx context.Context, cfg ports.RuntimeConfig) (ports.Ru _ = r.Destroy(context.Background(), ports.RuntimeHandle{ID: id}) return ports.RuntimeHandle{}, err } - return ports.RuntimeHandle{ID: handleIDValue(id, paneID)}, nil + if err := r.waitForPaneReady(ctx, id, paneID); err != nil { + _ = r.Destroy(context.Background(), ports.RuntimeHandle{ID: id}) + return ports.RuntimeHandle{}, err + } + handle := ports.RuntimeHandle{ID: handleIDValue(id, paneID)} + alive, err := r.IsAlive(ctx, handle) + if err != nil { + _ = r.Destroy(context.Background(), handle) + return ports.RuntimeHandle{}, fmt.Errorf("zellij runtime: verify session %s: %w", id, err) + } + if !alive { + _ = r.Destroy(context.Background(), handle) + return ports.RuntimeHandle{}, fmt.Errorf("zellij runtime: session %s exited before ready", id) + } + cleanupOnFailure = false + return handle, nil +} + +// createSession runs `zellij attach --create-background`. On Windows we spawn +// it via runner.Start (fire-and-forget) because the inherited daemon stdio +// confuses zellij's own readiness probe; on Unix we keep the synchronous run. +func (r *Runtime) createSession(ctx context.Context, id, layoutPath string, env map[string]string) error { + args := createSessionArgs(id, layoutPath) + if runtime.GOOS != "windows" { + _, err := r.run(ctx, args...) + return err + } + return r.startWithEnv(env, args...) } // Destroy kills the handle's zellij session and deletes its serialized state, @@ -208,7 +324,7 @@ func (r *Runtime) GetOutput(ctx context.Context, handle ports.RuntimeHandle, lin if err != nil { return "", fmt.Errorf("zellij runtime: capture output %s/%s: %w", id, paneID, err) } - return tailLines(string(out), lines), nil + return tailLines(trimTrailingBlankLines(string(out)), lines), nil } // IsAlive reports whether the handle's session still appears in `zellij @@ -242,15 +358,17 @@ func noActiveSessionsOutput(out string) bool { } // AttachCommand returns the argv a human runs to attach their terminal to the -// session. -func (r *Runtime) AttachCommand(handle ports.RuntimeHandle) ([]string, error) { +// session, plus an optional env block that the spawn should apply (used on +// Windows where wrapping the attach in an `env` shim is unsafe under ConPTY). +func (r *Runtime) AttachCommand(handle ports.RuntimeHandle) ([]string, []string, error) { id, _, err := handleID(handle) if err != nil { - return nil, err + return nil, nil, err } args := append([]string{}, r.baseArgs()...) args = append(args, attachArgs(id)...) - return attachCommandWithEnv(r.binary, r.socketDir, args...), nil + argv, env := attachCommandWithEnv(r.binary, r.socketDir, args...) + return argv, env, nil } func (r *Runtime) ensureSupportedVersion(ctx context.Context) error { @@ -264,22 +382,77 @@ func (r *Runtime) ensureSupportedVersion(ctx context.Context) error { return nil } -func (r *Runtime) writeLayout(cfg ports.RuntimeConfig) (string, error) { +func (r *Runtime) writeLayout(cfg ports.RuntimeConfig) (string, map[string]string, func(), error) { + launchEnv := cfg.Env + var cleanupLaunchSpec func() + if runtime.GOOS == "windows" { + specPath, err := agentlaunch.WriteTemp(agentlaunch.Spec{ + WorkspacePath: cfg.WorkspacePath, + Argv: cfg.Argv, + FallbackArgv: windowsFallbackShellArgv(r.shell), + }) + if err != nil { + return "", nil, nil, fmt.Errorf("zellij runtime: %w", err) + } + cleanupLaunchSpec = func() { _ = os.Remove(specPath) } + cfg.Argv = windowsLaunchArgv(r.launcher) + launchEnv = windowsLaunchEnv(cfg.Env, r.launcher, specPath) + } + file, err := os.CreateTemp(os.TempDir(), "ao-zellij-layout-*.kdl") if err != nil { - return "", fmt.Errorf("zellij runtime: create layout temp file: %w", err) + if cleanupLaunchSpec != nil { + cleanupLaunchSpec() + } + return "", nil, nil, fmt.Errorf("zellij runtime: create layout temp file: %w", err) } path := file.Name() if _, err := file.WriteString(buildLayout(cfg, r.shell)); err != nil { _ = file.Close() _ = os.Remove(path) - return "", fmt.Errorf("zellij runtime: write layout temp file: %w", err) + if cleanupLaunchSpec != nil { + cleanupLaunchSpec() + } + return "", nil, nil, fmt.Errorf("zellij runtime: write layout temp file: %w", err) } if err := file.Close(); err != nil { _ = os.Remove(path) - return "", fmt.Errorf("zellij runtime: close layout temp file: %w", err) + if cleanupLaunchSpec != nil { + cleanupLaunchSpec() + } + return "", nil, nil, fmt.Errorf("zellij runtime: close layout temp file: %w", err) + } + return path, launchEnv, cleanupLaunchSpec, nil +} + +// windowsLaunchEnv augments cfg.Env with the AO_LAUNCH_SPEC pointer the `ao +// launch` trampoline reads, and prepends the launcher's directory to PATH so +// the trampoline (i.e. `ao` itself) is resolvable in the spawned environment. +func windowsLaunchEnv(env map[string]string, launcherBinary, specPath string) map[string]string { + launchEnv := make(map[string]string, len(env)+2) + for k, v := range env { + launchEnv[k] = v + } + launchEnv[agentlaunch.EnvSpecPath] = specPath + if dir := launcherDir(launcherBinary); dir != "" { + base := launchEnv["PATH"] + if base == "" { + base = getenv("PATH") + } + if base == "" { + launchEnv["PATH"] = dir + } else { + launchEnv["PATH"] = dir + string(os.PathListSeparator) + base + } + } + return launchEnv +} + +func launcherDir(launcherBinary string) string { + if launcherBinary == "" || !filepath.IsAbs(launcherBinary) { + return "" } - return path, nil + return filepath.Dir(launcherBinary) } func (r *Runtime) findAgentPane(ctx context.Context, id string) (string, error) { @@ -307,6 +480,42 @@ func (r *Runtime) findAgentPane(ctx context.Context, id string) (string, error) } } +func (r *Runtime) waitForPaneReady(ctx context.Context, id, paneID string) error { + if runtime.GOOS != "windows" { + return nil + } + + deadline := time.Now().Add(r.timeout) + var lastErr error + for { + out, err := r.run(ctx, listPanesArgs(id)...) + if err == nil { + pane, parseErr := paneByID(out, paneID) + if parseErr == nil { + if pane.Exited { + return fmt.Errorf("zellij runtime: pane %s/%s exited before ready", id, paneID) + } + if paneReady(pane) { + return nil + } + lastErr = fmt.Errorf("pane %s/%s is not ready", id, paneID) + } else { + lastErr = parseErr + } + } else { + lastErr = err + } + if time.Now().After(deadline) { + return fmt.Errorf("zellij runtime: wait for pane %s/%s: %w", id, paneID, lastErr) + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(50 * time.Millisecond): + } + } +} + func (r *Runtime) run(ctx context.Context, args ...string) ([]byte, error) { cmdCtx, cancel := context.WithTimeout(ctx, r.timeout) defer cancel() @@ -321,6 +530,17 @@ func (r *Runtime) run(ctx context.Context, args ...string) ([]byte, error) { return out, nil } +// startWithEnv fires zellij in the background with extra env vars merged onto +// the runtime's base env. Used by the Windows createSession path so the daemon +// is not blocked waiting on zellij's `--create-background` to settle. +func (r *Runtime) startWithEnv(extra map[string]string, args ...string) error { + fullArgs := append(r.baseArgs(), args...) + if err := r.runner.Start(r.envWith(extra), r.binary, fullArgs...); err != nil { + return commandError{err: err} + } + return nil +} + func (r *Runtime) baseArgs() []string { args := []string{} if r.configDir != "" { @@ -330,44 +550,45 @@ func (r *Runtime) baseArgs() []string { } func (r *Runtime) env() []string { + return r.envWith(nil) +} + +func (r *Runtime) envWith(extra map[string]string) []string { env := zellijColorEnv(nil) if r.socketDir == "" { - return env + return appendRuntimeEnv(env, extra) } - return append(env, "ZELLIJ_SOCKET_DIR="+r.socketDir) + env = append(env, "ZELLIJ_SOCKET_DIR="+r.socketDir) + return appendRuntimeEnv(env, extra) } -func attachCommandWithEnv(binary, socketDir string, args ...string) []string { - env := zellijColorEnv(nil) - if socketDir != "" { - env = append(env, "ZELLIJ_SOCKET_DIR="+socketDir) +func appendRuntimeEnv(env []string, extra map[string]string) []string { + for _, key := range sortedKeys(extra) { + env = append(env, key+"="+extra[key]) } + return env +} + +func attachCommandWithEnv(binary, socketDir string, args ...string) ([]string, []string) { if runtime.GOOS == "windows" { - command := strings.Builder{} - command.WriteString("Remove-Item Env:NO_COLOR -ErrorAction SilentlyContinue; ") - for _, pair := range env { - key, value, ok := strings.Cut(pair, "=") - if !ok { - continue - } - command.WriteString("$env:") - command.WriteString(key) - command.WriteString(" = ") - command.WriteString(psQuote(value)) - command.WriteString("; ") - } - command.WriteString("& ") - command.WriteString(psQuote(binary)) - for _, arg := range args { - command.WriteByte(' ') - command.WriteString(psQuote(arg)) + // Windows ConPTY attaches the child directly. Avoid shell wrappers here: + // malformed ConPTY startup around powershell.exe/cmd.exe surfaces as modal + // application-error dialogs. Per-session ZELLIJ_SOCKET_DIR is delivered + // via the spawn's env block (CreateProcess) instead of an `env` shim. + var envBlock []string + if socketDir != "" { + envBlock = upsertEnv(append([]string(nil), os.Environ()...), "ZELLIJ_SOCKET_DIR="+socketDir) } - return []string{"powershell.exe", "-NoLogo", "-NoProfile", "-Command", command.String()} + return append([]string{binary}, args...), envBlock + } + env := zellijColorEnv(nil) + if socketDir != "" { + env = append(env, "ZELLIJ_SOCKET_DIR="+socketDir) } argv := []string{"env", "-u", "NO_COLOR"} argv = append(argv, env...) argv = append(argv, binary) - return append(argv, args...) + return append(argv, args...), nil } func zellijCommandEnv(base, overrides []string) []string { @@ -501,15 +722,18 @@ func handleID(handle ports.RuntimeHandle) (string, string, error) { } type paneInfo struct { - ID int `json:"id"` - IsPlugin bool `json:"is_plugin"` - Title string `json:"title"` + ID int `json:"id"` + IsPlugin bool `json:"is_plugin"` + Title string `json:"title"` + Exited bool `json:"exited"` + TerminalCommand string `json:"terminal_command"` + PaneCommand string `json:"pane_command"` } func agentPaneID(out []byte) (string, error) { - var panes []paneInfo - if err := json.Unmarshal(out, &panes); err != nil { - return "", fmt.Errorf("parse panes: %w", err) + panes, err := parsePanes(out) + if err != nil { + return "", err } for _, pane := range panes { if !pane.IsPlugin && pane.Title == agentPaneName { @@ -524,6 +748,34 @@ func agentPaneID(out []byte) (string, error) { return "", errors.New("agent pane not found") } +func paneByID(out []byte, paneID string) (paneInfo, error) { + panes, err := parsePanes(out) + if err != nil { + return paneInfo{}, err + } + for _, pane := range panes { + if !pane.IsPlugin && terminalPaneID(pane.ID) == paneID { + return pane, nil + } + } + return paneInfo{}, fmt.Errorf("pane %s not found", paneID) +} + +func parsePanes(out []byte) ([]paneInfo, error) { + var panes []paneInfo + if err := json.Unmarshal(out, &panes); err != nil { + return nil, fmt.Errorf("parse panes: %w", err) + } + return panes, nil +} + +func paneReady(pane paneInfo) bool { + if pane.PaneCommand != "" { + return true + } + return pane.TerminalCommand == "" +} + func chunks(s string, maxBytes int) []string { if s == "" { return []string{""} @@ -565,6 +817,20 @@ func tailLines(s string, n int) string { return strings.Join(lines[len(lines)-n:], "") } +func trimTrailingBlankLines(s string) string { + if s == "" { + return "" + } + lines := strings.SplitAfter(s, "\n") + if lines[len(lines)-1] == "" { + lines = lines[:len(lines)-1] + } + for len(lines) > 0 && strings.TrimRight(lines[len(lines)-1], "\r\n") == "" { + lines = lines[:len(lines)-1] + } + return strings.Join(lines, "") +} + // RequiredVersion returns the minimum Zellij version AO's runtime adapter // supports. func RequiredVersion() string { return minSupportedVersion().String() } diff --git a/backend/internal/adapters/runtime/zellij/zellij_integration_test.go b/backend/internal/adapters/runtime/zellij/zellij_integration_test.go index fdbd261a..c0d7d390 100644 --- a/backend/internal/adapters/runtime/zellij/zellij_integration_test.go +++ b/backend/internal/adapters/runtime/zellij/zellij_integration_test.go @@ -5,6 +5,7 @@ import ( "os" "os/exec" "path/filepath" + "runtime" "strings" "testing" "time" @@ -19,18 +20,26 @@ func TestRuntimeIntegration(t *testing.T) { ctx := context.Background() id := "ao_itest_zj" - socketDir := filepath.Join(os.TempDir(), "ao-zj-itest") - if err := os.MkdirAll(socketDir, 0o755); err != nil { - t.Fatalf("mkdir socket dir: %v", err) - } + socketDir := tempSocketDir(t, "ao-zj-itest-") configDir := t.TempDir() - r := New(Options{Timeout: 5 * time.Second, SocketDir: socketDir, ConfigDir: configDir}) + opts := Options{Timeout: 5 * time.Second, SocketDir: socketDir, ConfigDir: configDir} + if runtime.GOOS == "windows" { + opts.Timeout = 30 * time.Second + opts.LauncherBinary = buildAOForIntegration(t) + opts.Shell = "cmd.exe" + } + r := New(opts) _ = r.Destroy(ctx, ports.RuntimeHandle{ID: id}) + argv := []string{"sh", "-c", "echo ready-$AO_SESSION_ID"} + sendCommand := "echo hello-from-zellij" + if runtime.GOOS == "windows" { + argv = []string{"cmd.exe", "/D", "/Q", "/K", "echo ready-%AO_SESSION_ID%"} + } h, err := r.Create(ctx, ports.RuntimeConfig{ SessionID: "ao_itest_zj", WorkspacePath: t.TempDir(), - Argv: []string{"sh", "-c", "printf ready-$AO_SESSION_ID\\n"}, + Argv: argv, Env: map[string]string{"AO_SESSION_ID": id}, }) if err != nil { @@ -45,22 +54,22 @@ func TestRuntimeIntegration(t *testing.T) { if !alive { t.Fatal("alive = false, want true") } + prefixAlive, err := r.IsAlive(ctx, ports.RuntimeHandle{ID: "ao_itest"}) + if err != nil { + t.Fatalf("IsAlive prefix: %v", err) + } + if prefixAlive { + t.Fatal("prefix handle reported alive; zellij session matching is not exact") + } - if err := r.SendMessage(ctx, h, "printf hello-from-zellij"); err != nil { - t.Fatalf("SendMessage: %v", err) + out := waitForRuntimeOutput(t, r, h, "ready-") + if !strings.Contains(out, "ready-") { + t.Fatalf("output = %q, want ready output", out) } - deadline := time.Now().Add(3 * time.Second) - var out string - for time.Now().Before(deadline) { - out, err = r.GetOutput(ctx, h, 30) - if err != nil { - t.Fatalf("GetOutput: %v", err) - } - if strings.Contains(out, "hello-from-zellij") { - break - } - time.Sleep(100 * time.Millisecond) + if err := r.SendMessage(ctx, h, sendCommand); err != nil { + t.Fatalf("SendMessage: %v", err) } + out = waitForRuntimeOutput(t, r, h, "hello-from-zellij") if !strings.Contains(out, "hello-from-zellij") { t.Fatalf("output = %q, want sent command output", out) } @@ -77,16 +86,55 @@ func TestRuntimeIntegration(t *testing.T) { } } +func waitForRuntimeOutput(t *testing.T, r *Runtime, h ports.RuntimeHandle, want string) string { + t.Helper() + deadline := time.Now().Add(3 * time.Second) + var out string + for time.Now().Before(deadline) { + var err error + out, err = r.GetOutput(context.Background(), h, 30) + if err != nil { + t.Fatalf("GetOutput: %v", err) + } + if strings.Contains(out, want) { + return out + } + time.Sleep(100 * time.Millisecond) + } + return out +} + +func buildAOForIntegration(t *testing.T) string { + t.Helper() + out := filepath.Join(t.TempDir(), "ao.exe") + cmd := exec.Command("go", "build", "-o", out, "./cmd/ao") + cmd.Dir = filepath.Clean(filepath.Join("..", "..", "..", "..")) + if raw, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("build ao test launcher: %v: %s", err, strings.TrimSpace(string(raw))) + } + return out +} + +func tempSocketDir(t *testing.T, pattern string) string { + t.Helper() + socketDir, err := os.MkdirTemp(os.TempDir(), pattern) + if err != nil { + t.Fatalf("mkdir socket dir: %v", err) + } + t.Cleanup(func() { _ = os.RemoveAll(socketDir) }) + return socketDir +} + func TestRuntimeIntegrationUsesExactSessionParsing(t *testing.T) { if _, err := exec.LookPath("zellij"); err != nil { t.Skip("zellij unavailable") } + if runtime.GOOS == "windows" { + t.Skip("exact session parsing is covered by TestRuntimeIntegration on Windows") + } ctx := context.Background() - socketDir := filepath.Join(os.TempDir(), "ao-zj-exact-itest") - if err := os.MkdirAll(socketDir, 0o755); err != nil { - t.Fatalf("mkdir socket dir: %v", err) - } + socketDir := tempSocketDir(t, "ao-zj-exact-itest-") r := New(Options{Timeout: 5 * time.Second, SocketDir: socketDir, ConfigDir: t.TempDir()}) longID := "ao_zj_exact_long" prefixID := "ao_zj_exact" @@ -96,7 +144,7 @@ func TestRuntimeIntegrationUsesExactSessionParsing(t *testing.T) { h, err := r.Create(ctx, ports.RuntimeConfig{ SessionID: "ao_zj_exact_long", WorkspacePath: t.TempDir(), - Argv: []string{"printf", "ready\\n"}, + Argv: []string{"printf", "ready\n"}, }) if err != nil { t.Fatalf("Create: %v", err) diff --git a/backend/internal/adapters/runtime/zellij/zellij_test.go b/backend/internal/adapters/runtime/zellij/zellij_test.go index 784cda43..38420ae6 100644 --- a/backend/internal/adapters/runtime/zellij/zellij_test.go +++ b/backend/internal/adapters/runtime/zellij/zellij_test.go @@ -96,7 +96,11 @@ func TestCommandBuilders(t *testing.T) { if got, want := listPanesArgs("sess-1"), []string{"--session", "sess-1", "action", "list-panes", "--all", "--json"}; !reflect.DeepEqual(got, want) { t.Fatalf("listPanesArgs = %#v, want %#v", got, want) } - if got, want := pasteArgs("sess-1", "terminal_0", "hello"), []string{"--session", "sess-1", "action", "paste", "--pane-id", "terminal_0", "hello"}; !reflect.DeepEqual(got, want) { + pasteAction := "paste" + if runtime.GOOS == "windows" { + pasteAction = "write-chars" + } + if got, want := pasteArgs("sess-1", "terminal_0", "hello"), []string{"--session", "sess-1", "action", pasteAction, "--pane-id", "terminal_0", "hello"}; !reflect.DeepEqual(got, want) { t.Fatalf("pasteArgs = %#v, want %#v", got, want) } if got, want := dumpScreenArgs("sess-1", "terminal_0"), []string{"--session", "sess-1", "action", "dump-screen", "--pane-id", "terminal_0", "--full"}; !reflect.DeepEqual(got, want) { @@ -201,6 +205,18 @@ func TestBuildLayoutExportsEnvAndKeepsPaneAlive(t *testing.T) { "ODD": "can't", "PATH": "/custom/bin:/usr/bin", }}, "/bin/zsh") + if runtime.GOOS == "windows" { + for _, want := range []string{ + `cwd "/tmp/ws"`, + `pane command="ao" name="agent" borderless=true`, + `args "run"`, + } { + if !strings.Contains(got, want) { + t.Fatalf("direct windows layout missing %q in %q", want, got) + } + } + return + } for _, want := range []string{ `cwd "/tmp/ws"`, @@ -229,13 +245,22 @@ func TestBuildLayoutUsesPowerShellLaunchOnWindowsShells(t *testing.T) { got := buildLayout(ports.RuntimeConfig{WorkspacePath: `C:\ws`, Argv: []string{"Write-Host", "ready"}, Env: map[string]string{ "AO_SESSION_ID": "sess-1", }}, `C:\Program Files\PowerShell\7\pwsh.exe`) + if runtime.GOOS == "windows" { + for _, want := range []string{ + `cwd "C:\\ws"`, + `pane command="Write-Host" name="agent" borderless=true`, + `args "ready"`, + } { + if !strings.Contains(got, want) { + t.Fatalf("direct windows layout missing %q in %q", want, got) + } + } + return + } for _, want := range []string{ `pane command="C:\\Program Files\\PowerShell\\7\\pwsh.exe" name="agent" borderless=true`, - `args "-NoLogo" "-NoProfile" "-NoExit" "-Command"`, - "$env:AO_SESSION_ID = 'sess-1';", - "$env:PATH = ", - "& 'Write-Host' 'ready'", + `args "-NoLogo" "-NoProfile" "-NoExit" "-EncodedCommand"`, } { if !strings.Contains(got, want) { t.Fatalf("powershell layout missing %q in %q", want, got) @@ -253,6 +278,18 @@ func TestBuildLayoutUsesCmdLaunchOnCmdShells(t *testing.T) { got := buildLayout(ports.RuntimeConfig{WorkspacePath: `C:\ws`, Argv: []string{"echo", "ready"}, Env: map[string]string{ "AO_SESSION_ID": "sess-1", }}, `C:\Windows\System32\cmd.exe`) + if runtime.GOOS == "windows" { + for _, want := range []string{ + `cwd "C:\\ws"`, + `pane command="echo" name="agent" borderless=true`, + `args "ready"`, + } { + if !strings.Contains(got, want) { + t.Fatalf("direct windows layout missing %q in %q", want, got) + } + } + return + } for _, want := range []string{ `pane command="C:\\Windows\\System32\\cmd.exe" name="agent" borderless=true`, @@ -281,7 +318,18 @@ func TestCreateRejectsInvalidEnvKeys(t *testing.T) { } func TestCreateStartsSessionAndDiscoversPane(t *testing.T) { - fr := &fakeRunner{outputs: [][]byte{[]byte("zellij 0.44.3"), nil, nil, []byte(`[{"id":0,"is_plugin":true,"title":"zellij:tab-bar"},{"id":3,"is_plugin":false,"title":"agent"}]`)}} + panesOut := []byte(`[{"id":0,"is_plugin":true,"title":"zellij:tab-bar"},{"id":3,"is_plugin":false,"title":"agent"}]`) + outputs := [][]byte{ + []byte("zellij 0.44.3"), + nil, + nil, + panesOut, + } + if runtime.GOOS == "windows" { + outputs = append(outputs, panesOut) + } + outputs = append(outputs, []byte("sess-1 [Created 1s ago] \n")) + fr := &fakeRunner{outputs: outputs} r := New(Options{Binary: "zellij-test", Timeout: time.Second, Shell: "/bin/zsh", SocketDir: "/tmp/zj", ConfigDir: "/tmp/cfg"}) r.runner = fr @@ -297,8 +345,12 @@ func TestCreateStartsSessionAndDiscoversPane(t *testing.T) { if handle != (ports.RuntimeHandle{ID: "sess-1/terminal_3"}) { t.Fatalf("handle = %+v, want zellij handle", handle) } - if len(fr.calls) != 4 { - t.Fatalf("calls = %d, want 4", len(fr.calls)) + wantCalls := 5 + if runtime.GOOS == "windows" { + wantCalls = 6 + } + if len(fr.calls) != wantCalls { + t.Fatalf("calls = %d, want %d", len(fr.calls), wantCalls) } if got, want := fr.calls[0].args, []string{"--config-dir", "/tmp/cfg", "--version"}; !reflect.DeepEqual(got, want) { t.Fatalf("version args = %#v, want %#v", got, want) @@ -312,18 +364,34 @@ func TestCreateStartsSessionAndDiscoversPane(t *testing.T) { if got := fr.calls[3].args; !reflect.DeepEqual(got, append([]string{"--config-dir", "/tmp/cfg"}, listPanesArgs("sess-1")...)) { t.Fatalf("list panes args = %#v", got) } + listSessionsCall := 4 + if runtime.GOOS == "windows" { + if got := fr.calls[4].args; !reflect.DeepEqual(got, append([]string{"--config-dir", "/tmp/cfg"}, listPanesArgs("sess-1")...)) { + t.Fatalf("ready list panes args = %#v", got) + } + listSessionsCall = 5 + } + if got := fr.calls[listSessionsCall].args; !reflect.DeepEqual(got, append([]string{"--config-dir", "/tmp/cfg"}, listSessionsArgs()...)) { + t.Fatalf("list sessions args = %#v", got) + } if got, want := fr.calls[0].env, expectedZellijEnv("/tmp/zj"); !reflect.DeepEqual(got, want) { t.Fatalf("env = %#v, want %#v", got, want) } } func TestCreateClearsStaleSessionBeforeCreating(t *testing.T) { - fr := &fakeRunner{outputs: [][]byte{ + panesOut := []byte(`[{"id":1,"is_plugin":false,"title":"agent"}]`) + outputs := [][]byte{ []byte("zellij 0.44.3"), nil, nil, - []byte(`[{"id":1,"is_plugin":false,"title":"agent"}]`), - }} + panesOut, + } + if runtime.GOOS == "windows" { + outputs = append(outputs, panesOut) + } + outputs = append(outputs, []byte("sess-1 [Created 1s ago] \n")) + fr := &fakeRunner{outputs: outputs} r := New(Options{Binary: "zellij-test", Timeout: time.Second, Shell: "/bin/zsh"}) r.runner = fr @@ -335,8 +403,12 @@ func TestCreateClearsStaleSessionBeforeCreating(t *testing.T) { t.Fatalf("Create: %v", err) } - if len(fr.calls) != 4 { - t.Fatalf("calls = %d, want 4", len(fr.calls)) + wantCalls := 5 + if runtime.GOOS == "windows" { + wantCalls = 6 + } + if len(fr.calls) != wantCalls { + t.Fatalf("calls = %d, want %d", len(fr.calls), wantCalls) } if got, want := fr.calls[1].args, []string{"delete-session", "--force", "sess-1"}; !reflect.DeepEqual(got, want) { t.Fatalf("delete args = %#v, want %#v", got, want) @@ -348,16 +420,14 @@ func TestCreateClearsStaleSessionBeforeCreating(t *testing.T) { func TestAttachCommandDisablesPaneFrames(t *testing.T) { r := New(Options{}) - args, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0"}) + args, _, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0"}) if err != nil { t.Fatalf("AttachCommand: %v", err) } if runtime.GOOS == "windows" { - joined := strings.Join(args, " ") - for _, want := range []string{"--pane-frames", "false"} { - if !strings.Contains(joined, want) { - t.Fatalf("windows attach command missing %q: %#v", want, args) - } + want := []string{r.binary, "attach", "sess-1", "options", "--pane-frames", "false"} + if !reflect.DeepEqual(args, want) { + t.Fatalf("AttachCommand = %#v, want %#v", args, want) } return } @@ -369,16 +439,13 @@ func TestAttachCommandDisablesPaneFrames(t *testing.T) { func TestAttachCommandUsesSocketDir(t *testing.T) { r := New(Options{SocketDir: "/tmp/zj"}) - args, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0"}) + args, _, err := r.AttachCommand(ports.RuntimeHandle{ID: "sess-1/terminal_0"}) if err != nil { t.Fatalf("AttachCommand: %v", err) } if runtime.GOOS == "windows" { - if len(args) < 4 || args[0] != "powershell.exe" { - t.Fatalf("attach command = %#v, want powershell wrapper", args) - } - if !strings.Contains(strings.Join(args, " "), "ZELLIJ_SOCKET_DIR") { - t.Fatalf("attach command missing socket dir env: %#v", args) + if got, want := args[0], r.binary; got != want { + t.Fatalf("attach binary = %q, want %q", got, want) } return } @@ -480,6 +547,20 @@ func TestGetOutputTrimsLines(t *testing.T) { } } +func TestGetOutputTrimsTrailingScreenPaddingBeforeTailing(t *testing.T) { + fr := &fakeRunner{outputs: [][]byte{[]byte("ready\nprompt> echo hi\nhi\n\n\n\n")}} + r := New(Options{Timeout: time.Second}) + r.runner = fr + + out, err := r.GetOutput(context.Background(), ports.RuntimeHandle{ID: "sess-1/terminal_0"}, 2) + if err != nil { + t.Fatalf("GetOutput: %v", err) + } + if out != "prompt> echo hi\nhi\n" { + t.Fatalf("output = %q, want last non-padding lines", out) + } +} + func TestIsAliveParsesNoFormattingOutput(t *testing.T) { fr := &fakeRunner{outputs: [][]byte{[]byte("sess-1 [Created 1s ago] \nold [Created 2s ago] (EXITED - attach to resurrect)\n")}} r := New(Options{Timeout: time.Second}) @@ -596,6 +677,14 @@ func (f *fakeRunner) Run(_ context.Context, env []string, name string, args ...s return out, nil } +func (f *fakeRunner) Start(env []string, name string, args ...string) error { + f.calls = append(f.calls, runnerCall{env: append([]string(nil), env...), name: name, args: append([]string(nil), args...)}) + if len(f.outputs) > 0 { + f.outputs = f.outputs[1:] + } + return f.err +} + func TestCommandErrorUnwraps(t *testing.T) { base := errors.New("base") err := commandError{err: base, output: "details"} diff --git a/backend/internal/agentlaunch/spec.go b/backend/internal/agentlaunch/spec.go new file mode 100644 index 00000000..f3a16190 --- /dev/null +++ b/backend/internal/agentlaunch/spec.go @@ -0,0 +1,57 @@ +// Package agentlaunch persists the exact argv a runtime should execute. +package agentlaunch + +import ( + "encoding/json" + "fmt" + "os" +) + +// EnvSpecPath is the environment variable that holds the path to the launch spec file. +const EnvSpecPath = "AO_LAUNCH_SPEC" + +// Spec describes the agent process the launcher trampoline should exec. +type Spec struct { + WorkspacePath string `json:"workspacePath"` + Argv []string `json:"argv"` + FallbackArgv []string `json:"fallbackArgv,omitempty"` +} + +// WriteTemp serialises spec to a temporary JSON file and returns its path. +func WriteTemp(spec Spec) (string, error) { + file, err := os.CreateTemp(os.TempDir(), "ao-launch-*.json") + if err != nil { + return "", fmt.Errorf("create launch spec: %w", err) + } + path := file.Name() + enc := json.NewEncoder(file) + enc.SetEscapeHTML(false) + if err := enc.Encode(spec); err != nil { + _ = file.Close() + _ = os.Remove(path) + return "", fmt.Errorf("write launch spec: %w", err) + } + if err := file.Close(); err != nil { + _ = os.Remove(path) + return "", fmt.Errorf("close launch spec: %w", err) + } + return path, nil +} + +// ReadAndRemove reads and deletes the spec file at path, returning its contents. +func ReadAndRemove(path string) (Spec, error) { + raw, err := os.ReadFile(path) + if err != nil { + return Spec{}, fmt.Errorf("read launch spec: %w", err) + } + _ = os.Remove(path) + + var spec Spec + if err := json.Unmarshal(raw, &spec); err != nil { + return Spec{}, fmt.Errorf("parse launch spec: %w", err) + } + if len(spec.Argv) == 0 { + return Spec{}, fmt.Errorf("launch spec: argv is required") + } + return spec, nil +} diff --git a/backend/internal/cli/launch.go b/backend/internal/cli/launch.go new file mode 100644 index 00000000..db30300a --- /dev/null +++ b/backend/internal/cli/launch.go @@ -0,0 +1,84 @@ +package cli + +import ( + "context" + "errors" + "fmt" + "os" + "os/exec" + "runtime" + "strings" + + "github.com/spf13/cobra" + + "github.com/aoagents/agent-orchestrator/backend/internal/agentlaunch" +) + +func newLaunchCommand(ctx *commandContext) *cobra.Command { + return &cobra.Command{ + Use: "launch", + Short: "Launch an AO-managed agent process (internal)", + Hidden: true, + Args: noArgs, + RunE: func(cmd *cobra.Command, _ []string) error { + return ctx.launchAgent(cmd.Context()) + }, + } +} + +func (c *commandContext) launchAgent(ctx context.Context) error { + specPath := strings.TrimSpace(os.Getenv(agentlaunch.EnvSpecPath)) + if specPath == "" { + return errors.New("launch: AO_LAUNCH_SPEC is required") + } + spec, err := agentlaunch.ReadAndRemove(specPath) + if err != nil { + return fmt.Errorf("launch: %w", err) + } + + env := withoutLaunchSpecEnv(os.Environ()) + launchErr := c.runLaunchCommand(ctx, spec.WorkspacePath, spec.Argv, env) + if len(spec.FallbackArgv) == 0 { + return launchErr + } + if launchErr != nil { + _, _ = fmt.Fprintf(c.deps.Err, "\r\n[ao launch] agent process exited: %v\r\n", launchErr) + } + return c.runLaunchCommand(ctx, spec.WorkspacePath, spec.FallbackArgv, env) +} + +func (c *commandContext) runLaunchCommand(ctx context.Context, dir string, argv, env []string) error { + if len(argv) == 0 { + return errors.New("launch: command argv is required") + } + cmd := exec.CommandContext(ctx, argv[0], argv[1:]...) + cmd.Dir = dir + cmd.Env = env + cmd.Stdin = c.deps.In + cmd.Stdout = c.deps.Out + cmd.Stderr = c.deps.Err + return cmd.Run() +} + +func withoutLaunchSpecEnv(env []string) []string { + cleaned := env[:0] + for _, pair := range env { + key, _, ok := strings.Cut(pair, "=") + if !ok { + cleaned = append(cleaned, pair) + continue + } + if envKeyEqual(key, agentlaunch.EnvSpecPath) { + continue + } + cleaned = append(cleaned, pair) + } + return cleaned +} + +func envKeyEqual(a, b string) bool { + if runtime.GOOS == "windows" { + return strings.EqualFold(a, b) + } + return a == b +} diff --git a/backend/internal/cli/root.go b/backend/internal/cli/root.go index f0198fee..b24a236d 100644 --- a/backend/internal/cli/root.go +++ b/backend/internal/cli/root.go @@ -168,6 +168,7 @@ func NewRootCommand(deps Deps) *cobra.Command { root.AddCommand(newSpawnCommand(ctx)) root.AddCommand(newSendCommand(ctx)) root.AddCommand(newHooksCommand(ctx)) + root.AddCommand(newLaunchCommand(ctx)) root.AddCommand(newImportCommand(ctx)) root.AddCommand(newProjectCommand(ctx)) root.AddCommand(newSessionCommand(ctx)) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 747f5251..32dd4ee5 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "log/slog" + "net/http" "os" "os/signal" "syscall" @@ -32,12 +33,17 @@ func Run() error { log := newLogger() - // Fail fast if a live daemon already owns the handshake file. A run-file - // left by a crashed predecessor (dead PID) is treated as stale and - // overwritten when the new server starts. + // Fail fast only if a daemon is genuinely still serving the recorded port. + // CheckStale confirms the run-file's PID is alive, but that alone is not + // proof a predecessor owns the port: the file leaks when the daemon is hard + // killed without a graceful shutdown (the norm on Windows, where the desktop + // supervisor can only TerminateProcess it), and Windows reuses the recorded + // PID for unrelated processes. So a "live" PID is verified against an actual + // /healthz probe; a run-file left by a crashed/hard-killed/reused-PID + // predecessor is treated as stale and overwritten when the new server starts. if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { return fmt.Errorf("inspect run-file: %w", err) - } else if live != nil { + } else if live != nil && runFileOwnerServing(&http.Client{Timeout: staleProbeTimeout}, config.LoopbackHost, live) { return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) } diff --git a/backend/internal/daemon/stale.go b/backend/internal/daemon/stale.go new file mode 100644 index 00000000..e8f7963c --- /dev/null +++ b/backend/internal/daemon/stale.go @@ -0,0 +1,58 @@ +package daemon + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// staleProbeTimeout bounds the startup ownership probe so a run-file pointing at +// an unreachable port cannot stall daemon startup. +const staleProbeTimeout = 2 * time.Second + +// runFileOwnerServing reports whether an AO daemon matching info is actually +// serving on the recorded loopback port. +// +// runfile.CheckStale only confirms the recorded PID is alive, which is not +// enough to conclude a predecessor still owns the port. On Windows the desktop +// supervisor can only TerminateProcess the daemon (no POSIX signal reaches it), +// so the daemon's graceful shutdown never runs and running.json is never +// removed; the leaked file then survives into the next launch. Because Windows +// reuses PIDs aggressively, the recorded PID routinely belongs to an unrelated +// process, making the PID-only check report "alive" for a daemon that is long +// gone — which is what made the daemon refuse to start (issue #256). +// +// Probing /healthz and matching both the service name and the PID is the ground +// truth that a real predecessor is still listening. When it is not, the +// run-file is stale and the caller should overwrite it instead of refusing. +func runFileOwnerServing(client *http.Client, host string, info *runfile.Info) bool { + if info == nil || info.Port <= 0 { + return false + } + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s:%d/healthz", host, info.Port), http.NoBody) + if err != nil { + return false + } + resp, err := client.Do(req) + if err != nil { + return false + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return false + } + + var body struct { + Service string `json:"service"` + PID int `json:"pid"` + } + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return false + } + return body.Service == daemonmeta.ServiceName && body.PID == info.PID +} diff --git a/backend/internal/daemon/stale_test.go b/backend/internal/daemon/stale_test.go new file mode 100644 index 00000000..79939ceb --- /dev/null +++ b/backend/internal/daemon/stale_test.go @@ -0,0 +1,116 @@ +package daemon + +import ( + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "strconv" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// healthzBody returns a handler that answers /healthz with the given service +// name and pid, mimicking the daemon's real liveness probe. +func healthzBody(service string, pid int) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/healthz" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, service, pid) + } +} + +func hostPort(t *testing.T, rawURL string) (string, int) { + t.Helper() + u, err := url.Parse(rawURL) + if err != nil { + t.Fatalf("parse %q: %v", rawURL, err) + } + port, err := strconv.Atoi(u.Port()) + if err != nil { + t.Fatalf("port from %q: %v", rawURL, err) + } + return u.Hostname(), port +} + +func TestRunFileOwnerServing(t *testing.T) { + const pid = 4242 + + tests := []struct { + name string + handler http.HandlerFunc + want bool + }{ + { + name: "matching service and pid is the live owner", + handler: healthzBody(daemonmeta.ServiceName, pid), + want: true, + }, + { + name: "reused pid: same port, different process pid", + handler: healthzBody(daemonmeta.ServiceName, pid+1), + want: false, + }, + { + name: "foreign service occupying the port", + handler: healthzBody("some-other-service", pid), + want: false, + }, + { + name: "non-2xx response", + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + }, + want: false, + }, + { + name: "unparseable body", + handler: func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("not json")) + }, + want: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + srv := httptest.NewServer(tc.handler) + defer srv.Close() + host, port := hostPort(t, srv.URL) + + got := runFileOwnerServing(srv.Client(), host, &runfile.Info{PID: pid, Port: port}) + if got != tc.want { + t.Errorf("runFileOwnerServing = %v, want %v", got, tc.want) + } + }) + } +} + +func TestRunFileOwnerServingNoListener(t *testing.T) { + // Bind then immediately close to obtain a port nothing is listening on, so + // the probe hits a refused connection — the leaked-run-file case. + srv := httptest.NewServer(http.NotFoundHandler()) + host, port := hostPort(t, srv.URL) + srv.Close() + + client := &http.Client{Timeout: time.Second} + if runFileOwnerServing(client, host, &runfile.Info{PID: 4242, Port: port}) { + t.Error("runFileOwnerServing on a dead port = true, want false (stale, safe to overwrite)") + } +} + +func TestRunFileOwnerServingNilOrZeroPort(t *testing.T) { + client := &http.Client{Timeout: time.Second} + if runFileOwnerServing(client, "127.0.0.1", nil) { + t.Error("runFileOwnerServing(nil) = true, want false") + } + if runFileOwnerServing(client, "127.0.0.1", &runfile.Info{PID: 1, Port: 0}) { + t.Error("runFileOwnerServing(port 0) = true, want false") + } +} diff --git a/backend/internal/httpd/terminal_mux_test.go b/backend/internal/httpd/terminal_mux_test.go index 01c05695..e0c9929a 100644 --- a/backend/internal/httpd/terminal_mux_test.go +++ b/backend/internal/httpd/terminal_mux_test.go @@ -28,9 +28,9 @@ type stubSource struct { attached atomic.Bool } -func (s *stubSource) AttachCommand(ports.RuntimeHandle) ([]string, error) { +func (s *stubSource) AttachCommand(ports.RuntimeHandle) ([]string, []string, error) { s.attached.Store(true) - return s.argv, nil + return s.argv, nil, nil } func (s *stubSource) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { diff --git a/backend/internal/terminal/attachment.go b/backend/internal/terminal/attachment.go index 682f05eb..ab03762e 100644 --- a/backend/internal/terminal/attachment.go +++ b/backend/internal/terminal/attachment.go @@ -12,12 +12,14 @@ import ( ) // PTYSource is what a terminal needs from the runtime: the argv that attaches a -// PTY to a session's pane, and a liveness check used to decide whether a dropped -// PTY should be re-attached or treated as a clean exit. The Zellij runtime adapter -// satisfies this via AttachCommand/IsAlive; the interface lives here, next to its -// only consumer, so terminal does not depend on a concrete adapter. +// PTY to a session's pane (plus any env that argv needs but is not in the +// daemon's process env — e.g. a per-session ZELLIJ_SOCKET_DIR on Windows), +// and a liveness check used to decide whether a dropped PTY should be +// re-attached or treated as a clean exit. The Zellij runtime adapter +// satisfies this via AttachCommand/IsAlive; the interface lives here, next to +// its only consumer, so terminal does not depend on a concrete adapter. type PTYSource interface { - AttachCommand(handle ports.RuntimeHandle) ([]string, error) + AttachCommand(handle ports.RuntimeHandle) (argv []string, env []string, err error) IsAlive(ctx context.Context, handle ports.RuntimeHandle) (bool, error) } @@ -35,8 +37,10 @@ type ptyProcess interface { // matters: the attach process reads the tty size once at startup, and sizing // the PTY only after exec relies on SIGWINCH delivery that can race the // process installing its handler — a missed signal leaves the zellij client -// laid out for a stale size. ctx cancellation must terminate the process. -type spawnFunc func(ctx context.Context, argv []string, rows, cols uint16) (ptyProcess, error) +// laid out for a stale size. env, when non-nil, is the full env block for the +// child (mirrors exec.Cmd.Env: nil means inherit). ctx cancellation must +// terminate the process. +type spawnFunc func(ctx context.Context, argv []string, env []string, rows, cols uint16) (ptyProcess, error) // reattach policy: a PTY that drops is re-attached while the underlying Zellij // session is still alive, up to maxReattach consecutive failures. An attach that @@ -134,13 +138,13 @@ func (a *attachment) run(ctx context.Context) { return } - argv, err := a.src.AttachCommand(a.handle) + argv, env, err := a.src.AttachCommand(a.handle) if err != nil { a.fail("attach command: " + err.Error()) return } rows, cols := a.size() - p, err := a.spawn(ctx, argv, rows, cols) + p, err := a.spawn(ctx, argv, env, rows, cols) if err != nil { failures++ if failures > a.maxReattach { diff --git a/backend/internal/terminal/fakes_test.go b/backend/internal/terminal/fakes_test.go index 2ff0688f..c0818f3d 100644 --- a/backend/internal/terminal/fakes_test.go +++ b/backend/internal/terminal/fakes_test.go @@ -19,14 +19,14 @@ type fakeSource struct { attachErr error } -func (f *fakeSource) AttachCommand(ports.RuntimeHandle) ([]string, error) { +func (f *fakeSource) AttachCommand(ports.RuntimeHandle) ([]string, []string, error) { if f.attachErr != nil { - return nil, f.attachErr + return nil, nil, f.attachErr } if f.argv == nil { - return []string{"zellij", "attach"}, nil + return []string{"zellij", "attach"}, nil, nil } - return f.argv, nil + return f.argv, nil, nil } func (f *fakeSource) IsAlive(context.Context, ports.RuntimeHandle) (bool, error) { @@ -119,7 +119,7 @@ type fakeSpawner struct { sizes [][2]uint16 // rows×cols passed to each spawn call, in order } -func (f *fakeSpawner) spawn(_ context.Context, _ []string, rows, cols uint16) (ptyProcess, error) { +func (f *fakeSpawner) spawn(_ context.Context, _ []string, _ []string, rows, cols uint16) (ptyProcess, error) { f.mu.Lock() defer f.mu.Unlock() if f.err != nil { diff --git a/backend/internal/terminal/pty_unix.go b/backend/internal/terminal/pty_unix.go index 6f173094..390a79b9 100644 --- a/backend/internal/terminal/pty_unix.go +++ b/backend/internal/terminal/pty_unix.go @@ -18,13 +18,18 @@ import ( // birth when a size is known: `zellij attach` reads the tty size once at // startup, and a post-spawn TIOCSWINSZ depends on SIGWINCH delivery that can // race the client installing its handler — StartWithSize makes the first read -// correct by construction. ctx cancellation kills the process. Windows uses a -// stub (see pty_windows.go) until a ConPTY path is added. -func defaultSpawn(ctx context.Context, argv []string, rows, cols uint16) (ptyProcess, error) { +// correct by construction. env, when non-nil, replaces the inherited +// environment (mirrors exec.Cmd.Env semantics). ctx cancellation kills the +// process. Windows uses a stub (see pty_windows.go) until a ConPTY path is +// added. +func defaultSpawn(ctx context.Context, argv, env []string, rows, cols uint16) (ptyProcess, error) { if len(argv) == 0 { return nil, errors.New("terminal: empty attach command") } cmd := exec.CommandContext(ctx, argv[0], argv[1:]...) + if env != nil { + cmd.Env = env + } var f *os.File var err error if rows > 0 && cols > 0 { diff --git a/backend/internal/terminal/pty_unix_test.go b/backend/internal/terminal/pty_unix_test.go index 76e26337..9a8bd9a0 100644 --- a/backend/internal/terminal/pty_unix_test.go +++ b/backend/internal/terminal/pty_unix_test.go @@ -14,7 +14,7 @@ import ( // exactly once. Without the sync.Once a second Wait blocks forever, so this test // would hang (caught by the watchdog) rather than fail. func TestCreackPTYCloseIsIdempotent(t *testing.T) { - p, err := defaultSpawn(context.Background(), []string{"/bin/sh", "-c", "sleep 30"}, 0, 0) + p, err := defaultSpawn(context.Background(), []string{"/bin/sh", "-c", "sleep 30"}, nil, 0, 0) if err != nil { t.Fatalf("spawn: %v", err) } @@ -41,7 +41,7 @@ func TestCreackPTYCloseIsIdempotent(t *testing.T) { // the pane" desync. func TestCreackPTYResizeSignalsOnIdenticalSize(t *testing.T) { p, err := defaultSpawn(context.Background(), - []string{"/bin/sh", "-c", `trap 'echo WINCHED' WINCH; while :; do sleep 0.05; done`}, 0, 0) + []string{"/bin/sh", "-c", `trap 'echo WINCHED' WINCH; while :; do sleep 0.05; done`}, nil, 0, 0) if err != nil { t.Fatalf("spawn: %v", err) } @@ -82,7 +82,7 @@ func TestCreackPTYResizeSignalsOnIdenticalSize(t *testing.T) { // races the client installing its WINCH handler (a missed signal strands the // zellij session at the previous client's size). func TestCreackPTYSpawnsAtRequestedSize(t *testing.T) { - p, err := defaultSpawn(context.Background(), []string{"/bin/sh", "-c", "stty size"}, 40, 140) + p, err := defaultSpawn(context.Background(), []string{"/bin/sh", "-c", "stty size"}, nil, 40, 140) if err != nil { t.Fatalf("spawn: %v", err) } @@ -113,7 +113,7 @@ func TestCreackPTYSpawnsAtRequestedSize(t *testing.T) { func TestCreackPTYCloseTermsBeforeKill(t *testing.T) { t.Run("cooperative process exits within the grace", func(t *testing.T) { p, err := defaultSpawn(context.Background(), - []string{"/bin/sh", "-c", `trap 'exit 0' TERM; while :; do sleep 0.05; done`}, 0, 0) + []string{"/bin/sh", "-c", `trap 'exit 0' TERM; while :; do sleep 0.05; done`}, nil, 0, 0) if err != nil { t.Fatalf("spawn: %v", err) } @@ -127,7 +127,7 @@ func TestCreackPTYCloseTermsBeforeKill(t *testing.T) { t.Run("TERM-ignoring process is killed after the grace", func(t *testing.T) { p, err := defaultSpawn(context.Background(), - []string{"/bin/sh", "-c", `trap '' TERM; while :; do sleep 0.05; done`}, 0, 0) + []string{"/bin/sh", "-c", `trap '' TERM; while :; do sleep 0.05; done`}, nil, 0, 0) if err != nil { t.Fatalf("spawn: %v", err) } diff --git a/backend/internal/terminal/pty_windows.go b/backend/internal/terminal/pty_windows.go index 6d8107f3..5a83400f 100644 --- a/backend/internal/terminal/pty_windows.go +++ b/backend/internal/terminal/pty_windows.go @@ -5,11 +5,86 @@ package terminal import ( "context" "errors" + "sync" + "time" + + winpty "github.com/aymanbagabas/go-pty" ) -// defaultSpawn is not implemented on Windows: the POSIX PTY path uses -// creack/pty. The rest of the package compiles and tests on Windows with an -// injected spawner. -func defaultSpawn(_ context.Context, _ []string, _, _ uint16) (ptyProcess, error) { - return nil, errors.New("terminal: PTY streaming is not supported on Windows yet") +// detachGrace mirrors the Unix value: how long Close waits for the attach +// process to exit on its own (closing the ConPTY surfaces as EOF on the +// child's stdin) before falling back to Kill. +const detachGrace = 250 * time.Millisecond + +// defaultSpawn starts argv on a Windows ConPTY and exposes the console pipes +// through the same ptyProcess interface used by the Unix creack/pty path. +// go-pty creates the pseudo-console at 80x25 internally, so we only Resize +// when the caller actually has a grid (mirroring StartWithSize on Unix). +// env, when non-nil, replaces the inherited environment via Win32's native +// CreateProcess env block (mirrors exec.Cmd.Env semantics) — this is how a +// per-session ZELLIJ_SOCKET_DIR reaches the zellij attach client. +func defaultSpawn(ctx context.Context, argv []string, env []string, rows, cols uint16) (ptyProcess, error) { + if len(argv) == 0 { + return nil, errors.New("terminal: empty attach command") + } + pty, err := winpty.New() + if err != nil { + return nil, err + } + if rows > 0 && cols > 0 { + if err := pty.Resize(int(cols), int(rows)); err != nil { + _ = pty.Close() + return nil, err + } + } + cmd := pty.CommandContext(ctx, argv[0], argv[1:]...) + if env != nil { + cmd.Env = env + } + if err := cmd.Start(); err != nil { + _ = pty.Close() + return nil, err + } + + p := &conPTYProcess{pty: pty, cmd: cmd, waitDone: make(chan struct{})} + go func() { + _ = cmd.Wait() + close(p.waitDone) + }() + return p, nil +} + +type conPTYProcess struct { + pty winpty.Pty + cmd *winpty.Cmd + waitDone chan struct{} + closeOnce sync.Once +} + +func (p *conPTYProcess) Read(b []byte) (int, error) { return p.pty.Read(b) } +func (p *conPTYProcess) Write(b []byte) (int, error) { return p.pty.Write(b) } + +func (p *conPTYProcess) Resize(rows, cols uint16) error { + if rows == 0 || cols == 0 { + return nil + } + return p.pty.Resize(int(cols), int(rows)) +} + +// Close stops the attach process and releases the ConPTY. Closing the pty +// signals EOF to the child; if it does not exit within detachGrace we fall +// back to Kill. Idempotent via closeOnce. +func (p *conPTYProcess) Close() error { + p.closeOnce.Do(func() { + _ = p.pty.Close() + select { + case <-p.waitDone: + case <-time.After(detachGrace): + if p.cmd.Process != nil { + _ = p.cmd.Process.Kill() + } + <-p.waitDone + } + }) + return nil } diff --git a/backend/internal/terminal/pty_windows_test.go b/backend/internal/terminal/pty_windows_test.go new file mode 100644 index 00000000..ee8c7d61 --- /dev/null +++ b/backend/internal/terminal/pty_windows_test.go @@ -0,0 +1,99 @@ +//go:build windows + +package terminal + +import ( + "bytes" + "context" + "errors" + "io" + "strings" + "testing" + "time" +) + +func TestDefaultSpawnWindowsStreamsOutput(t *testing.T) { + p, err := defaultSpawn(context.Background(), []string{"cmd.exe", "/D", "/Q", "/K"}, nil, 24, 80) + if err != nil { + t.Fatalf("defaultSpawn: %v", err) + } + defer p.Close() + if _, err := p.Write([]byte("echo AO_CONPTY_OK\r\n")); err != nil { + t.Fatalf("write PTY: %v", err) + } + + out := readPTYUntil(t, p, "AO_CONPTY_OK", 5*time.Second) + if !strings.Contains(out, "AO_CONPTY_OK") { + t.Fatalf("output %q does not contain marker", out) + } +} + +func TestDefaultSpawnWindowsRejectsEmptyCommand(t *testing.T) { + _, err := defaultSpawn(context.Background(), nil, nil, 0, 0) + if err == nil || !strings.Contains(err.Error(), "empty attach command") { + t.Fatalf("expected empty attach command error, got %v", err) + } +} + +func TestConPTYCloseIsIdempotent(t *testing.T) { + p, err := defaultSpawn(context.Background(), []string{"cmd.exe", "/D", "/Q", "/K"}, nil, 24, 80) + if err != nil { + t.Fatalf("defaultSpawn: %v", err) + } + + done := make(chan struct{}) + go func() { + _ = p.Close() + _ = p.Close() + close(done) + }() + + select { + case <-done: + case <-time.After(3 * time.Second): + t.Fatal("Close did not return") + } +} + +func readPTYUntil(t *testing.T, p ptyProcess, marker string, timeout time.Duration) string { + t.Helper() + type result struct { + out string + err error + } + results := make(chan result, 1) + go func() { + var buf bytes.Buffer + tmp := make([]byte, 4096) + for { + n, err := p.Read(tmp) + if n > 0 { + buf.Write(tmp[:n]) + if strings.Contains(buf.String(), marker) { + results <- result{out: buf.String()} + return + } + } + if err != nil { + if errors.Is(err, io.EOF) { + results <- result{out: buf.String()} + } else { + results <- result{out: buf.String(), err: err} + } + return + } + } + }() + + select { + case res := <-results: + if res.err != nil { + t.Fatalf("read PTY: %v (output %q)", res.err, res.out) + } + return res.out + case <-time.After(timeout): + _ = p.Close() + t.Fatal("timed out reading PTY output") + return "" + } +}