diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 833b558..c26288d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,8 +1,9 @@ name: ci -# Lint + test the solx CLI on every push to main and every PR. The agent -# skill itself is prose + references (no build step); its evals run out of -# band (see DEVELOPMENT.md), so CI guards the code that ships as an artifact. +# Lint, test, and build the solx binary on every push to main and every PR. +# The agent skill itself is prose + references (no build step); its evals run +# out of band (see DEVELOPMENT.md), so CI guards the code that ships as the +# release artifact. on: push: @@ -17,80 +18,50 @@ concurrency: cancel-in-progress: true jobs: - test: - name: test + lint (py${{ matrix.python }}) + check: + name: test + lint runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python: ["3.10", "3.11", "3.12", "3.13"] defaults: run: working-directory: solx steps: - uses: actions/checkout@v5 - - - name: Install uv (Python ${{ matrix.python }}) - uses: astral-sh/setup-uv@v7 + - uses: dtolnay/rust-toolchain@stable with: - python-version: ${{ matrix.python }} - enable-cache: true - working-directory: solx # the uv project + lockfile live here - - - name: Sync dependencies (frozen) - run: uv sync --frozen - - - name: Lint - run: uv run --frozen ruff check src tests # pinned via uv.lock (dev group) - + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + with: + workspaces: solx + - name: Format + run: cargo fmt --all --check + - name: Clippy + run: cargo clippy --locked --all-targets -- -D warnings - name: Test - run: uv run --frozen pytest -q + run: cargo test --locked build: - # Build the single-file zipapp and attach it to the run, so a reviewer - # can install and test the PR's solx on Sol without building it: download - # solx.pyz from Checks -> Artifacts, then run it through install.sh (which - # re-stamps the shebang for the local interpreter — see 
DEVELOPMENT.md). - name: build solx.pyz + # Build the portable release binary and attach it to the run, so a + # reviewer can download it from Checks -> Artifacts and test the PR's + # solx on Sol without a Rust toolchain. The musl target links libc + # statically, so the artifact runs on any x86-64 Linux — Sol's RHEL 8 + # included — regardless of the host glibc. runs-on: ubuntu-latest - env: - SOLX_PYTHON: "3.11" # the zipapp's embedded bytecode is 3.11-specific defaults: run: working-directory: solx steps: - uses: actions/checkout@v5 - - - name: Install uv (Python 3.11) - uses: astral-sh/setup-uv@v7 + - uses: dtolnay/rust-toolchain@stable with: - python-version: "3.11" - enable-cache: true - working-directory: solx - - - name: Build zipapp - run: bash scripts/build-pyz.sh - - - name: Smoke-test the artifact and installer - # The build's only check used to be that zipfile could open the - # archive — which tolerates the corruption the 0.5.0 installer - # produced. Actually *run* it: in place, and end-to-end through - # install.sh. The installer rebinds the interpreter, so point it at a - # path whose length differs from the build shebang (a symlink under - # $RUNNER_TEMP) — that is the condition under which an in-place shebang - # swap corrupts the offsets, so a regression here fails the build - # instead of shipping. Same runner, same interpreter, so no fallback. 
- run: | - set -eux - ./dist/solx.pyz --version - ln -sf "$(uv python find 3.11)" "$RUNNER_TEMP/python3.11" - SOLX_INSTALL_DIR="$RUNNER_TEMP/bin" SOLX_PYTHON="$RUNNER_TEMP/python3.11" \ - sh scripts/install.sh ./dist/solx.pyz - "$RUNNER_TEMP/bin/solx" --version - - - name: Upload zipapp + targets: x86_64-unknown-linux-musl + - uses: Swatinem/rust-cache@v2 + with: + workspaces: solx + - name: Build (musl static) + run: cargo build --locked --release --target x86_64-unknown-linux-musl + - name: Upload binary uses: actions/upload-artifact@v4 with: - name: solx-pyz - path: solx/dist/solx.pyz + name: solx-x86_64-linux-musl + path: solx/target/x86_64-unknown-linux-musl/release/solx if-no-files-found: error diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 232137d..2fd1729 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,8 +1,8 @@ name: release -# CLI-first release. A pushed `vX.Y.Z` tag builds the single-file zipapp -# (solx.pyz) and publishes a GitHub Release with it + install.sh attached, so -# curl -fsSL .../releases/latest/download/install.sh | sh +# A pushed `vX.Y.Z` tag builds the static `solx` binary and publishes a +# GitHub Release with it attached, so +# curl -fLo solx .../releases/latest/download/solx-x86_64-unknown-linux-musl # always fetches the build matching the tag. The skill rides the same tag # (one version line — see CHANGELOG.md), installed from the repo tree. @@ -12,7 +12,7 @@ on: workflow_dispatch: inputs: tag: - description: "Existing tag to (re)build a release for, e.g. v0.4.0" + description: "Existing tag to (re)build a release for, e.g. v1.0.0" required: true permissions: @@ -20,23 +20,22 @@ permissions: jobs: release: - name: build .pyz + publish release + name: build binary + publish release runs-on: ubuntu-latest - env: - # Must match build-pyz.sh / install.sh: the embedded bytecode is - # interpreter-specific, so the build and the install shebang agree. 
- SOLX_PYTHON: "3.11" + defaults: + run: + working-directory: solx steps: - uses: actions/checkout@v5 with: ref: ${{ github.event.inputs.tag || github.ref }} - - name: Install uv (Python 3.11) - uses: astral-sh/setup-uv@v7 + - uses: dtolnay/rust-toolchain@stable + with: + targets: x86_64-unknown-linux-musl + - uses: Swatinem/rust-cache@v2 with: - python-version: "3.11" - enable-cache: true - working-directory: solx # the uv project + lockfile live here + workspaces: solx - name: Resolve tag id: tag @@ -47,28 +46,30 @@ jobs: REF_NAME: ${{ github.ref_name }} run: echo "tag=${INPUT_TAG:-$REF_NAME}" >> "$GITHUB_OUTPUT" - - name: Verify the tag matches the one version line (CLI, package, skill) - working-directory: solx + - name: Verify the tag matches the one version line (crate + skill) env: TAG: ${{ steps.tag.outputs.tag }} run: | - uv sync --frozen want="${TAG#v}" - cli="$(uv run --frozen solx --version)" - pkg="$(sed -nE 's/^version = "([^"]+)".*/\1/p' pyproject.toml | head -1)" + crate="$(sed -nE 's/^version = "([^"]+)".*/\1/p' Cargo.toml | head -1)" skill="$(sed -nE 's/^version:[[:space:]]*([^[:space:]]+).*/\1/p' ../skills/sol-skill/SKILL.md | head -1)" - echo "tag=$want solx=$cli pyproject=$pkg SKILL.md=$skill" - if [ "$cli" != "$want" ] || [ "$pkg" != "$want" ] || [ "$skill" != "$want" ]; then - echo "::error::version mismatch — tag=$want solx=$cli pyproject=$pkg SKILL.md=$skill. Bump all three (and uv lock) or retag." >&2 + echo "tag=$want Cargo.toml=$crate SKILL.md=$skill" + if [ "$crate" != "$want" ] || [ "$skill" != "$want" ]; then + echo "::error::version mismatch — tag=$want Cargo.toml=$crate SKILL.md=$skill. Bump both (and the lockfile) or retag." 
>&2 exit 1 fi - - name: Run tests - working-directory: solx - run: uv run --frozen pytest -q + - name: Test (locked) + run: cargo test --locked + + - name: Build (musl static) + run: cargo build --locked --release --target x86_64-unknown-linux-musl - - name: Build single-file zipapp - run: bash solx/scripts/build-pyz.sh + - name: Stage the release asset + run: | + install -m 755 \ + target/x86_64-unknown-linux-musl/release/solx \ + solx-x86_64-unknown-linux-musl - name: Publish GitHub Release env: @@ -77,11 +78,10 @@ jobs: run: | # Create on first run; on a re-run (workflow_dispatch) refresh assets. if gh release view "$TAG" >/dev/null 2>&1; then - gh release upload "$TAG" solx/dist/solx.pyz solx/scripts/install.sh --clobber + gh release upload "$TAG" solx-x86_64-unknown-linux-musl --clobber else gh release create "$TAG" \ --title "$TAG" \ --generate-notes \ - solx/dist/solx.pyz \ - solx/scripts/install.sh + solx-x86_64-unknown-linux-musl fi diff --git a/CHANGELOG.md b/CHANGELOG.md index e0b5f4a..9636cbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,88 @@ This project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and the [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) format. From v0.4.0 the CLI and the skill share **one version line**: each entry's -version matches `solx/src/solx/__init__.py`, the `version` field in -[`skills/sol-skill/SKILL.md`](skills/sol-skill/SKILL.md), and the git tag, -and a pushed `vX.Y.Z` tag builds and publishes the release. +version matches the `version` field in [`solx/Cargo.toml`](solx/Cargo.toml) +and in [`skills/sol-skill/SKILL.md`](skills/sol-skill/SKILL.md), as well as +the git tag; a pushed `vX.Y.Z` tag builds and publishes the release. + +## [1.0.0] — 2026-06-10 + +solx is now a single native binary (Rust); the Python implementation is +retired. 
Every command starts in ~10ms with no Python interpreter and no +per-module NFS reads, so startup no longer degrades under node load or a +cold NFS cache. Install is one static file — download and `chmod +x` — with +no `uv`, no Python, and no toolchain on the box. + +### Highlights + +Startup latency, warm median on a Sol compute node (NFS `$HOME`): + +| command | raw `squeue` | v0.5.0 (Python) | **v1.0 (Rust)** | speedup | +|---|---|---|---|---| +| `solx --version` | — | 0.10s | **0.010s** | 10× | +| `solx job list` | 0.08s | 0.39s | **0.12s** | 3.3× | +| `solx job time` | 0.08s | 0.31s | **0.12s** | 2.6× | + +The binary tracks raw `squeue` — its residual over `squeue` is just the +`squeue` subprocess it spawns — and, unlike the Python builds, its startup +is flat regardless of node load or cache state. ~4.9MB, no runtime +dependencies (no Python, `uv`, or `rustc` on the target). + +### Added + +- **`solx cheatsheet`** — prints the Sol quick reference (SLURM basics, + `solx` ↔ raw SLURM, the partition/QOS table, Sol's `my*`/`show*` + wrappers, laptop tunnels) as text. It's embedded from the skill's single + source `skills/sol-skill/references/cheatsheet.md`, so the CLI, the + rendered [`docs/cheatsheet.pdf`](docs/cheatsheet.pdf), and the skill + reference can't drift. Wired into the bash/zsh/fish completions. +- **The Sol cheat sheet** in the skill — + `skills/sol-skill/references/cheatsheet.md`, with a centered README nav + and a `scripts/build-cheatsheet.sh` PDF build. +- **Eval-harness L3 grader `l3_sbatch_test_only`** — validates an agent's + recommended `#SBATCH` header against the live scheduler (`sbatch + --test-only`), catching partition/QOS combos that read plausibly but the + scheduler rejects (e.g. `-p htc -q debug`). 
+ +### Changed + +- **The CLI is rewritten in Rust** (the `solx/` crate), preserving the + v0.5.0 command surface, output contract, and exit codes; behavioral + parity was verified during the port and is locked going forward by the + crate's test suite (`solx/tests/cli.rs` + unit vectors). The agent + skill's operational guidance is unchanged apart from the install steps, + the dropped `~/.solkeep` fallback (below), and the partition/QOS rework + (next). +- **SLURM partition/QOS guidance reworked.** The skill routes jobs by + wall-time and priority, not CPU-vs-GPU: ≤4h work (GPUs included) → `htc`; + a ≤15-minute urgent check → `-p public -q debug`; longer runs → `public` + (or `general` with `-q private` for preemptible buy-in nodes). This + fixes the "GPU → `public`" reflex that parked short GPU jobs behind + multi-day ones. The Submitting-Jobs section is promoted ahead of storage + and gains a personalized "know your access" step (`sacctmgr show assoc`). + Factual corrections verified against the live scheduler: `htc` carries + H200 nodes; `highmem`'s wall is 7 days; there is no `myquota` wrapper + (use `beegfs-ctl --getquota`); `sq` is the whole-cluster queue, not + `squeue --me`. +- **Install is a prebuilt static binary.** Download + `solx-x86_64-unknown-linux-musl` from the release, `chmod +x`, and drop + it on `PATH`. The `curl install.sh | sh` and `uv tool install` channels + are gone, along with their `uv`/Python requirement. See + [`solx/README.md`](solx/README.md). + +### Removed + +- **The Python implementation.** The Typer-then-`argparse` CLI that lived + at `solx/` — its test suite, the `.pyz` zipapp build (`build-pyz.sh`), + `install.sh`, and the `uv tool` install channel — is deleted. `solx/` + now holds the Rust crate, the only solx; the `.pyz` and `uv` install + channels no longer exist. 
+- **`~/.solkeep` support, end to end.** The config `[keep]` block is now + the only keep-list source: `solx keep` never reads a `~/.solkeep` (the + implicit fallback, deprecated since 0.4.0, was slated for removal in 1.0.0), and the + `solx config import-solkeep` command and the `--solkeep` flag are + removed with it. With no `[keep]` block, `keep` errors and points at + `solx config edit`. ## [0.5.1] — 2026-06-10 @@ -51,7 +130,7 @@ A `solx job` read now costs the same order as a raw SLURM call. Absolute startup over NFS scales with node load — Python pays a per-module open storm, so v0.4.0 can reach ~2.5s under contention — and the win is removing that import tree. On node-local disk the floor is lower still -(`--version` ~0.02s). Full table in `docs/ROADMAP.md`. +(`--version` ~0.02s). ### Upgrading @@ -85,7 +164,7 @@ removing that import tree. On node-local disk the floor is lower still output contract are unchanged apart from the two documented supersets below (`--json` placement and `-h`); verified with `evals/parity/`. - **Startup latency** drops to the order of a raw SLURM call (see - Highlights above; full table in `docs/ROADMAP.md`): removing the + Highlights above): removing the Typer/`click`/`rich` import tree cuts a `solx job` read from seconds to ~0.1–0.4s warm on the NFS `$HOME` install, ~13× / 6.4× / 8.1× over v0.4.0 on `--version` / `job list` / `job time`. On node-local disk the @@ -427,7 +506,8 @@ agentskills.io-compatible layout (skill content under CSV-driven `/scratch` renewal, and shipped the original references (`module.md`, `scratch.md`, `sharing.md`, `slurm.md`). 
-[Unreleased]: https://github.com/Shu-Wan/solx/compare/v0.5.1...HEAD +[Unreleased]: https://github.com/Shu-Wan/solx/compare/v1.0.0...HEAD +[1.0.0]: https://github.com/Shu-Wan/solx/releases/tag/v1.0.0 [0.5.1]: https://github.com/Shu-Wan/solx/releases/tag/v0.5.1 [0.5.0]: https://github.com/Shu-Wan/solx/releases/tag/v0.5.0 [0.4.0]: https://github.com/Shu-Wan/solx/releases/tag/v0.4.0 diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index ac5035b..a1bade9 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -11,12 +11,12 @@ content). Public-facing test methodology lives in solx/ # the repo ├── README.md # end-user entry point (CLI + skill) ├── DEVELOPMENT.md # you are here (skill + eval harness) -├── .github/workflows/ # ci.yml (lint+test) · release.yml (.pyz + GH release on tag) +├── .github/workflows/ # ci.yml (lint+test+build) · release.yml (musl binary + GH release on tag) ├── docs/ │ ├── ROADMAP.md # roadmap │ ├── solx.md # solx user manual │ └── coverage.md # public methodology + coverage matrix -├── solx/ # the solx CLI package (see solx/DEVELOPMENT.md) +├── solx/ # the solx CLI crate (Rust; see solx/DEVELOPMENT.md) ├── skills/sol-skill/ # the shipped skill (what users install) │ ├── SKILL.md │ └── references/ # solx, module, scratch, slurm, sessions, sharing @@ -27,7 +27,7 @@ solx/ # the repo ├── mocks/ # userland Sol mock environment │ ├── activate.sh │ ├── bin/ # PATH shims (hostname, module, srun, …) - │ └── home/ # fake $HOME with .solkeep + CSV warnings + │ └── home/ # fake $HOME with solx config ([keep]) + CSV warnings ├── runner/ # thin wrapper over skill-creator └── results/ # gitignored — per-iteration benchmarks ``` @@ -113,8 +113,8 @@ environment, each graded differently. |---|---|---|---| | **L0 — Triggering** | Does the skill's frontmatter description make Claude invoke the skill on Sol-related prompts and *not* on near-misses (generic SLURM, generic Python venv)? 
| Anywhere with `claude -p` | `skill-creator/scripts/run_loop.py` | | **L1 — Static / transcript-only** | Agent's *proposed* commands and reference-file reads. No execution. Catches: wrong placeholder, wrong storage location, missing reference load, suggesting `sudo`, suggesting a bulk-touch, snooping `~/.ssh/config`, forgetting the `command -v solx` branch. | Laptop, Sol login, anywhere | Subagent runs the prompt in a "describe what you'd do" mode; grader greps the transcript for required/forbidden patterns. | -| **L2 — Mocked Sol** | `solx` run against a fake Sol environment, plus its own unit suite. Catches: parsing the warning CSVs, keep-list matching (incl. carve-outs), side-detection logic, the destructive-confirm contract. | Laptop or Sol login (no privileges needed — pure userland mocks) | Run → assert on exit code + stdout/stderr + filesystem mutations. The renewal mechanism is covered by `solx/tests/test_keep.py` (incl. an end-to-end real-touch test over a real tree with stale mtimes); the static `mocks/` CSVs (absolute `/scratch` paths) back L1 parsing checks. | -| **L3 — Real Sol smoke** | Things only meaningful on actual Sol: real `module avail`, real `srun`, real ssh tunnel through compute node, the `vscode` wrapper, and `solx`'s startup latency vs raw SLURM. | Sol, manually, by maintainer | Short checklist the maintainer runs before release, plus `evals/runner/bench_solx_latency.sh` (read-only timing of `solx job` vs raw `squeue`). | +| **L2 — Mocked Sol** | `solx` run against a fake Sol environment, plus its own unit suite. Catches: parsing the warning CSVs, keep-list matching (incl. carve-outs), side-detection logic, the destructive-confirm contract. | Laptop or Sol login (no privileges needed — pure userland mocks) | Run → assert on exit code + stdout/stderr + filesystem mutations. 
The renewal mechanism is covered by the crate's keep tests (`solx/src/keep.rs` vectors + the end-to-end `solx/tests/cli.rs` real-touch test over a real tree with stale mtimes); the static `mocks/` CSVs (absolute `/scratch` paths) back L1 parsing checks. | +| **L3 — Real Sol smoke** | Things only meaningful on actual Sol: real `module avail`, real `srun`, real ssh tunnel through compute node, the `vscode` wrapper, `solx`'s startup latency vs raw SLURM, and whether a recommended partition/QOS/gres/time combo is actually schedulable. | Sol, manually, by maintainer | Short checklist the maintainer runs before release, `evals/runner/bench_solx_latency.sh` (read-only timing of `solx job` vs raw `squeue`), and `l3_sbatch_test_only` assertions that run an agent's recommended `#SBATCH` header through `sbatch --test-only`. | The classification lives **in the eval file** — each assertion is tagged `layer: L1 | L2 | L3` so the runner picks the right execution @@ -136,7 +136,7 @@ evals/mocks/ │ ├── srun, sbatch, scancel, squeue # log args, return canned exit │ └── ssh # log args, never connect ├── home/ # fake $HOME during eval -│ ├── .solkeep # example keep-list +│ ├── .config/solx/config.toml # example config with a [keep] block │ └── scratch-dirs-*.csv # synthetic Sol warning files └── scratch/swan16/ # fake scratch tree under fake $HOME ``` @@ -251,7 +251,7 @@ python /eval-viewer/generate_review.py \ - `"mock_log_contains": "..."` (L2 only — greps `$MOCK_LOG`) - `"manual"` (L3 only — surfaces in the manual checklist) 3. If the eval needs a specific mock state (e.g., `solx` present, or a - different `.solkeep`), add a `setup` block that the runner sources + different `[keep]` config), add a `setup` block that the runner sources before spawning the subagent. Keep prompts concrete and realistic — see the skill-creator @@ -260,14 +260,15 @@ description-optimization guide for what makes a good prompt. 
## Release process tie-in The CLI and the skill share one version line; a pushed `vX.Y.Z` tag -triggers `.github/workflows/release.yml` (build `solx.pyz`, publish the -GitHub Release). Before tagging: - -1. Bump the version in `solx/src/solx/__init__.py`, - `solx/pyproject.toml`, and `skills/sol-skill/SKILL.md` (`version:`); - refresh `solx/uv.lock` (`uv lock`). -2. Run the full eval suite locally (L1 + L2) and `solx`'s unit suite - (`cd solx && uv run pytest`). +triggers `.github/workflows/release.yml` (build the static musl binary, +publish the GitHub Release with it attached). Before tagging: + +1. Bump the version in `solx/Cargo.toml` and `skills/sol-skill/SKILL.md` + (`version:`); refresh `solx/Cargo.lock` (`cargo update -p solx`). The + release workflow refuses to publish if the tag, `Cargo.toml`, and + `SKILL.md` disagree. +2. Run the full eval suite locally (L1 + L2) and `solx`'s test suite + (`cd solx && cargo test`). 3. Walk the L3 manual checklist on real Sol (login + compute node). 4. Hand-edit `docs/coverage.md`: bump the "Last verified" date, flip any cells in the matrix, refresh "Known gaps". Move the @@ -282,7 +283,7 @@ GitHub Release). Before tagging: | Thing | Location | In git? | Why | |---|---|---|---| | Skill contents (SKILL.md, references) | `skills/sol-skill/` | yes | shipped to users | -| solx CLI package | `solx/` | yes | the CLI; built to `solx.pyz` by CI on tag | +| solx CLI crate | `solx/` | yes | the Rust CLI; built to a static binary by CI on tag | | CI workflows | `.github/workflows/` | yes | lint + test, and tag-driven release | | Mocks + runner code | `evals/mocks/`, `evals/runner/` | yes | no PII, useful for contributors | | Sanitized eval template | `evals/evals.example.json` | yes | shows the schema | @@ -298,7 +299,10 @@ specific than that stays out of git on purpose. ## Dependencies - [`uv`](https://docs.astral.sh/uv/) — script runner and Python env - manager. The mock harness assumes `uv` on `$PATH`, same as `solx`. 
+ manager for the eval harness (the runner and its helpers). `solx` + itself no longer needs it — the shipped CLI is a static binary. +- [Rust](https://rustup.rs/) (stable) — to build and test the `solx` + crate; `cargo test` runs the unit + end-to-end suites. - [`claude` CLI](https://docs.claude.com/en/docs/claude-code) — the runner shells out to spawn subagents. - The diff --git a/README.md b/README.md index 4947c1b..4dc6ed3 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,17 @@ [![CI](https://img.shields.io/github/actions/workflow/status/Shu-Wan/solx/ci.yml?branch=main&label=ci&logo=github)](https://github.com/Shu-Wan/solx/actions/workflows/ci.yml) [![Release](https://img.shields.io/github/v/release/Shu-Wan/solx?logo=github&color=blue)](https://github.com/Shu-Wan/solx/releases) -[![Python](https://img.shields.io/badge/python-3.10%2B-3776AB?logo=python&logoColor=white)](#installation) +[![Rust](https://img.shields.io/badge/rust-single%20binary-CE412B?logo=rust&logoColor=white)](#installation) [![License: MIT](https://img.shields.io/badge/license-MIT-green)](LICENSE) +
+ +### [📋 Cheat sheet](skills/sol-skill/references/cheatsheet.md)  ·  [📖 solx docs](docs/solx.md)  ·  [🌵 Sol skill](skills/sol-skill/SKILL.md) + +cheat sheet also as a [PDF](docs/cheatsheet.pdf) or from the CLI: `solx cheatsheet` + +
+ Solx is a CLI for ASU's **Sol** supercomputer, designed for agent-assisted work so you can stop babysitting Slurm. @@ -16,21 +24,17 @@ SSH to Sol, run `solx`, and keep the whole loop in your terminal. ## Installation -On Sol — `solx` provisions its own Python (≥ 3.10) via -[uv](https://docs.astral.sh/uv/): - -```shell -curl -fsSL https://github.com/Shu-Wan/solx/releases/latest/download/install.sh | sh -``` - -Re-run that command to upgrade. Prefer a package manager? +`solx` is a single static binary — no Python, no `uv`, no toolchain on the +box. On Sol, download it, make it executable, and put it on your `$PATH`: ```shell -uv tool install git+https://github.com/Shu-Wan/solx.git#subdirectory=solx +mkdir -p ~/.local/bin +curl -fLo ~/.local/bin/solx https://github.com/Shu-Wan/solx/releases/latest/download/solx-x86_64-unknown-linux-musl +chmod +x ~/.local/bin/solx ``` -Both channels need `uv` on your `$PATH` — install it from -[astral.sh/uv](https://docs.astral.sh/uv/) first if you don't have it. +Re-run the `curl` and `chmod` lines to upgrade. The binary is fully static (musl), so it +runs on any x86-64 Linux — Sol's RHEL 8 included. ## Usage @@ -57,6 +61,7 @@ What it's good at: **Learn more:** the full command manual is [docs/solx.md](docs/solx.md). Cached reference notes on Sol conventions — [the `solx` CLI](skills/sol-skill/references/solx.md), +[the Sol cheat sheet](skills/sol-skill/references/cheatsheet.md) ([PDF](docs/cheatsheet.pdf)), [modules](skills/sol-skill/references/module.md), [scratch policy](skills/sol-skill/references/scratch.md), [Slurm jobs](skills/sol-skill/references/slurm.md), @@ -81,13 +86,13 @@ same way. ## Development -- **Changelog** — [CHANGELOG.md](CHANGELOG.md); current release **v0.4.0**. -- **Roadmap** — [docs/ROADMAP.md](docs/ROADMAP.md); next up is the native - single-binary rewrite. +- **Changelog** — [CHANGELOG.md](CHANGELOG.md); current release **v1.0.0** + (native Rust binary). +- **Roadmap** — [docs/ROADMAP.md](docs/ROADMAP.md). 
- **Contributing, tests, and the eval harness** — [DEVELOPMENT.md](DEVELOPMENT.md) and - [solx/DEVELOPMENT.md](solx/DEVELOPMENT.md), with the coverage matrix in - [docs/coverage.md](docs/coverage.md). + [solx/DEVELOPMENT.md](solx/DEVELOPMENT.md), with the coverage matrix + in [docs/coverage.md](docs/coverage.md). ## Disclaimer diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 4f02180..fec0037 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -1,103 +1,54 @@ # Roadmap: the `solx` CLI -Forward-looking design doc for **`solx`**, a CLI for working -on ASU's **Sol** supercomputer. The Sol-side CLI and its skill -integration shipped in v0.4.0; v0.5.0 cut startup latency to the same -order as a raw SLURM call. The next focus is the **native single-binary -rewrite** (below); the **local-machine-side** design stays deferred. +Forward-looking design doc for **`solx`**, a CLI for working on ASU's +**Sol** supercomputer. solx is a native single binary that drives the +Sol-side loop — interactive Slurm jobs, scratch renewal, and one TOML +config. That loop is stable; the next focus is the **local-machine +(laptop) side** (below). End-user docs: [`../README.md`](../README.md), [`../skills/sol-skill/SKILL.md`](../skills/sol-skill/SKILL.md). Contributor / harness docs: [`../DEVELOPMENT.md`](../DEVELOPMENT.md), -[`coverage.md`](coverage.md). Released history: +[`coverage.md`](coverage.md). Per-release history: [`../CHANGELOG.md`](../CHANGELOG.md). -## Stages - -| Stage | Outcome | Status | -|---|---|---| -| 1 — Skill manual-SSH path | The agent skill (manual SSH, `sbatch`, scratch renewal). | ✅ shipped (v0.2.0) | -| 2 — `solx` CLI (Sol-only) | `solx/` package: jobs, interactive allocation, scratch renewal, config; CLI agent output. | ✅ shipped (v0.3.0) | -| 3 — Skill ↔ `solx` integration + distribution | Skill installs and drives `solx`; single-file install channel + CI releases; one version line; situational job awareness (#9). 
| ✅ shipped (v0.4.0) | -| 4 — Startup latency | Thin spine: stdlib `argparse` dispatch, `rich` only on human render paths, static completion scripts. A warm `solx job` read costs ~0.13s with the `.pyz` install — same order as raw `squeue`. | ✅ shipped (v0.5.0) | -| 5 — Native single binary | Rewrite `solx` as one native executable (Rust): cold-start immunity on the NFS home, no Python/`uv` runtime requirement, single-file install. | 🟡 in development (`v1.0-rust` branch, targets v1.0) | -| — Local-machine side | `solx up/down/forward`, ssh-chain construction. | ⏸ deferred | - -Shipped-stage detail lives in [`../CHANGELOG.md`](../CHANGELOG.md). - -## Startup latency — shipped in v0.5.0 - -On Sol's NFS home, `solx` used to pay a Python-startup tax a raw SLURM -binary doesn't (Typer/Click import ≈ 0.97s on every invocation, plus -`rich` pulled in even on `--json` runs), so the skill steered agents to -raw `squeue`/`scancel` for one-off reads. v0.5.0 removed that tax with a -**thin spine**: - -- **stdlib `argparse` dispatch** (`solx/src/solx/main.py`, entry point - `solx.main:main`). Importing the entry module costs nothing beyond the - interpreter baseline; `--version`/`version` short-circuit before the - parser tree is even built; command bodies (and their `rich`/`pathspec` - dependency trees) import inside their handlers. -- **`rich` on human render paths only.** `Out` writes JSON and plain - diagnostics straight to `sys.stdout`/`sys.stderr`; `rich.table` / - `rich.prompt` import inside the table-render and prompt branches. A - `--json` or piped run never loads `rich` at all. -- **Static completion scripts.** `solx completions ` - renders the command surface into a fully static script - (`solx/src/solx/_completions.py`) — completion never execs `solx`, so - the first Tab of a session costs no interpreter start. 
- -**Measured** (Sol compute node inside an allocation, 4 cores, NFS -`$HOME`, real Slurm 25.11.6; warm median seconds, n=9 after 1 warmup, -cold-ish first run in parentheses): - -| command | raw squeue | v0.4.0 venv | v0.4.0 pyz (`~/.local/bin`) | v0.5.0 venv | v0.5.0 pyz (local `/tmp`) | -|---|---|---|---|---|---| -| `--version` | — | 1.137 (1.584) | 1.345 (1.390) | 0.281 (0.234) | **0.018** (0.019) | -| `job list` | 0.076 (0.741) | 2.500 (2.141) | 2.505 (1.537) | 1.020 (2.160) | **0.126** (0.123) | -| `job time` | 0.076 (0.071) | 1.251 (1.346) | 2.505 (2.505) | 0.945 (0.153) | **0.127** (0.116) | - -raw squeue rows: `job list` = `squeue --me`; `job time` = -`squeue -h -j $SLURM_JOB_ID -o %L`. Caveats that keep the table honest: - -- The `.pyz` column places the v0.5.0 artifact on node-local `/tmp` and - the v0.4.0 one on NFS, so the raw 75× / 19.9× / 19.7× overstates code - alone. Installed apples-to-apples on NFS `$HOME` (where `install.sh` - writes it), v0.5.0 `.pyz` is ~0.10s / 0.39s / 0.31s — **13× / 6.4× / - 8.1×** over v0.4.0. Venv-to-venv on NFS: 4.0× / 2.5× / 1.3×. Node-local - `/tmp` is the best case (`--version` ~0.02s). -- The remaining gap vs raw `squeue` is ~50ms: interpreter startup plus - the `squeue` subprocess fork are all that's left. -- "Cold" is the first invocation in the benchmark process only — page - cache on a shared node makes true cold unmeasurable, so treat cold - numbers as cold-ish. The cluster controller showed sporadic ~2s - `squeue` spikes, which the n=9 medians absorb. - -`evals/runner/bench_solx_latency.sh` reproduces the solx-vs-raw -comparison on any Sol node; `evals/parity/` is the behavioral matrix -that verified the dispatch rewrite against captured v0.4.0 output. - -**What remains, for v1.0:** - -- **Stage 5 — the native single-binary rewrite (Rust).** A compiled - `solx` removes the interpreter floor entirely and is immune to NFS - cold starts: no Python or `uv` runtime requirement, one static file to - install. 
In development on the `v1.0-rust` branch. -- **Actually removing the `~/.solkeep` fallback.** Its removal moved - from 0.5.0 to **1.0.0** — `solx keep` keeps reading a legacy - `~/.solkeep` (with a deprecation notice) through the 0.5.x line, so - the migration window spans one more release. +## What `solx` does today + +- **Interactive jobs from templates** — `solx job start/list/time/stop` + and `solx job jump` onto the compute node. +- **Scratch renewal** — `solx keep` renews only the `[keep]` directories + Sol has actually flagged, never a blanket `touch`. +- **One TOML config** — `solx init` writes a starter; `solx config` + shows/edits it. +- **A built-in cheat sheet** — `solx cheatsheet` prints the Sol quick + reference (partition/QOS table, `solx` ↔ raw SLURM, wrappers, tunnels). +- **Built for CLI agents** — JSON off a TTY, results on stdout / + diagnostics on stderr, meaningful exit codes, no hidden prompts; static + shell completions for bash/zsh/fish. +- **A single static binary** — one exec, startup flat on the NFS home + regardless of node load, with no Python, `uv`, or toolchain on the box. + +The companion `sol-skill` teaches an agent when to reach for `solx` vs. +raw Slurm, and the rest of Sol's conventions. + +## Next: the local-machine (laptop) side + +The Sol-side loop is stable, so the next step is bringing it to where you +start — your laptop. The sketch is `solx up` / `down` / `forward` / `info`: +construct the SSH chain (ProxyJump through the login node), start or attach +an allocation, and forward a port to a compute-node service, all from the +local machine. + +It's the next focus, not started: the design threads ssh-client behavior, +ControlMaster, Duo, and scheduler queue races — none of it unit-testable — +so it needs a from-scratch design and a maintainer greenlight before work +begins. Until then `solx` stays a tool you run **on Sol**, and the manual +`ssh -L … -J …` chain (see the skill) covers the laptop side. 
## Out of scope (still) -- **Local-machine-side `solx`** (`up/down/forward/info`, ssh-chain - construction) — deferred. The original "one magic command from the - local machine" threaded ssh-client behavior, ControlMaster, Duo, and queue - races, none of which are unit-testable. It returns only when the - Sol-side primitives are stable, the design is re-thought from scratch, - and the user greenlights it. `solx` stays a tool you run **on Sol**. -- **PyPI publication.** Install is via the `.pyz` channel or - `uv tool install` from Git. +- **Package-manager publication** (crates.io, PyPI, Homebrew). Install is + the prebuilt binary from the GitHub Release. ## Design principles @@ -128,9 +79,8 @@ derives from them. placeholders, never with the maintainer's username baked in. 9. **User experience over the tool.** The skill drives an agent on the user's behalf; where a raw SLURM call is faster and just as clear, - prefer it. `solx` has to *earn* its place per task — the v0.5.0 - startup-latency work exists because of this principle, and the - native rewrite (Stage 5) continues it. + prefer it. `solx` has to *earn* its place per task — that's why it's a + single native binary with startup on the order of a raw SLURM call. ## Command surface, config, and behavior → `solx.md` @@ -163,18 +113,16 @@ surface comes with it. ## Decisions confirmed -- **CLI framework**: stdlib `argparse` as of 0.5.0 (see - [Startup latency](#startup-latency--shipped-in-v050)). `rich` is - retained for human-facing tables and prompts only, imported only on - those paths — agent (`--json`/piped) runs never load it. Textual - deferred. -- **Completions**: static scripts generated from one description of the - command surface (`solx/src/solx/_completions.py`) for bash, zsh, and - fish; completion never execs `solx`. Both zsh install modes - (eval/source and fpath autoload) are supported. -- **`~/.solkeep` removal**: **1.0.0**. 
Deprecated since 0.4.0; `solx - keep` still reads it with a deprecation notice, and `solx config - import-solkeep` migrates it. +- **Implementation**: native binary in Rust (`clap` command tree). Plain + aligned tables for human output; nothing emits color. Command bodies do + no work until dispatched, so startup is a single exec. +- **Completions**: static scripts for bash, zsh, and fish, embedded in + the binary (`solx/assets/`) and emitted by `solx completions`; + completion never execs `solx`. Both zsh install modes (eval/source and + fpath autoload) are supported. +- **`~/.solkeep`**: not used. The config `[keep]` block is the only + keep-list source; `solx keep` never reads a `~/.solkeep`, and there is + no `import-solkeep` command or `--solkeep` flag. - **Config**: single TOML under `$XDG_CONFIG_HOME/solx/config.toml`. No multi-file split, no `[shared]` merge. - **Glob library for `[keep]`**: `pathspec` (gitignore-style include + @@ -183,11 +131,10 @@ surface comes with it. - **Default jobid resolution**: verb-aware — argument > `$SLURM_JOB_ID` > `squeue`, where `time`/`jump` auto-pick the most recent and `stop` refuses to guess (exit 2). Full rules in [`solx.md`](solx.md). -- **Repo layout**: same repo, CLI under `solx/`, skill under - `skills/sol-skill/`, one version line. Repo renamed `sol-skills` → - `solx` at v0.4.0; the name `solx` was kept (short, unique, evokes Sol). +- **Repo layout**: one repo — CLI under `solx/`, skill under + `skills/sol-skill/`, on one version line. - **`vscode` / `sbatch` wrappers**: out of scope. For VSCode, run `code tunnel` on a compute node; for batch, `sbatch` directly. - **Skill subcommands** (`solx skill install/remove/...`): reserved, not - implemented as of v0.4.0 (the skill installs via agentskills.io - installers). Revisit if it earns its place. + implemented (the skill installs via agentskills.io installers). Revisit + if it earns its place. 
diff --git a/docs/cheatsheet.pdf b/docs/cheatsheet.pdf new file mode 100644 index 0000000..5a956bc Binary files /dev/null and b/docs/cheatsheet.pdf differ diff --git a/docs/coverage.md b/docs/coverage.md index f0002a2..251011b 100644 --- a/docs/coverage.md +++ b/docs/coverage.md @@ -5,13 +5,12 @@ automated verification, and what's a known gap. The eval harness requires manual orchestration today, so this document is updated by hand before each release. -**Version:** v0.4.0 (see [`../CHANGELOG.md`](../CHANGELOG.md)) -**Last verified:** v0.4.0 restructured the skill around `solx`. The -`solx` CLI is covered by its own unit suite (`solx/tests/`, 187 tests -passing) including an end-to-end real-touch renewal test; the -skill-level L1/L2/L3 evals for the new `solx`-driven flows are **pending -re-run on Sol** and are marked 🟡 below. Rows for unchanged behaviors -carry over from the v0.3.0 verification (2026-05-28). +**Version:** v1.0.0 (see [`../CHANGELOG.md`](../CHANGELOG.md)) +**Last verified:** the `solx` CLI is covered by its own crate suite +(`cargo test` in `solx/`: unit tests per module plus the end-to-end +`tests/cli.rs`, including a real-touch renewal test), which runs in CI. +The skill-level L1/L2/L3 evals for the `solx`-driven flows are **pending +re-run on Sol** and are marked 🟡 below. ## Status legend @@ -45,15 +44,15 @@ to the skill should mean adding a row here in the same group. 
| Behavior | Status | Notes | |---|---|---| -| Detects `solx` (`command -v solx`) and prompts to install when missing | 🟡 documented | New in v0.4.0; skill eval pending | -| Uses `solx` for the job lifecycle and keep; raw Slurm as the no-`solx` fallback | 🟡 documented | Guidance updated for v0.5.0; skill eval pending | -| `solx` exits 2 off-Sol (wrong-side guard) | 🟢 tested | `solx/tests/` (`require_sol` / `side`) | -| Drives the `solx job` lifecycle (start/list/time/jump/stop) | 🟢 tested (CLI) | `solx/tests/test_jobs.py`; skill-teaching eval pending | -| Verb-aware job-id resolution (most-recent for time/jump; stop refuses to guess) | 🟢 tested | `solx/tests/test_slurm.py`, `test_jobs.py` | -| Destructive-confirm contract (`-y`/`-n`, non-interactive refuse, exit 2) | 🟢 tested | `solx/tests/test_jobs.py`, `test_keep.py` | -| CLI agent output: JSON off a TTY, results on stdout / diagnostics on stderr | 🟢 tested | `solx/tests/test_output.py`, `test_jobs.py`, `test_keep.py` | -| Per-command latency vs raw SLURM quantified (one-off reads at parity as of v0.5.0) | 🟢 tested | `evals/runner/bench_solx_latency.sh` (L3, real Sol): raw `squeue` ~0.08s vs warm `solx job` ~0.13s (`.pyz` install). Full measured table in `docs/ROADMAP.md`. 
| -| Skill treats `solx` and raw `squeue`/`scancel` as equivalent for one-off reads; raw forms documented as fallback | 🟡 documented | Updated for v0.5.0; skill eval pending | +| Detects `solx` (`command -v solx`) and prompts to install when missing | 🟡 documented | skill eval pending | +| Uses `solx` for the job lifecycle and keep; raw Slurm as the no-`solx` fallback | 🟡 documented | skill eval pending | +| `solx` exits 2 off-Sol (wrong-side guard) | 🟢 tested | `solx/src/side.rs`, `solx/tests/cli.rs` | +| Drives the `solx job` lifecycle (start/list/time/jump/stop) | 🟢 tested (CLI) | `solx/src/jobs.rs`, `solx/tests/cli.rs`; skill-teaching eval pending | +| Verb-aware job-id resolution (most-recent for time/jump; stop refuses to guess) | 🟢 tested | `solx/src/slurm.rs`, `solx/tests/cli.rs` | +| Destructive-confirm contract (`-y`/`-n`, non-interactive refuse, exit 2) | 🟢 tested | `solx/tests/cli.rs` (job stop / keep) | +| CLI agent output: JSON off a TTY, results on stdout / diagnostics on stderr | 🟢 tested | `solx/src/output.rs`, `solx/tests/cli.rs` | +| Per-command latency vs raw SLURM quantified (one-off reads at parity) | 🟢 tested | `evals/runner/bench_solx_latency.sh` (L3, real Sol): raw `squeue` ~0.08s vs warm `solx job` ~0.12s (native binary) | +| Skill treats `solx` and raw `squeue`/`scancel` as equivalent for one-off reads; raw forms documented as fallback | 🟡 documented | skill eval pending | ### Detecting the Environment @@ -69,13 +68,11 @@ to the skill should mean adding a row here in the same group. 
|---|---|---| | Recommends `/scratch/$USER` for datasets, caches, model weights | 🟢 tested | Verified iter-1: agent recommends `/scratch/$USER` for HF cache | | Steers away from `/home` for large data | 🟢 tested | Verified iter-1 | -| `.solkeep` syntax (gitignore-style, `!` negation, `**` glob) | 🟢 tested | Verified iter-2 eval A: agent produces correct file with explanation | +| `[keep]` block syntax (gitignore-style, `!` negation, `**` glob) | 🟢 tested | Verified iter-2 eval A: agent produces correct config block with explanation | | Refuses to bulk-touch `/scratch` (`find -exec touch`) | 🟡 documented | Negative assertion; not yet probed | -| `solx keep --dry-run` plan correctness | 🟢 tested | `solx/tests/test_keep.py`: dry-run plans without touching; JSON plan bounded | -| `solx keep` refreshes kept files (recursively) | 🟢 tested | `solx/tests/test_keep.py::test_keep_end_to_end_real_touch`: mtimes refresh across the tree | -| keep-list carve-outs honored at run time (`.venv`/`__pycache__` skipped, non-kept dirs skipped) | 🟢 tested | `solx/tests/test_keep.py` (end-to-end + `build_plan`) | -| `solx keep` warns but still works on a legacy `~/.solkeep` (support removed 1.0.0) | 🟢 tested | `solx/tests/test_keep.py::test_keep_solkeep_fallback_warns_deprecated` | -| `solx config import-solkeep` migrates `~/.solkeep` → `[keep]` | 🟢 tested | `solx/tests/test_init.py::test_import_solkeep_*` | +| `solx keep --dry-run` plan correctness | 🟢 tested | `solx/src/keep.rs`, `solx/tests/cli.rs::keep_dry_run_plan_filters_by_keep_block`: dry-run plans without touching; JSON plan bounded | +| `solx keep` refreshes kept files (recursively) | 🟢 tested | `solx/tests/cli.rs::keep_renews_real_files`: mtimes refresh across the tree | +| keep-list carve-outs honored at run time (`.venv`/`__pycache__` skipped, non-kept dirs skipped) | 🟢 tested | `solx/src/keep.rs` (matcher vectors) + `solx/tests/cli.rs` (end-to-end) | | File sharing procedure (`chmod` / `install` / `cp` between users) | 🟡 
documented | | | Scratch-quota-exceeded behavior | 🔴 gap | Would need a fault-injection mock | | Concurrent `solx keep` runs | 🔴 gap | No locking; documented behavior is "don't" | @@ -97,7 +94,9 @@ to the skill should mean adding a row here in the same group. |---|---|---| | Picks `interactive` wrapper for interactive shells over raw `salloc` | 🟢 tested | Verified iter-4 eval B | | Knows `interactive` defaults to `-p htc -q public -c 1 -t 0-4` (bare invocation works) | 🟡 documented | Added after reading `/usr/local/bin/interactive` source | -| Routes "lightweight / debug / quick" workloads to `htc` partition | 🟢 tested | Verified iter-4 eval B (rule promoted to SKILL.md from references) | +| Routes work by wall-time, not CPU-vs-GPU: ≤4h (incl. GPU) → `htc`; resists the "GPU → public" reflex | 🟢 tested | Partition eval: fixed skill 14/14 vs pre-fix 10/14 (evals #4/#8/#9/#10) | +| Knows the QOS layer: `debug` (≤15m, high-pri, public/general only — rejected on `htc`), `private` (preemptible, >4h) | 🟢 tested | Eval #10; `-p htc -q debug` rejected by live `sbatch --test-only` | +| L3 grader: agent's recommended `#SBATCH` header is validated against the live scheduler (`sbatch --test-only`) | 🟢 tested | `l3_sbatch_test_only` check on evals #4/#8/#9/#10 | | Recommends `/packages/public/sol-sbatch-templates/` over writing SBATCH from scratch | 🟡 documented | Iter-5 P5: agent acknowledged templates exist but didn't name the specific subdir; skill gap to sharpen | | SBATCH header generation (partition, QOS, time, GPU) | 🟢 tested | Verified iter-5 P5: complete OpenMPI script with correct partition/QOS, `srun --mpi=pmix`, `--export=NONE`, `/scratch` logs | | Job lifecycle: `sbatch`, `squeue`, `scancel`, `scontrol update` | 🟡 documented | | @@ -107,8 +106,8 @@ to the skill should mean adding a row here in the same group. 
| Behavior | Status | Notes | |---|---|---| -| Checks `myfairshare` before submitting; backs off below ~0.05 (no scheduler spam) | 🟡 documented | New in v0.4.0 (issue #9); skill eval pending. `myfairshare` lookup itself 🟢 (iter-5 P3) | -| Tracks remaining wall-time (`solx job time` / `squeue -O TimeLeft`) and wraps up / hands off before expiry | 🟡 documented | New in v0.4.0 (issue #9); skill eval pending | +| Checks `myfairshare` before submitting; backs off below ~0.05 (no scheduler spam) | 🟡 documented | skill eval pending; `myfairshare` lookup itself 🟢 (iter-5 P3) | +| Tracks remaining wall-time (`solx job time` / `squeue -O TimeLeft`) and wraps up / hands off before expiry | 🟡 documented | skill eval pending | | Uses Sol wrappers directly (`myfairshare`/`myjobs`/`seff`/`showgpus`/…) rather than wrapping them | 🟢 tested | Status-query rows below verified iter-5 P2–P4 | ### Asking the Cluster About Yourself and Your Jobs diff --git a/docs/scratch.md b/docs/scratch.md index 6f0e231..1fe2615 100644 --- a/docs/scratch.md +++ b/docs/scratch.md @@ -68,22 +68,6 @@ ssh soldtn 'export PATH=$HOME/.local/bin:$PATH; solx keep -j 24 -y' solx keep ``` -## Migrating an old `~/.solkeep` - -If you used the older `sol_renew.py` script you have a `~/.solkeep` keep-list. -`solx keep` still reads it (with a deprecation notice; support ends in a future -release), so migrate it into your config once: - -```shell -solx config import-solkeep # folds ~/.solkeep into [keep] -solx config show # review the result -``` - -If your keep-list re-includes a path *under* an earlier `!` carve-out, the -`[keep]` form (include minus exclude) can't reproduce that ordering — the -command tells you and asks you to confirm with `-f`. Compare -`solx keep --dry-run` before and after to be sure. - --- Full command reference: [solx.md](solx.md). 
Sol's deletion pipeline + CSV diff --git a/docs/solx.md b/docs/solx.md index 11b5e49..d6ed083 100644 --- a/docs/solx.md +++ b/docs/solx.md @@ -19,10 +19,10 @@ Install instructions are in [`solx/README.md`](../solx/README.md). The short version, on Sol: ```shell -# Recommended on Sol — single-file install (re-run to upgrade): -curl -fsSL https://github.com/Shu-Wan/solx/releases/latest/download/install.sh | sh -# Or as a uv tool: -uv tool install git+https://github.com/Shu-Wan/solx.git#subdirectory=solx +# solx is one static binary — download it, make it executable, put it on PATH: +mkdir -p ~/.local/bin +curl -fLo ~/.local/bin/solx https://github.com/Shu-Wan/solx/releases/latest/download/solx-x86_64-unknown-linux-musl +chmod +x ~/.local/bin/solx solx --version solx init # writes ~/.config/solx/config.toml @@ -42,8 +42,8 @@ solx init # writes ~/.config/solx/config.toml | `solx job time [JOBID]` | Print the time remaining on a job. | | `solx keep` | Renew `/scratch` files Sol flagged for deletion. | | `solx config show` / `edit` | Show or edit your config. | -| `solx config import-solkeep` | Migrate a legacy `~/.solkeep` into `[keep]`. | | `solx completions <SHELL>` | Print a shell-completion script. | +| `solx cheatsheet` | Print the Sol quick reference (SLURM + solx) as text. | | `solx version` (alias `--version`) | Print the version. | | `solx help` (alias `--help`) | Show help. | @@ -64,12 +64,14 @@ exit # back to the login node; the allocation stays alive For a quick **status**, **time-left**, or **cancel**, `solx` and the underlying Slurm command are interchangeable: a warm `solx job` read runs in -≈0.13 s on Sol with the single-file install, vs ≈0.08 s for raw `squeue` -(measured — `evals/runner/bench_solx_latency.sh`; a venv install on the NFS -home is slower, ≈1 s warm).
The raw forms, for shells without `solx`: +≈0.12 s on Sol, vs ≈0.08 s for raw `squeue` (measured — +`evals/runner/bench_solx_latency.sh`; the residual is just the `squeue` +subprocess `solx` spawns, and the native binary's startup doesn't degrade +under node load or a cold NFS cache). The raw forms, for shells without +`solx`: ```shell -squeue --me # = solx job list (also: myjobs, sq) +squeue --me # = solx job list (also: myjobs) squeue -h -j "$SLURM_JOB_ID" -o %L # = solx job time, inside a job scancel <JOBID> # = solx job stop <JOBID> -y myfairshare # scheduling priority (no solx equivalent) @@ -119,8 +121,7 @@ generated by an older `solx` gets no candidates from a newer one. `solx` reads one file: `~/.config/solx/config.toml` (or `$XDG_CONFIG_HOME/solx/config.toml`). Run `solx init` to create a starter, then `solx config edit` to fill it in. On a terminal, `solx init` offers a -short walkthrough — pick the shell `solx job jump` opens, and (if you have a -`~/.solkeep`) confirm importing its patterns into `[keep]`. A complete +short walkthrough — pick the shell `solx job jump` opens. A complete example: ```toml @@ -255,15 +256,10 @@ their timestamps with `touch`. It only ever touches directories that are **both** flagged by Sol **and** in your keep-list — so there's nothing for it to do until Sol actually flags something. -**Where the keep-list comes from**, in precedence order: - -1. `--solkeep <FILE>` — a specific gitignore-style keep-list, if you pass one. -2. the `[keep]` block in your `solx` config (`include` / `exclude`). -3. `~/.solkeep` — a **deprecated** legacy keep-list. `solx keep` still reads it - if present (so existing setups keep working) but prints a deprecation notice; - **support is removed in solx 1.0.0**. Migrate it once with `solx config - import-solkeep`. (Format: one pattern per line, `!` carves a subtree out, a - bare path means that directory and everything under it — last match wins.)
+**The keep-list comes from the `[keep]` block** in your `solx` config +(`include` / `exclude`). Patterns are gitignore-style: a bare path means +that directory and everything under it, `!` carves a subtree out, `**` +recurses. ```shell solx keep --dry-run # preview exactly which directories would be renewed @@ -273,7 +269,6 @@ solx keep --stage pending # only the most-urgent CSV | Flag | Meaning | |---|---| -| `--solkeep FILE` | Use a specific gitignore-style keep-list (overrides `[keep]`). | | `--stage {pending,over90,inactive,all}` | Which warning CSVs to read. Default `all`. | | `--csv-dir DIR` | Where Sol's CSVs live. Default your home directory. | | `-j N`, `--jobs N` | How many parallel workers. The default is small on purpose — `/scratch` is networked storage. | @@ -283,8 +278,8 @@ This does a lot of small filesystem operations, which Sol's login nodes throttle. For a big renewal, run it on a compute node or the data-transfer node (`ssh soldtn`). -If there's no keep-list anywhere — no `[keep]` block and no `~/.solkeep` — -`solx keep` stops and points you to `solx config edit`. +If there's no `[keep]` block in the config, `solx keep` stops and points you +to `solx config edit`. --- @@ -340,9 +335,8 @@ Sol drops warning CSVs in `$HOME` as files age out `scratch-dirs-inactive.csv`). `solx keep`: 1. Reads those CSVs from `--csv-dir` (default `$HOME`). -2. Filters the flagged directories through your keep-list (`--solkeep` file > - the `[keep]` config block > `~/.solkeep`), compiled with `pathspec` - gitignore-style. +2. Filters the flagged directories through your keep-list (the `[keep]` + config block), matched gitignore-style. 3. Runs `touch -a -m -c` on the intersection — only directories that **both** appear in a CSV **and** match the keep-list. It never walks `/scratch` wholesale. 
diff --git a/evals/README.md b/evals/README.md index b19c59a..e2a8a6d 100644 --- a/evals/README.md +++ b/evals/README.md @@ -19,10 +19,8 @@ evals/ ├── mocks/ # userland Sol mock environment │ ├── activate.sh # source to put mocks on PATH │ ├── bin/ # PATH shims -│ ├── home/ # fake $HOME (CSVs + .solkeep) +│ ├── home/ # fake $HOME (CSVs + solx config [keep]) │ └── scratch/ # fake /scratch tree -├── parity/ # solx CLI behavioral parity matrix -│ └── README.md # how to capture goldens + compare runs ├── runner/ │ ├── bench_solx_latency.sh # L3: solx vs raw SLURM latency, on real Sol │ └── build_sandbox_home.sh # hides the skill for fair baselines @@ -41,27 +39,24 @@ source evals/mocks/activate.sh hostname -a # → sc001.sol.rc.asu.edu (mocked) echo "$MOCK_LOG" # path to per-session invocation log -# 3. The renewal mechanism is unit-tested in the solx package — run that -# suite for the L2 filesystem-mutation coverage (real files + stale -# mtimes; refreshes kept files, honors carve-outs, skips the rest): -( cd solx && uv run pytest tests/test_keep.py -q ) +# 3. The renewal mechanism is tested in the solx crate — run that suite +# for the L2 filesystem-mutation coverage (real files + stale mtimes; +# refreshes kept files, honors carve-outs, skips the rest): +( cd solx && cargo test --test cli ) ``` > The static `mocks/` CSVs list absolute `/scratch/sparky/...` paths > for L1 (parsing/plan) checks, so they can't prove real touching on a -> test box. `solx/tests/test_keep.py::test_keep_end_to_end_real_touch` +> test box. The end-to-end real-touch test in `solx/tests/cli.rs` > builds a self-contained tree under `$TMPDIR` with stale mtimes and > asserts the filesystem mutations. 
-## CLI parity matrix +## Testing the CLI itself -[`parity/`](parity/README.md) regression-tests the **`solx` CLI itself** -(rather than the skill): 67 cases over the full command surface, each in -an isolated fake `$HOME` with deterministic SLURM mocks, captured as -stdout/stderr/exit-code and compared byte-for-byte between two `solx` -builds. Use it whenever the dispatch layer or runtime changes and the -command surface must provably not. Goldens are environment-captured and -not committed — see its README for the capture/compare workflow. +This harness tests the **skill**. The `solx` CLI is tested in its own +crate: `cd solx && cargo test` drives the compiled binary end-to-end +against deterministic SLURM mocks (`solx/tests/cli.rs`) plus the unit +suites, and runs in CI on every push. ## Eval entry schema @@ -107,6 +102,34 @@ machine-checkable `check`. Layer tags drive how the runner executes each eval and how `docs/coverage.md` is regenerated. +## Check types + +`check` is the machine-checkable grader for an assertion; the runner +dispatches on its key: + +- `transcript_contains` / `transcript_lacks` — literal substring is / + isn't anywhere in the agent's transcript (L1). +- `transcript_matches` — Python regex against the transcript (L1). +- `l2_script` — run the named script (e.g. the `solx` crate's keep + test) and pass if it exits `exit_code` (L2, real filesystem mutation). +- `l3_sbatch_test_only` — extract the resource flags from the agent's + **final** recommendation (the last complete `#SBATCH` / `salloc` + header block in the transcript) and run them through `sbatch + --test-only` on real Sol; `expect: "accepted"` passes iff the + scheduler accepts the combo, `expect: "rejected"` iff it errors (L3, + live-scheduler truth). An empty extraction (no header found) scores + **FAIL**, not pass. Strip non-resource lines (conda/module/`srun` + payload); pass only partition/qos/gres/time/cpu/mem flags. 
+ +The `l3_sbatch_test_only` check is the partition/QOS grader. A regex +assertion checks *which* partition the agent named; this checks the +recommendation is actually **schedulable**. It catches headers that +read plausible but the scheduler rejects — `-p htc -q debug` (htc only +allows `qos=public`), a `debug`-QOS job over its 15-minute wall, or a +GPU job parked on a partition that can't grant it. It exists because a +plausible-looking but wrong partition/QOS pairing is exactly the bug +class regex alone misses. + ## Privacy `evals.json` and `evals/results/` are gitignored because they may diff --git a/evals/evals.example.json b/evals/evals.example.json index 0c27177..9b12f3f 100644 --- a/evals/evals.example.json +++ b/evals/evals.example.json @@ -28,12 +28,12 @@ "id": 2, "name": "scratch-renewal-default-flow", "prompt": "I just got an email from Sol saying some of my scratch directories will be deleted soon. Help me extend the ones I still need.", - "expected_output": "Agent reads references/scratch.md, inspects the [keep] config (or migrates ~/.solkeep), runs solx keep --dry-run first, never proposes a bulk find/touch.", + "expected_output": "Agent reads references/scratch.md, inspects the [keep] config block, runs solx keep --dry-run first, never proposes a bulk find/touch.", "setup": { "mock_hostname": "sc001.sol.rc.asu.edu", "include_solx": true, "fake_csvs": ["scratch-dirs-pending-removal.csv", "scratch-dirs-over-90days.csv"], - "fake_solkeep": "evals/mocks/home/.solkeep" + "fake_config_keep": true }, "assertions": [ { @@ -89,26 +89,36 @@ "id": 4, "name": "slurm-gpu-pytorch-job", "prompt": "Write me an sbatch script that trains a PyTorch model on 2 A100s for 4 hours, in a conda env called `vision`.", - "expected_output": "SBATCH script with -p public, -G a100:2 (or --gres=gpu:a100:2), -t 0-04:00:00, module load mamba/latest + source activate vision, no sudo.", + "expected_output": "SBATCH script targeting -p htc (the 4h request fits htc's wall and htc 
carries A100s; qos public is the default), -G a100:2 (or --gres=gpu:a100:2), -t 0-04:00:00, module load mamba/latest + source activate vision, no sudo.", "setup": { "mock_hostname": "sc001.sol.rc.asu.edu", "include_solx": false }, "assertions": [ + { + "text": "Routes the 4h GPU job to htc (it fits htc's 4h wall and htc carries A100s), not reflexively to public", + "layer": "L1", + "check": {"transcript_matches": "(-p\\s*htc|--partition[= ]htc)"} + }, { "text": "Script requests 2 A100 GPUs", "layer": "L1", "check": {"transcript_matches": "(--gres=gpu:a100:2|-G\\s+a100:2)"} }, { - "text": "Time limit is 4 hours", + "text": "Time limit is 4 hours (any canonical form: 0-4, 0-04:00:00, 04:00:00, 4:00:00)", "layer": "L1", - "check": {"transcript_matches": "(0-04:00:00|--time=4:00:00|-t 4:00:00)"} + "check": {"transcript_matches": "(0-0?4\\b|0-0?4:00:00|(? public' reflex)", + "layer": "L1", + "check": {"transcript_matches": "(-p\\s*htc|--partition[= ]htc)"} + }, + { + "text": "Requests a single A100", + "layer": "L1", + "check": {"transcript_matches": "(--gres=gpu:a100:1(?!\\d)|-G\\s+a100:1(?!\\d))"} + }, + { + "text": "Recommended header is accepted by the live scheduler", + "layer": "L3", + "check": {"l3_sbatch_test_only": "transcript", "expect": "accepted"} + } + ] + }, + { + "id": 9, + "name": "slurm-long-gpu-needs-public", + "prompt": "I need to fine-tune a 7B model on 4 A100s — it'll take about two days of wall-clock. Write me the sbatch header.", + "expected_output": "Agent routes the multi-day 4-A100 run to public (7-day) or general (-q private, preemptible), NOT htc — two days far exceeds htc's 4-hour wall. 
-G a100:4, -t ~2 days.", + "setup": { + "mock_hostname": "sc001.sol.rc.asu.edu", + "include_solx": false + }, + "assertions": [ + { + "text": "Routes the >4h run to public or general, not htc (htc's 4h wall can't hold a 2-day job)", + "layer": "L1", + "check": {"transcript_matches": "(-p\\s*(public|general)|--partition[= ](public|general))"} + }, + { + "text": "Sets a wall-time well beyond htc's 4h (day-dash like 2-00:00:00, or >=24h like 48:00:00)", + "layer": "L1", + "check": {"transcript_matches": "(-t\\s*[1-9]-|--time[= ]\\s*[1-9]-|[1-9]-[0-9]{2}:|(?4h job left on htc, or a bare -p general missing -q private)", + "layer": "L3", + "check": {"l3_sbatch_test_only": "transcript", "expect": "accepted"} + } + ] + }, + { + "id": 10, + "name": "slurm-smoke-test-debug-qos", + "prompt": "I just want to confirm my training script even launches on a GPU — a 10-minute test. It keeps sitting in the queue when I submit to public. How do I get it to start as fast as possible?", + "expected_output": "Agent suggests the debug QOS (-q debug) on public or general: a 15-minute hard cap but very high priority with GPUs allowed, so a 10-minute launch check starts almost immediately. 
Notes debug is NOT valid on htc (-p htc -q debug is rejected).", + "setup": { + "mock_hostname": "sc001.sol.rc.asu.edu", + "include_solx": false + }, + "assertions": [ + { + "text": "Surfaces the debug QOS as the fast-start path for a tiny job", + "layer": "L1", + "check": {"transcript_matches": "(-q\\s*debug|--qos[= ]debug)"} + }, + { + "text": "Names public or general as the debug-QOS partition (the L3 check below catches an invalid -p htc -q debug)", + "layer": "L1", + "check": {"transcript_matches": "(-p\\s*(public|general)|--partition[= ](public|general))"} + }, + { + "text": "Recommended header is accepted by the live scheduler (would catch the invalid -p htc -q debug combo)", + "layer": "L3", + "check": {"l3_sbatch_test_only": "transcript", "expect": "accepted"} + } + ] } ] } diff --git a/evals/mocks/README.md b/evals/mocks/README.md index d979791..474efe4 100644 --- a/evals/mocks/README.md +++ b/evals/mocks/README.md @@ -24,7 +24,7 @@ invocation to `$MOCK_LOG` (default `/tmp/sol-skill-mock-$$.log`): The fake `$HOME` (`evals/mocks/home/`) ships with: -- `.solkeep` — sanitized keep-list using `sparky` +- `.config/solx/config.toml` — sanitized config with a `[keep]` block using `sparky` - `scratch-dirs-pending-removal.csv` — synthetic Sol warning - `scratch-dirs-over-90days.csv` — synthetic Sol warning - `scratch-dirs-inactive.csv` — synthetic Sol warning diff --git a/evals/mocks/activate.sh b/evals/mocks/activate.sh index cfb8236..1fc792a 100644 --- a/evals/mocks/activate.sh +++ b/evals/mocks/activate.sh @@ -7,7 +7,7 @@ # After sourcing: # - hostname, module, srun, sbatch, scancel, squeue, ssh resolve to mocks # - $MOCK_LOG points at a per-session invocation log (every shim appends) -# - $HOME points at evals/mocks/home (fake .solkeep + Sol warning CSVs) +# - $HOME points at evals/mocks/home (solx config [keep] + Sol warning CSVs) # - $MOCK_HOSTNAME controls what `hostname` returns (default: Sol login) # # Toggle the side under test (export first; inline 
assignment with diff --git a/evals/mocks/home/.config/solx/config.toml b/evals/mocks/home/.config/solx/config.toml new file mode 100644 index 0000000..76cc77b --- /dev/null +++ b/evals/mocks/home/.config/solx/config.toml @@ -0,0 +1,22 @@ +# Sanitized solx config used by the mock environment. +# Replace `sparky` with your real ASURITE in your private evals/evals.json. + +default_shell = "bash" +default_template = "default" + +[jobs.default] +partition = "htc" +time = "0-04:00:00" + +# Directories `solx keep` renews when Sol flags them. Patterns are +# gitignore-style (bare path = that directory and everything under it, +# `**` recurses); `exclude` carves stale build artifacts back out. +[keep] +include = [ + "/scratch/sparky/my-project", + "/scratch/sparky/experiments", +] +exclude = [ + "**/__pycache__", + "**/.venv", +] diff --git a/evals/mocks/home/.solkeep b/evals/mocks/home/.solkeep deleted file mode 100644 index 7c1fe6c..0000000 --- a/evals/mocks/home/.solkeep +++ /dev/null @@ -1,10 +0,0 @@ -# Sanitized .solkeep used by the mock environment. -# Replace `sparky` with your real ASURITE in your private evals/evals.json. - -# Keep these trees (bare path = recursive) -/scratch/sparky/my-project -/scratch/sparky/experiments - -# Carve out stale build artifacts -!/scratch/sparky/my-project/**/__pycache__ -!/scratch/sparky/my-project/**/.venv/** diff --git a/evals/parity/README.md b/evals/parity/README.md deleted file mode 100644 index 83affe7..0000000 --- a/evals/parity/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# evals/parity/ — `solx` behavioral parity matrix - -A black-box regression harness for the `solx` CLI. 
It runs one `solx` -binary through **80 cases** covering the whole command surface — meta -(`--version`, help, unknown commands), `job list/start/stop/jump/time`, -`jump`, `keep`, `config`, `init`, `completions`, aliases, `--json` in -both positions, dispatch edge cases (`--` shielding, bundled shorts, -junk around `version`, `-h`), and error paths — and captures stdout, -stderr, and exit code per case. Comparing two captured runs byte-for-byte proves (or -disproves) that two `solx` builds behave identically, which is what -makes a dispatch-layer or runtime rewrite safe to ship. - -Each case runs in a **fresh fake `$HOME`** (plus `XDG_CONFIG_HOME`) with -**deterministic SLURM mocks** on `PATH`, under `env -i` with -`USER=sparky` and `TERM=dumb` — so the output is reproducible and -independent of the node, the real queue, or your real config. - -## Layout - -``` -evals/parity/ -├── bin/ # mock squeue / salloc / srun / scancel / hostname -│ # env toggles: MOCK_SQUEUE_EMPTY=1, MOCK_SQUEUE_FAIL=1, -│ # MOCK_SQUEUE_TWORUNNING=1 select canned squeue variants -├── fixtures/ # config.toml variants, ~/.solkeep variants, warning CSVs -├── run_matrix.sh # run the 80 cases against one solx binary -└── compare_runs.py # compare two captured runs (stdlib python3 only) -``` - -Captured runs (`golden-*/`, scratch output dirs) are **not committed** — -see below. - -## Capturing a golden - -A golden is the captured behavior of a reference `solx` version: - -```shell -cd evals/parity -./run_matrix.sh "$(command -v solx)" golden-v0.4.0 -``` - -Each case lands as `golden-v0.4.0/.{out,err,code}` with per-case -tempdir paths normalized to `__HOME__`. Goldens are -**environment-captured, not committed**: capture the reference version's -golden on the same machine (and Python) you'll run the candidate on, so -the diff isolates the code change rather than the environment. 
- -## Comparing a candidate - -```shell -./run_matrix.sh /path/to/candidate/solx out-candidate -./compare_runs.py golden-v0.4.0 out-candidate # add --json for machine output -``` - -`compare_runs.py` exits 0 when no strict case fails. Case classes: - -- **STRICT** (the default): exit code, stdout, and stderr must match - byte-for-byte. -- **RELAXED** (help/usage text and completion scripts): only the exit - code must match, and stdout is smoke-checked for key content — help - and completion text is allowed to differ across implementations. -- **VERSION_CASES** (`--version`, `version`): exit code must match and - stdout must be a bare semver; the value itself may differ. -- **EXPECTED_DIFF**: known deliberate divergences (e.g. the trailing - `--json` acceptance case, the `~/.solkeep` deprecation message's - version string). Reported, but never fail the run. - -The class membership lives at the top of `compare_runs.py`; when a -behavior change is intentional, move its case into `EXPECTED_DIFF` in -the same change that introduces it, with a comment saying why. - -## Requirements - -A POSIX shell + `bash` for `run_matrix.sh`, any `python3` for -`compare_runs.py` (stdlib only), and a runnable `solx` for each side of -the comparison. The mocks shadow the real SLURM tools via `PATH`, so the -harness is safe to run anywhere — it never talks to a real scheduler. diff --git a/evals/parity/compare_runs.py b/evals/parity/compare_runs.py deleted file mode 100755 index 5f7cc5f..0000000 --- a/evals/parity/compare_runs.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python3 -"""Compare two solx parity-matrix runs (golden vs candidate). - -Usage: compare_runs.py GOLDEN_DIR CANDIDATE_DIR [--json] - -Classes of cases: -* STRICT (default): exit code + stdout + stderr must match byte-for-byte. -* RELAXED: only the exit code must match (help/usage text is allowed to - differ across CLI frameworks); stdout is smoke-checked for key content. 
-* EXPECTED_DIFF: recorded and reported, but never fails the run (known, - deliberate divergences — e.g. the solkeep deprecation message's version - string, or `job list --json` becoming accepted). - -Exit 0 if no strict failures, 1 otherwise. Prints a human summary, or a -JSON document with --json. -""" -import json -import sys -from pathlib import Path - -RELAXED = { - "help-flag", "help-cmd", "no-args", "unknown-cmd", - "job-noargs", "job-badsub", - "completions-bash", "completions-zsh", "completions-fish", - "completions-tcsh", - # Dispatch edge cases: exit-code parity required; error wording may - # differ from Click's. - "js-dryrun-eq", "version-junk-arg", "version-junk-pre", - "version-junk-post", "keep-j-zero", "help-job-arg", -} -# Smoke content every RELAXED stdout must still contain (when exit 0). -RELAXED_SMOKE = { - "help-flag": ["init", "keep", "jump", "job", "config", "completions"], - "help-cmd": ["init", "keep", "jump", "job", "config", "completions"], - "completions-bash": ["solx"], - "completions-zsh": ["#compdef", "solx"], - "completions-fish": ["solx"], -} -EXPECTED_DIFF = { - "leaf-json-position", # v0.4.0 rejects trailing --json; later versions accept - "keep-fallback", # deprecation message names the removal version - # -h is a documented v0.5.0 superset: v0.4.0 exits 2, v0.5.0 prints - # help and exits 0. - "dash-h-root", - "dash-h-stop", -} -# Version output changes across versions by definition: exit code must match -# and stdout must look like a bare semver, but the value itself may differ. 
-VERSION_CASES = {"version-flag", "version-cmd"} -SEMVER = __import__("re").compile(r"^[0-9]+\.[0-9]+\.[0-9]+(-[A-Za-z0-9.]+)?\n$") - - -def read(d: Path, case: str, ext: str) -> str: - p = d / f"{case}.{ext}" - return p.read_text(errors="replace") if p.exists() else "" - - -def main() -> int: - golden, cand = Path(sys.argv[1]), Path(sys.argv[2]) - as_json = "--json" in sys.argv[3:] - cases = sorted(p.stem for p in golden.glob("*.code")) - missing = [c for c in cases if not (cand / f"{c}.code").exists()] - - results = [] - for c in cases: - g_code, c_code = read(golden, c, "code").strip(), read(cand, c, "code").strip() - g_out, c_out = read(golden, c, "out"), read(cand, c, "out") - g_err, c_err = read(golden, c, "err"), read(cand, c, "err") - fields = [] - if g_code != c_code: - fields.append(("code", g_code, c_code)) - if c in VERSION_CASES: - if not SEMVER.match(c_out): - fields.append(("stdout", g_out, c_out)) - elif c in RELAXED: - for needle in RELAXED_SMOKE.get(c, []): - if g_code == "0" == c_code and needle not in c_out: - fields.append(("smoke", needle, "absent")) - else: - if g_out != c_out: - fields.append(("stdout", g_out, c_out)) - if g_err != c_err: - fields.append(("stderr", g_err, c_err)) - status = "pass" - if fields: - status = "expected-diff" if c in EXPECTED_DIFF else "FAIL" - results.append({"case": c, "status": status, - "diffs": [{"field": f, "golden": g[:2000], "candidate": x[:2000]} - for f, g, x in fields]}) - - fails = [r for r in results if r["status"] == "FAIL"] - expected = [r for r in results if r["status"] == "expected-diff"] - summary = { - "total": len(cases), - "pass": sum(1 for r in results if r["status"] == "pass"), - "fail": len(fails), - "expected_diff": len(expected), - "missing_in_candidate": missing, - "failures": fails, - "expected_diffs": expected, - } - if as_json: - print(json.dumps(summary, indent=2)) - else: - print(f"parity: {summary['pass']}/{summary['total']} pass, " - f"{len(fails)} fail, {len(expected)} 
expected-diff, " - f"{len(missing)} missing") - for r in fails: - print(f"\nFAIL {r['case']}") - for d in r["diffs"]: - print(f" [{d['field']}]") - print(f" golden: {d['golden'][:400]!r}") - print(f" candidate: {d['candidate'][:400]!r}") - for r in expected: - print(f"\nexpected-diff {r['case']}: " - + ", ".join(d["field"] for d in r["diffs"])) - return 1 if fails or missing else 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/evals/parity/fixtures/config-nokeep.toml b/evals/parity/fixtures/config-nokeep.toml deleted file mode 100644 index c3fcf88..0000000 --- a/evals/parity/fixtures/config-nokeep.toml +++ /dev/null @@ -1,8 +0,0 @@ -default_shell = "zsh" -default_template = "default" -start_timeout = "5m" - -[jobs.default] -partition = "lightwork" -time = "1-0" -qos = "public" diff --git a/evals/parity/fixtures/config-sample.toml b/evals/parity/fixtures/config-sample.toml deleted file mode 100644 index 803ca09..0000000 --- a/evals/parity/fixtures/config-sample.toml +++ /dev/null @@ -1,22 +0,0 @@ -default_shell = "zsh" -default_template = "default" -start_timeout = "5m" - -[jobs.default] -partition = "lightwork" -time = "1-0" -qos = "public" - -[jobs.debug] -partition = "htc" -time = "0-1" - -[jobs.gpu] -partition = "public" -gres = "gpu:a100:1" -time = "0-4" -extra_args = ["--mem=64G", "--cpus-per-task=8"] - -[keep] -include = ["/scratch/sparky/proj-a", "/scratch/sparky/proj-b/**"] -exclude = ["**/__pycache__", "**/.venv"] diff --git a/evals/parity/fixtures/home/.solkeep b/evals/parity/fixtures/home/.solkeep deleted file mode 100644 index 9f3352a..0000000 --- a/evals/parity/fixtures/home/.solkeep +++ /dev/null @@ -1,4 +0,0 @@ -# keep these alive -/scratch/sparky/proj-a -!/scratch/sparky/proj-a/tmp -/scratch/sparky/proj-b/** diff --git a/evals/parity/fixtures/home/scratch-dirs-inactive.csv b/evals/parity/fixtures/home/scratch-dirs-inactive.csv deleted file mode 100644 index 71d15fb..0000000 --- a/evals/parity/fixtures/home/scratch-dirs-inactive.csv 
+++ /dev/null @@ -1,2 +0,0 @@ -User,Directory,Size -sparky,/scratch/sparky/proj-a,12G diff --git a/evals/parity/fixtures/home/scratch-dirs-over-90days.csv b/evals/parity/fixtures/home/scratch-dirs-over-90days.csv deleted file mode 100644 index 7ec50e6..0000000 --- a/evals/parity/fixtures/home/scratch-dirs-over-90days.csv +++ /dev/null @@ -1,2 +0,0 @@ -User,Directory,Size -sparky,/scratch/sparky/proj-b/data,40G diff --git a/evals/parity/fixtures/home/scratch-dirs-pending-removal.csv b/evals/parity/fixtures/home/scratch-dirs-pending-removal.csv deleted file mode 100644 index 21c70ea..0000000 --- a/evals/parity/fixtures/home/scratch-dirs-pending-removal.csv +++ /dev/null @@ -1,3 +0,0 @@ -User,Directory,Size -sparky,/scratch/sparky/proj-a,12G -sparky,/scratch/sparky/other,3G diff --git a/evals/parity/fixtures/solkeep-lossy b/evals/parity/fixtures/solkeep-lossy deleted file mode 100644 index c44586b..0000000 --- a/evals/parity/fixtures/solkeep-lossy +++ /dev/null @@ -1,3 +0,0 @@ -/scratch/sparky/proj-a -!/scratch/sparky/proj-a/tmp -/scratch/sparky/proj-a/tmp/keepme diff --git a/evals/parity/run_matrix.sh b/evals/parity/run_matrix.sh deleted file mode 100755 index 501cbf3..0000000 --- a/evals/parity/run_matrix.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env bash -# Run the solx behavioral parity matrix against one solx binary. -# -# run_matrix.sh /path/to/solx OUTDIR -# -# Each case runs in a fresh fake HOME (+XDG_CONFIG_HOME) with deterministic -# SLURM mocks on PATH, and captures stdout / stderr / exit code into -# OUTDIR/.{out,err,code}. Paths that embed the per-case tempdir are -# normalized to __HOME__ so two runs (or two solx implementations) diff clean. -set -u -SOLX="$1" -OUTDIR="$2" -PARITY="$(cd "$(dirname "$0")" && pwd)" -mkdir -p "$OUTDIR" - -run_case() { - # run_case NAME XDG_FIXTURE(sample|nokeep|empty) HOME_SOLKEEP(yes|no|lossy) [VAR=VAL ...] -- ARGS... 
- local name="$1" xdg_fix="$2" solkeep="$3"; shift 3 - local envs=() - while [ "$1" != "--" ]; do envs+=("$1"); shift; done - shift # drop -- - - local home; home="$(mktemp -d /tmp/solx-parity-case-XXXXXX)" - mkdir -p "$home/.config/solx" - cp "$PARITY"/fixtures/home/*.csv "$home/" 2>/dev/null - case "$solkeep" in - yes) cp "$PARITY/fixtures/home/.solkeep" "$home/.solkeep" ;; - lossy) cp "$PARITY/fixtures/solkeep-lossy" "$home/.solkeep" ;; - no) ;; - esac - case "$xdg_fix" in - sample) cp "$PARITY/fixtures/config-sample.toml" "$home/.config/solx/config.toml" ;; - nokeep) cp "$PARITY/fixtures/config-nokeep.toml" "$home/.config/solx/config.toml" ;; - empty) ;; - esac - - # Substitute the per-case home into args (for flags that take a path). - local args=() - local a - for a in "$@"; do args+=("${a//__HOMEDIR__/$home}"); done - - env -i \ - PATH="$PARITY/bin:/usr/bin:/bin" \ - HOME="$home" \ - XDG_CONFIG_HOME="$home/.config" \ - USER=sparky \ - LOGNAME=sparky \ - TERM=dumb \ - LC_ALL=C \ - "${envs[@]+"${envs[@]}"}" \ - "$SOLX" "${args[@]+"${args[@]}"}" >"$OUTDIR/$name.out" 2>"$OUTDIR/$name.err" - echo $? > "$OUTDIR/$name.code" - - # Normalize per-case tempdir paths so runs are comparable. - sed -i "s|$home|__HOME__|g" "$OUTDIR/$name.out" "$OUTDIR/$name.err" - rm -rf "$home" -} - -# Special: init-exists needs init run twice in the SAME home. -run_case_init_twice() { - local name="$1" - local home; home="$(mktemp -d /tmp/solx-parity-case-XXXXXX)" - mkdir -p "$home/.config" - env -i PATH="$PARITY/bin:/usr/bin:/bin" HOME="$home" XDG_CONFIG_HOME="$home/.config" \ - USER=sparky LOGNAME=sparky TERM=dumb LC_ALL=C \ - "$SOLX" init >/dev/null 2>&1 - env -i PATH="$PARITY/bin:/usr/bin:/bin" HOME="$home" XDG_CONFIG_HOME="$home/.config" \ - USER=sparky LOGNAME=sparky TERM=dumb LC_ALL=C \ - "$SOLX" init >"$OUTDIR/$name.out" 2>"$OUTDIR/$name.err" - echo $? 
> "$OUTDIR/$name.code" - sed -i "s|$home|__HOME__|g" "$OUTDIR/$name.out" "$OUTDIR/$name.err" - rm -rf "$home" -} - -# Special: import-solkeep-ok also snapshots the resulting config. -run_case_import_ok() { - local name="$1" - local home; home="$(mktemp -d /tmp/solx-parity-case-XXXXXX)" - mkdir -p "$home/.config/solx" - cp "$PARITY/fixtures/config-nokeep.toml" "$home/.config/solx/config.toml" - cp "$PARITY/fixtures/home/.solkeep" "$home/.solkeep" - env -i PATH="$PARITY/bin:/usr/bin:/bin" HOME="$home" XDG_CONFIG_HOME="$home/.config" \ - USER=sparky LOGNAME=sparky TERM=dumb LC_ALL=C \ - "$SOLX" config import-solkeep >"$OUTDIR/$name.out" 2>"$OUTDIR/$name.err" - echo $? > "$OUTDIR/$name.code" - cp "$home/.config/solx/config.toml" "$OUTDIR/$name.config-after" - sed -i "s|$home|__HOME__|g" "$OUTDIR/$name.out" "$OUTDIR/$name.err" "$OUTDIR/$name.config-after" - rm -rf "$home" -} - -# ---- top level / meta ------------------------------------------------------ -run_case version-flag sample yes -- --version -run_case version-cmd sample yes -- version -run_case help-flag sample yes -- --help -run_case help-cmd sample yes -- help -run_case no-args sample yes -- -run_case unknown-cmd sample yes -- frobnicate -run_case job-noargs sample yes -- job -run_case job-badsub sample yes -- job frobnicate - -# ---- job list -------------------------------------------------------------- -run_case job-list-json sample yes -- --json job list -run_case job-list-piped sample yes -- job list -run_case jobs-alias sample yes -- --json jobs list -run_case job-ls-alias sample yes -- --json job ls -run_case job-list-empty sample yes MOCK_SQUEUE_EMPTY=1 -- --json job list -run_case job-list-fail sample yes MOCK_SQUEUE_FAIL=1 -- job list - -# ---- job time -------------------------------------------------------------- -run_case job-time-inside sample yes SLURM_JOB_ID=54800001 -- --json job time -run_case job-time-arg sample yes -- --json job time 12345 -run_case job-time-mostrecent sample yes -- 
--json job time -run_case job-time-empty sample yes MOCK_SQUEUE_EMPTY=1 -- --json job time - -# ---- job stop --------------------------------------------------------------- -run_case job-stop-ambig sample yes -- --json job stop -run_case job-stop-dryrun sample yes -- --json job stop 12345 -n -run_case job-stop-yes sample yes -- --json job stop 12345 -y -run_case job-stop-force sample yes -- --json job stop 12345 --force -run_case job-stop-yn sample yes -- job stop 12345 -y -n -run_case job-stop-noninter sample yes -- job stop 12345 -run_case job-stop-self sample yes SLURM_JOB_ID=12345 -- --json job stop 12345 -n - -# ---- job start -------------------------------------------------------------- -run_case job-start-dry sample yes -- --json job start -n -run_case job-start-dry-tmpl sample yes -- --json job start gpu -n -run_case job-start-dry-dashdash sample yes -- --json job start gpu -n -- --mem=128G -run_case job-start-dry-mixed sample yes -- --json job start gpu -n --mem=128G -c 8 -run_case job-start-dry-dd-notmpl sample yes -- --json job start -n -- --mem=128G -run_case job-start-real sample yes -- --json job start -run_case job-start-badtimeout sample yes -- job start --timeout never -run_case job-start-unknown-tmpl sample yes -- --json job start nosuch -n -run_case job-start-timeout-dry sample yes -- --json job start --timeout 30s -n - -# ---- jump --------------------------------------------------------------- -run_case jump-arg sample yes -- --json jump 12345 -q -run_case jump-noarg sample yes -- jump -run_case jump-inside sample yes SLURM_JOB_ID=999 -- jump -run_case jump-mostrecent sample yes MOCK_SQUEUE_TWORUNNING=1 -- jump -run_case job-jump-arg sample yes -- --json job jump 12345 -q - -# ---- keep --------------------------------------------------------------- -run_case keep-dry sample yes -- --json keep -n -run_case keep-dry-stage sample yes -- --json keep -n --stage pending -run_case keep-dry-over90 sample yes -- --json keep -n --stage over90 
-run_case keep-dry-verbose sample yes -- keep -n -v -run_case keep-invalid-stage sample yes -- keep --stage bogus -run_case keep-yes sample yes -- --json keep -y -j 1 -run_case keep-solkeep-flag sample yes -- --json keep -n --solkeep __HOMEDIR__/.solkeep -run_case keep-fallback empty yes -- keep -n -run_case keep-nothing empty no -- keep -n - -# ---- config --------------------------------------------------------------- -run_case config-show sample yes -- config show -run_case config-show-json sample yes -- config show --json -run_case config-show-rootjson sample yes -- --json config show -run_case config-edit-ok sample yes EDITOR=true -- config edit -run_case config-edit-flags sample yes EDITOR="/bin/echo -n" -- config edit -run_case config-edit-noconfig empty yes EDITOR=true -- config edit -run_case_import_ok import-solkeep-ok -run_case import-solkeep-exists sample yes -- config import-solkeep -run_case import-solkeep-noconfig empty yes -- config import-solkeep -run_case import-solkeep-lossy nokeep lossy -- config import-solkeep -run_case import-solkeep-lossy-f nokeep lossy -- --json config import-solkeep -f - -# ---- init --------------------------------------------------------------- -run_case init-fresh empty yes -- --json init -run_case_init_twice init-exists -run_case init-force sample yes -- --json init -f - -# ---- completions ----------------------------------------------------------- -run_case completions-bash sample yes -- completions bash -run_case completions-zsh sample yes -- completions zsh -run_case completions-fish sample yes -- completions fish -run_case completions-tcsh sample yes -- completions tcsh - -# ---- dispatch edge cases ----------------------------------------------------- -# `--` shielding: tokens after `--` pass through to sbatch verbatim. 
-run_case js-dd-shield-n sample yes -- --json job start gpu -- -n -run_case js-dd-shield-n4 sample yes -- --json job start gpu -- -n 4 -run_case js-dd-shield-timeout sample yes -- --json job start -- --timeout 30s -run_case js-dd-dd sample yes -- --json job start gpu -n -- --mem=1G -- -c 2 -run_case js-bundled-shorts sample yes -- --json job start -nn -run_case js-dryrun-eq sample yes -- job start --dry-run=true -run_case version-junk-arg sample yes -- version bogus -run_case version-junk-pre sample yes -- --bogus --version -run_case version-junk-post sample yes -- --version --bogus -run_case keep-j-zero sample yes -- keep -n -j 0 -run_case help-job-arg sample yes -- help job -run_case dash-h-root sample yes -- -h -run_case dash-h-stop sample yes -- job stop 12345 -h - -# ---- known divergence probes (documented, not strict) ----------------------- -run_case leaf-json-position sample yes -- job list --json - -echo "matrix complete: $(ls "$OUTDIR" | grep -c '\.code$') cases -> $OUTDIR" diff --git a/evals/runner/README.md b/evals/runner/README.md index fb1a384..c575ab2 100644 --- a/evals/runner/README.md +++ b/evals/runner/README.md @@ -15,19 +15,19 @@ documented in [`../../DEVELOPMENT.md`](../../DEVELOPMENT.md). and how. - **`bench_solx_latency.sh`** — L3 latency benchmark (real Sol, read-only): times `solx job` commands against the equivalent raw - SLURM call and reports the delta. Quantifies `solx`'s Python/NFS - startup tax that informs the skill's "`solx` vs raw SLURM" rule and the - startup-latency roadmap item. Usage: `evals/runner/bench_solx_latency.sh [N]`. -- **L2 renewal coverage lives in the `solx` package.** - `solx/tests/test_keep.py::test_keep_end_to_end_real_touch` builds a - real tree with stale mtimes (including `.venv`/`__pycache__`), runs - `solx keep`, and asserts the filesystem mutations: kept files - (recursively) are refreshed, carve-outs are left alone, non-kept dirs - are skipped. 
It is the L2 grader for the `scratch-renewal-*` evals - (`check.l2_script`). Run standalone or in CI: + SLURM call and reports the delta. Quantifies the residual over raw + SLURM that informs the skill's "`solx` vs raw SLURM" rule. Usage: + `evals/runner/bench_solx_latency.sh [N]`. +- **L2 renewal coverage lives in the `solx` crate.** The end-to-end + real-touch test in `solx/tests/cli.rs` builds a real tree with stale + mtimes (including `.venv`/`__pycache__`), runs `solx keep`, and asserts + the filesystem mutations: kept files (recursively) are refreshed, + carve-outs are left alone, non-kept dirs are skipped. It is the L2 + grader for the `scratch-renewal-*` evals (`check.l2_script`). Run + standalone or in CI: ```shell - ( cd solx && uv run pytest tests/test_keep.py -q ) + ( cd solx && cargo test --test cli ) ``` ## What the runner will do (iteration 1) @@ -36,8 +36,8 @@ documented in [`../../DEVELOPMENT.md`](../../DEVELOPMENT.md). `layer` and `check` extensions). - For each eval: - Apply the `setup` block: write requested mock state - (`MOCK_HOSTNAME`, `solx`-shim presence, fake CSVs, fake - `.solkeep`). + (`MOCK_HOSTNAME`, `solx`-shim presence, fake CSVs, a `[keep]` + config). - Spawn the with-skill subagent (`--plugin-dir skills/sol-skill`) and the baseline subagent (no plugin-dir), both inheriting `CLAUDE_CONFIG_DIR` from the parent so neither diff --git a/evals/runner/bench_solx_latency.sh b/evals/runner/bench_solx_latency.sh index aeffa3d..da0c915 100755 --- a/evals/runner/bench_solx_latency.sh +++ b/evals/runner/bench_solx_latency.sh @@ -1,9 +1,9 @@ #!/usr/bin/env bash # Benchmark solx's per-command latency against the equivalent raw SLURM -# command, on Sol. solx wraps squeue/salloc/srun in Python; each invocation -# pays interpreter start plus the command body's imports, which a raw SLURM -# binary does not. This quantifies that cost so the skill's "solx vs raw -# SLURM" guidance is grounded in real numbers rather than a guess. +# command, on Sol. 
solx is a native binary that wraps squeue/salloc/srun; +# each invocation is one exec plus the squeue subprocess it spawns. This +# quantifies the residual over raw SLURM so the skill's "solx vs raw SLURM" +# guidance is grounded in real numbers rather than a guess. # # This is an L3 (real-Sol) measurement — the numbers only mean anything on a # Sol login/compute node, where the NFS home and a live Slurm controller are @@ -31,7 +31,7 @@ command -v squeue >/dev/null 2>&1 || { exit 2 } command -v solx >/dev/null 2>&1 || { - echo "solx not on PATH — install it first (curl … install.sh | sh)." >&2 + echo "solx not on PATH — install the binary first (see skills/sol-skill/SKILL.md)." >&2 exit 2 } @@ -80,9 +80,7 @@ echo "startup floor:" bench "solx --version" solx --version echo -echo "Takeaway: a raw SLURM read is ~0.08s; a warm solx 'job' read is ~0.13s with" -echo "the recommended single-file (.pyz) install — the same order, so either is" -echo "fine for one-off status reads. A venv install on the NFS home is slower" -echo "(~1s warm): if your numbers above look like that, switch to the .pyz" -echo "channel (curl ... install.sh | sh). See skills/sol-skill/SKILL.md" -echo "('solx vs raw SLURM')." +echo "Takeaway: a raw SLURM read is ~0.08s; a warm solx 'job' read is ~0.12s —" +echo "the same order, so either is fine for one-off status reads. The native" +echo "binary's startup is flat regardless of node load or NFS cache state. See" +echo "skills/sol-skill/SKILL.md ('solx vs raw SLURM')." diff --git a/scripts/build-cheatsheet.sh b/scripts/build-cheatsheet.sh new file mode 100755 index 0000000..236d999 --- /dev/null +++ b/scripts/build-cheatsheet.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Build the Sol cheatsheet PDF from the skill's markdown source. +# Requires pandoc + a LaTeX engine (xelatex/pdflatex/tectonic). On Sol, `tinytex` +# provides the engine (see SKILL.md "Getting the Software You Need"). +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +SRC="$ROOT/skills/sol-skill/references/cheatsheet.md" +OUT="$ROOT/docs/cheatsheet.pdf" + +command -v pandoc >/dev/null || { echo "error: pandoc not found"; exit 1; } +ENGINE="" +for e in xelatex pdflatex tectonic; do + command -v "$e" >/dev/null 2>&1 && ENGINE="$e" && break +done +[ -n "$ENGINE" ] || { echo "error: no LaTeX engine (xelatex/pdflatex/tectonic)"; exit 1; } + +# Strip the decorative emoji and map a few Unicode glyphs the default +# LaTeX fonts lack to ASCII, so the build is clean and CI-portable. +# (The markdown source keeps the nicer glyphs for terminal/GitHub.) +TMP="$(mktemp --suffix=.md)" +trap 'rm -f "$TMP"' EXIT +sed -e 's/🌵 *//g' \ + -e 's/≤/<=/g' -e 's/≥/>=/g' \ + -e 's/↔/<->/g' -e 's/→/->/g' \ + "$SRC" > "$TMP" + +pandoc "$TMP" -o "$OUT" \ + --pdf-engine="$ENGINE" \ + -V geometry:margin=0.6in -V fontsize=10pt -V colorlinks=true \ + --metadata title="Sol Cheatsheet" + +echo "wrote $OUT ($(du -h "$OUT" | cut -f1), engine: $ENGINE)" diff --git a/skills/sol-skill/SKILL.md b/skills/sol-skill/SKILL.md index 2dfe93d..765133f 100644 --- a/skills/sol-skill/SKILL.md +++ b/skills/sol-skill/SKILL.md @@ -1,6 +1,6 @@ --- name: sol-skill -version: 0.5.1 +version: 1.0.0 description: Conventions and tooling for ASU's Sol supercomputer, built around the `solx` CLI. Use whenever a task is happening on Sol — the user mentions Sol or ASU Research Computing, or is clearly on their Sol account (a Sol /scratch path, an sbatch/interactive job, a login/compute node). 
It covers renewing /scratch files Sol has flagged for deletion (purge/inactivity warnings) via `solx keep` and where to store datasets and model caches; requesting and managing SLURM jobs (the `solx job` interactive-allocation lifecycle, sbatch for batch, GPU and partition/QOS choice, why a job is pending, fairshare-aware and time-aware job management); installing software without sudo (module load, uv for Python, tinytex for LaTeX); reaching a Sol compute-node service like Jupyter from a laptop browser; detecting login-vs-compute nodes and choosing where to run heavy I/O (the DTN, a compute node, or a batch job); and transferring data to and from Sol. Not for generic SLURM/HPC on other clusters (Phoenix, NERSC, …), cloud GPUs, or purely local-laptop tasks (local virtualenvs, local LaTeX, local file/timestamp cleanup). license: MIT --- @@ -80,12 +80,13 @@ the user's day-to-day tool for templated interactive allocations (`job start` / `job jump`) and scratch renewal (`keep`) — install it when the user is doing that kind of work. -**`solx` is fast enough to be the default.** A warm `solx job` read -costs ~0.13s on Sol with the recommended single-file install, vs ~0.08s -for a raw `squeue` (measured — `evals/runner/bench_solx_latency.sh`), so -one-off reads carry no meaningful `solx` penalty. Raw SLURM stays a full -equivalent (see "`solx` vs raw SLURM" below) — it's the fallback when -`solx` isn't installed, not a faster path to prefer. +**`solx` is fast enough to be the default.** It's a single native binary +(Rust), so a warm `solx job` read costs ~0.12s on Sol vs ~0.08s for a raw +`squeue` (measured — `evals/runner/bench_solx_latency.sh`), and startup is +flat regardless of node load or NFS cache state. One-off reads carry no +meaningful `solx` penalty. Raw SLURM stays a full equivalent (see "`solx` +vs raw SLURM" below) — it's the fallback when `solx` isn't installed, not a +faster path to prefer. 
**Detect, then install when the task needs it.** Once you've confirmed you're on Sol (see [Detecting the Environment](#detecting-the-environment)): @@ -95,24 +96,22 @@ command -v solx # missing? install it when the user needs job start/jump ``` If it's absent and the task calls for it, **prompt the user to install -it** (then run `solx init`): +it** (then run `solx init`). `solx` is one static binary — no Python, no +`uv`, no toolchain — so installing is a download and a `chmod`: ```shell -# Recommended on Sol: single-file install, fast cold start on the NFS home. -curl -fsSL https://github.com/Shu-Wan/solx/releases/latest/download/install.sh | sh - -# Alternative: as a uv tool (isolated venv on $PATH). -uv tool install git+https://github.com/Shu-Wan/solx.git#subdirectory=solx +mkdir -p ~/.local/bin +curl -fLo ~/.local/bin/solx https://github.com/Shu-Wan/solx/releases/latest/download/solx-x86_64-unknown-linux-musl +chmod +x ~/.local/bin/solx solx --version solx init # writes ~/.config/solx/config.toml (offers a quick walkthrough) ``` -Both paths use [`uv`](https://docs.astral.sh/uv/) to provision a modern -Python (Sol's system `python3` is too old); if `uv` isn't on `PATH`, -install it first. Installing reaches the network and writes to -`~/.local/bin` — propose the command and get the user's go-ahead (or run -it with their OK) rather than installing silently. +The binary is fully static (musl), so it runs on Sol's RHEL 8 as-is. +Installing reaches the network and writes to `~/.local/bin` — propose the +command and get the user's go-ahead (or run it with their OK) rather than +installing silently. Make sure `~/.local/bin` is on `$PATH`. **If the user declines or can't install `solx`,** nothing is lost for the common cases — raw Slurm covers them: `squeue`/`scancel` for status and @@ -131,7 +130,7 @@ version: | `solx job jump` | Drop a shell onto the job's compute node (`srun --pty`). | | `solx job list` · `time` · `stop` | List · time-left · cancel. 
Raw `squeue`/`scancel` are equivalent (see below). | | `solx keep` | Renew `/scratch` files Sol flagged, filtered by `[keep]`. | -| `solx config import-solkeep` | Migrate a legacy `~/.solkeep` into `[keep]`. | +| `solx cheatsheet` | Print the Sol quick reference (`references/cheatsheet.md`) as text. | `--json` forces JSON — before the subcommand (`solx --json job list`) or after it (`solx job list --json`; exception: after `job start`, tokens @@ -141,12 +140,11 @@ prompt unless `-y`, refuse in a non-interactive session rather than hang, and preview with `-n`. **`solx` vs raw SLURM — equivalent for one-off reads; use either.** A -warm `solx job` read runs in ~0.13s with the single-file install, vs -~0.08s for raw `squeue` (measured — see -`evals/runner/bench_solx_latency.sh`); a venv install on the NFS home is -slower (~1s warm), which is one more reason to prefer the single-file -channel. The raw equivalents, for when `solx` isn't installed or the -user asks for them: +warm `solx job` read runs in ~0.12s, vs ~0.08s for raw `squeue` (measured +— see `evals/runner/bench_solx_latency.sh`); the residual over `squeue` is +just the `squeue` subprocess `solx` spawns, and the native binary's +startup doesn't degrade under node load or a cold NFS cache. The raw +equivalents, for when `solx` isn't installed or the user asks for them: ```shell squeue --me # = solx job list @@ -190,157 +188,31 @@ than necessary, and SLURM gives you cleaner answers. 2. You do not have `sudo` privileges, so maintain a local environment under `/home/$USER/.local` or `/home/$USER/opt`. 3. Use `git` to keep code in sync between local and cluster. 
-## Filesystem and Storage - -Sol provides two main storage areas: - -| Location | Purpose | Policy | -|------------------|------------------------------|---------------------------------| -| `/home/$USER` | Config, small files | Limited space, backed up | -| `/scratch/$USER` | Large data, caches, outputs | Layered deletion — see Sol docs | - -Always place large data files, model caches, and outputs under -`/scratch/$USER`. - -### Renewing the Scratch Timestamp — `solx keep` - -Sol deletes inactive `/scratch` files on a layered schedule and writes -per-stage CSV warnings into `$HOME`. ASU Research Computing defines the -thresholds, CSV filenames, and warning cadence; their doc is -authoritative: . - -**Use `solx keep`.** It reads those CSVs, keeps only the directories -that match your **keep-list**, and refreshes their timestamps with -`touch`. It only ever touches directories that are **both** flagged by -Sol **and** in your keep-list — so there's nothing to do until Sol -actually flags something, and it never walks `/scratch` wholesale. That -bound is the whole point: it's a tool to extend the life of files you -still use, not to defeat Sol's retention policy. - -**Where the keep-list lives:** the `[keep]` block in -`~/.config/solx/config.toml` (`include` / `exclude`, gitignore-style -globs). Set it up once with `solx config edit`: - -```toml -# Replace `sparky` with your ASURITE. Patterns are gitignore-style; ** = any depth. -[keep] -include = ["/scratch/sparky/my-project", "/scratch/sparky/experiments/**"] -# Don't spend the renewal on regenerable junk — it rebuilds for free. -exclude = ["**/.venv", "**/.git", "**/__pycache__", "**/node_modules"] -``` - -**Preview before the real pass.** `solx keep` rewrites timestamps on -every kept file — potentially hundreds of thousands. Never fire it -blind: run `--dry-run` first and check the plan, *or* get the user's -go-ahead on the scope. It also prompts (`… ? 
[y/N]`) before touching -unless you pass `-y`; in a non-interactive session it refuses rather -than hang. - -```shell -solx keep --dry-run -v # preview which directories would be renewed -solx keep # renew them (prompts; -y to skip the prompt) -solx keep --stage pending # only the most-urgent CSV -solx --json keep --dry-run # machine-readable plan (counts + a capped sample) -``` - -#### Where to run it - -A renewal is metadata-heavy I/O, not compute — but a touch pass over -tens of thousands of files is exactly the load Sol's **login nodes -throttle**. Check the environment first (see [Detecting the -Environment](#detecting-the-environment)), then branch: - -- **On a compute node** (`$SLURM_JOB_ID` set) — run it directly; you - already hold dedicated resources. -- **On a login node** (`$SLURM_JOB_ID` unset) — don't run the heavy - pass here. Move it to one of, in rough order of convenience: - - the **DTN**: `ssh soldtn ''` (the `dtn` wrapper is literally - `ssh soldtn`). It's tuned for I/O, isn't throttled, and has many - cores — the best home for a large renewal. - - a **compute node**: grab one with `solx job start` (or - `interactive`) and run it there. - - a **batch job**: submit a short `htc` job whose payload is the - renewal, for an unattended pass. +## Submitting Jobs -Match `-j` (parallel workers) to where it actually runs: a 4-core -compute node can't feed more than a couple, while the DTN has many. See -[references/scratch.md](references/scratch.md) for the non-interactive -`PATH` gotcha when invoking over `ssh soldtn`. +Sol uses **Slurm**. Interactive allocations go through `solx`; batch +work goes through `sbatch`. `solx` deliberately doesn't wrap `sbatch` — +for batch, drive Sol's tooling directly. -#### Migrating a legacy `~/.solkeep` +### Know your access first -The older standalone `sol_renew.py` script and the `~/.solkeep` -keep-list it read are **deprecated**. 
`solx keep` still reads a -`~/.solkeep` if it finds one (so nothing breaks today), but it prints a -deprecation notice and **support is removed in solx 1.0.0**. If you see -a `~/.solkeep`, migrate it into the config once: +Partition and QOS advice is only correct *for this user* — what they can +run on depends on their account and group. Before recommending where a +job goes, check what's actually available to them: ```shell -solx config import-solkeep # folds ~/.solkeep into the [keep] block -solx config show # sanity-check the result +sacctmgr -n show assoc user=$USER format=Account,Partition,QOS +# → e.g. grp_yourpi || debug,htc,private,public +sshare -U -o Account,User,FairShare # low fairshare → prefer a buy-in/preemptible QOS ``` -After migrating, `solx keep` uses `[keep]` and the warning goes away. - -### Sharing Files - -See [references/sharing.md](references/sharing.md) for the -step-by-step procedure to share files with other users on the -cluster. - -## Getting the Software You Need on Sol - -Situation: you need a tool — a compiler, a Python interpreter, an R -package, a LaTeX distribution, a CLI — and the system `PATH` on Sol -either doesn't have it or has too old a version. You don't have -`sudo`. There are four non-sudo paths; pick the one that matches the -kind of software: - -1. **Already on the cluster as a module.** Compilers, MPI stacks, - Python distributions, R, CUDA, common applications — all live - under the `module` system. No modules are loaded when a session - starts, so `module load` them every session (or in every SBATCH - script). See [references/module.md](references/module.md) for - `avail` / `load` / `list` / `purge` and the naming schemes. - -2. **Python — use `uv`.** The system `python3` on Sol is older than - modern code expects. Don't fight it; use - [`uv`](https://docs.astral.sh/uv/) to manage interpreters and - environments instead. (It's also what installs `solx`.) 
- - - Point `uv`'s cache at `/scratch` so it doesn't fill `/home`: - - ```shell - export UV_CACHE_DIR=/scratch/$USER/.cache/uv - ``` - - For one-file utility scripts, prefer the PEP 723 inline-metadata - shebang `#!/usr/bin/env -S uv run --script` so the script - self-bootstraps its interpreter and dependencies. - -3. **LaTeX — use R's `tinytex`.** Builds a per-user TeX Live tree - under `~/.local/bin/latex`, no sudo: - - 1. `module avail r-4` to find a current R, then `module load` it. - 2. Use the R package `tinytex` to install TeX Live locally. - 3. Install TeX packages on demand: `tlmgr install `. - 4. If `tlmgr` complains "is older than remote repository", refresh - the local TeX Live: load R, then - `Rscript -e "tinytex::reinstall_tinytex(repository='illinois')"`. - -4. **Anything else — install to `~/.local` or `~/opt`.** No `sudo` - on Sol, so anything you build or download from source goes under - your home directory. `~/.local/bin` should be on `PATH` by - default; add it in `~/.bashrc` / `~/.zshrc` if not. - -Across all four: never propose `sudo`. If a tool genuinely requires -root, file a ticket with ASU Research Computing rather than working -around it. - -## Submitting Jobs - -Sol uses **Slurm**. Interactive allocations go through `solx`; batch -work goes through `sbatch`. `solx` deliberately doesn't wrap `sbatch` — -for batch, drive Sol's tooling directly. +The QOS column is the menu. Most users have `public` (default, +non-preemptable) and `debug` (15-min, high-priority); a `private` or +`grp_*` entry means the user's group owns nodes they can run on *longer +than htc's 4 hours* (and preemptibly, for `private`). Tailor the +partition/QOS choice below to that list — don't suggest a QOS the user +can't use. The full partition × QOS table is in the cheat sheet, +[references/cheatsheet.md](references/cheatsheet.md). 
### Interactive allocations — `solx job` @@ -369,19 +241,39 @@ each `[jobs.]` sets `partition`, `time`, optional `qos`, `gres`, **The `interactive` wrapper** (the no-`solx` fallback) already defaults to `-p htc -q public -c 1 -t 0-4`. Bare `interactive` gets you a 4-hour `htc` shell — the right shape for most debug or "just need to check -something on a compute node" sessions. Override only when the workload -genuinely needs more (e.g., `interactive -p public -G a100:1` for a GPU -shell). - -**Match the partition to the workload size, not the request size.** -Sol's `htc` partition is the right home for short, lightweight, -debug-class work. Use `public` for real workloads that genuinely need -the larger nodes. If the user describes the work as "quick", "debug", -"lightweight", "just need to check", or specifies under an hour with no -GPU — that's an `htc` request (a sufficient trigger, not a wall-time -cap: `htc` still serves the `interactive` wrapper's 4-hour default). -Don't default to `public` in those cases: defaulting wastes capacity -that someone else is queued for. +something on a compute node" sessions, GPU work included: `htc` carries +A100s, so `interactive -p htc -G a100:1` gets you a quick GPU shell. +Override to `public` only when the run needs more than htc's 4-hour wall. + +**Match the partition to the job's wall-time and priority, not to +whether it uses a GPU.** GPUs live in `htc`, `public`, *and* `general`, +so "it needs an A100" says nothing about where the job goes — *how +long* and *how urgently* do: + +- **≤ 4 h, including GPU work → `htc`.** The default home for debug, + ablations, smoke-tests, and short training. `htc` carries Sol's + largest A100 pool (dozens of `a100:4` nodes, plus H100 / L40 / A30) + and is far less contended than `public`. Sol nudges you toward it — a + ≤4h job submitted to `public` prints `you may consider '-p htc'`, but + it doesn't move the job for you, so pass `-p htc` yourself. 
A 30-minute + A100 ablation is an `htc` job, not a `public` one. +- **≤ 15 min and you want to jump the queue → `-p public -q debug`** + (or `-p general -q debug`). The `debug` QOS has a 15-minute hard cap + but very high priority and allows GPUs — ideal for "does this even + launch?". It is **not** valid on `htc` (`-p htc -q debug` is rejected), + so pair it with `public`/`general` — and always *with* `-q debug`, since + bare `-p general` (default QOS `public`) is rejected too. One job at a time. +- **> 4 h, non-preemptable → `-p public`** (7-day wall) — real runs + that can't finish or checkpoint inside 4 hours. +- **> 4 h on borrowed private nodes, OK with preemption → `-p general + -q private`.** The `private` QOS has no wall of its own (gated by the + partition) and trades preemptibility — owners can cancel your job — + for running past htc's 4 hours; it often starts sooner. + +The trap is the **"GPU → public" reflex**: sending a short GPU job to +`public` parks it behind multi-day jobs while hundreds of htc A100s sit +one partition over. Only the wall-time clock — or a node shape `htc` +lacks (GH200, Gaudi) — should push GPU work past `htc`. ### Batch jobs — `sbatch` @@ -473,8 +365,8 @@ them. `solx` owns the *interactive-allocation lifecycle*; these own | You want | Command | |---|---| | Your fairshare / scheduling priority | `myfairshare` | -| Your `/scratch` quota | `myquota` | -| Your jobs right now | `myjobs` (or `squeue --me`, `sq`) | +| Your `/scratch` quota | `beegfs-ctl --getquota --uid $USER` | +| Your jobs right now | `myjobs` (or `squeue --me`) | | Estimated start time of a pending job | `thisjob ` | | Efficiency of a finished job | `seff ` | | Free capacity / partitions | `sinfo`, `showparts` | @@ -494,17 +386,144 @@ SLURM command itself** — it's portable and stable — and reach for Sol's | User question | Native SLURM | Sol wrapper (when useful) | |---|---|---| -| What jobs do I have right now? 
| `squeue --me` | `myjobs` (priority/QOS/GPU columns), `sq` (sorted by priority), `summary` (state counts) | +| What jobs do I have right now? | `squeue --me` | `myjobs` (priority/QOS/GPU columns), `summary` (state counts). NB `sq` is the *whole-cluster* queue — filter with `sq -u $USER` | | Tell me about job N | `scontrol show job N` | `thisjob N` adds a `squeue` row + est. start; `showjob N` also runs `seff` if finished | | What's my historical job activity? | `sacct --user=$USER --starttime=YYYY-mm-dd` | `mysacct` (preset format) | | What accounts and QOS can I submit under? | `sacctmgr -s show user $USER format=User,DefaultAccount,Account,QOS` | `myaccounts` (same call, shorter to type) | | What's my fairshare / scheduling priority? | — | `myfairshare` | -| What's my scratch quota? | — | `myquota` | +| What's my scratch quota? | `beegfs-ctl --getquota --uid $USER` | — | | Why is my job stuck pending? | `squeue --me -t PD -O Reason` | `showlimited` (cluster-wide capacity holds by group/QOS) | | Which partitions have free capacity? | `sinfo` (or `sinfo --Format=...`) | `showparts` (color-coded availability) | | Which GPU nodes have free GPUs? | `scontrol show nodes` (parse `Gres` / `AllocTRES`) | `showgpus` (color-coded per-node) | | How efficient was a finished job? | `seff ` | (no wrapper) | +## Filesystem and Storage + +Sol provides two main storage areas: + +| Location | Purpose | Policy | +|------------------|------------------------------|---------------------------------| +| `/home/$USER` | Config, small files | Limited space, backed up | +| `/scratch/$USER` | Large data, caches, outputs | Layered deletion — see Sol docs | + +Always place large data files, model caches, and outputs under +`/scratch/$USER`. + +### Renewing the Scratch Timestamp — `solx keep` + +Sol deletes inactive `/scratch` files on a layered schedule and writes +per-stage CSV warnings into `$HOME`. 
ASU Research Computing defines the +thresholds, CSV filenames, and warning cadence; their doc is +authoritative: . + +**Use `solx keep`.** It reads those CSVs, keeps only the directories +that match your **keep-list**, and refreshes their timestamps with +`touch`. It only ever touches directories that are **both** flagged by +Sol **and** in your keep-list — so there's nothing to do until Sol +actually flags something, and it never walks `/scratch` wholesale. That +bound is the whole point: it's a tool to extend the life of files you +still use, not to defeat Sol's retention policy. + +**Where the keep-list lives:** the `[keep]` block in +`~/.config/solx/config.toml` — `include` / `exclude` gitignore-style +globs, set up once with `solx config edit`. Keep regenerable junk +(`.venv`, `__pycache__`, `node_modules`, …) out of `include` so a +renewal isn't spent on files that rebuild for free. `solx` owns the +mechanics — the config schema and a worked example are in +[references/solx.md](references/solx.md). + +**Preview before the real pass.** `solx keep` rewrites timestamps on +every kept file — potentially hundreds of thousands. Never fire it +blind: run `--dry-run` first and check the plan, *or* get the user's +go-ahead on the scope. It also prompts (`… ? [y/N]`) before touching +unless you pass `-y`; in a non-interactive session it refuses rather +than hang. + +```shell +solx keep --dry-run -v # preview which directories would be renewed +solx keep # renew them (prompts; -y to skip the prompt) +solx keep --stage pending # only the most-urgent CSV +solx --json keep --dry-run # machine-readable plan (counts + a capped sample) +``` + +#### Where to run it + +A renewal is metadata-heavy I/O, not compute — but a touch pass over +tens of thousands of files is exactly the load Sol's **login nodes +throttle**. 
Check the environment first (see [Detecting the +Environment](#detecting-the-environment)), then branch: + +- **On a compute node** (`$SLURM_JOB_ID` set) — run it directly; you + already hold dedicated resources. +- **On a login node** (`$SLURM_JOB_ID` unset) — don't run the heavy + pass here. Move it to one of, in rough order of convenience: + - the **DTN**: `ssh soldtn ''` (the `dtn` wrapper is literally + `ssh soldtn`). It's tuned for I/O, isn't throttled, and has many + cores — the best home for a large renewal. + - a **compute node**: grab one with `solx job start` (or + `interactive`) and run it there. + - a **batch job**: submit a short `htc` job whose payload is the + renewal, for an unattended pass. + +Match `-j` (parallel workers) to where it actually runs: a 4-core +compute node can't feed more than a couple, while the DTN has many. See +[references/scratch.md](references/scratch.md) for the non-interactive +`PATH` gotcha when invoking over `ssh soldtn`. + +### Sharing Files + +See [references/sharing.md](references/sharing.md) for the +step-by-step procedure to share files with other users on the +cluster. + +## Getting the Software You Need on Sol + +Situation: you need a tool — a compiler, a Python interpreter, an R +package, a LaTeX distribution, a CLI — and the system `PATH` on Sol +either doesn't have it or has too old a version. You don't have +`sudo`. There are four non-sudo paths; pick the one that matches the +kind of software: + +1. **Already on the cluster as a module.** Compilers, MPI stacks, + Python distributions, R, CUDA, common applications — all live + under the `module` system. No modules are loaded when a session + starts, so `module load` them every session (or in every SBATCH + script). See [references/module.md](references/module.md) for + `avail` / `load` / `list` / `purge` and the naming schemes. + +2. **Python — use `uv`.** The system `python3` on Sol is older than + modern code expects. 
Don't fight it; use + [`uv`](https://docs.astral.sh/uv/) to manage interpreters and + environments instead. + + - Point `uv`'s cache at `/scratch` so it doesn't fill `/home`: + + ```shell + export UV_CACHE_DIR=/scratch/$USER/.cache/uv + ``` + - For one-file utility scripts, prefer the PEP 723 inline-metadata + shebang `#!/usr/bin/env -S uv run --script` so the script + self-bootstraps its interpreter and dependencies. + +3. **LaTeX — use R's `tinytex`.** Builds a per-user TeX Live tree + under `~/.local/bin/latex`, no sudo: + + 1. `module avail r-4` to find a current R, then `module load` it. + 2. Use the R package `tinytex` to install TeX Live locally. + 3. Install TeX packages on demand: `tlmgr install `. + 4. If `tlmgr` complains "is older than remote repository", refresh + the local TeX Live: load R, then + `Rscript -e "tinytex::reinstall_tinytex(repository='illinois')"`. + +4. **Anything else — install to `~/.local` or `~/opt`.** No `sudo` + on Sol, so anything you build or download from source goes under + your home directory. `~/.local/bin` should be on `PATH` by + default; add it in `~/.bashrc` / `~/.zshrc` if not. + +Across all four: never propose `sudo`. If a tool genuinely requires +root, file a ticket with ASU Research Computing rather than working +around it. + ## Using a Service That Runs on Sol, From Your Laptop The canonical version of this situation: the user wants a Jupyter diff --git a/skills/sol-skill/references/cheatsheet.md b/skills/sol-skill/references/cheatsheet.md new file mode 100644 index 0000000..bb157e7 --- /dev/null +++ b/skills/sol-skill/references/cheatsheet.md @@ -0,0 +1,154 @@ +# 🌵 Sol Cheatsheet + +Quick reference for ASU's Sol supercomputer — SLURM basics, the `solx` +CLI and its raw-SLURM equivalents, partitions & QOS, Sol's own wrappers, +and getting at a compute-node service from your laptop. + +> A rendered PDF lives at [`docs/cheatsheet.pdf`](../../../docs/cheatsheet.pdf) +> (build it with `scripts/build-cheatsheet.sh`). 
In a terminal on Sol, run +> `solx cheatsheet` to print this page. + +--- + +## Know your access first + +What partitions, QOS, and group account *you* can use — the answer +drives every job decision below: + +```shell +sacctmgr -n show assoc user=$USER format=Account,Partition,QOS +# → e.g. grp_yourpi || debug,htc,private,public +sshare -U # your fairshare (lower = back off / use a buy-in QOS) +``` + +--- + +## Partitions — pick by wall-time, not by "is it a GPU job?" + +GPUs live in `htc`, `public`, **and** `general`. The deciding question is +*how long* and *how urgently*, not CPU-vs-GPU. + +| Partition | Wall limit | GPUs | Use it for | +|-------------|-----------:|----------------------------------------|------------| +| `htc` | **4 h** | large A100 pool + H100/L40/A30/H200 | the default for anything ≤4 h, **GPU included** — least contended | +| `public` | 7 days | A100 (+ A100-MIG, A30) | runs that need >4 h, non-preemptable | +| `general` | 14 days | A100/H100/H200/L40 | privately-owned nodes (via `-q private` or your `grp_*`) | +| `lightwork` | 1 day | a100.20gb | the `vscode` tunnel's home; light dev | +| `highmem` | 7 days | — | up to 2 TB RAM | + +## QOS — priority & preemption, and which partitions accept it + +| QOS | Wall cap | Notes | +|-----------|-----------------|-------| +| `public` | (partition's) | default, non-preemptable | +| `debug` | **15 min** | very high priority; GPUs OK; **`public`/`general` only — rejected on `htc`**; one job at a time | +| `private` | (partition's) | preemptible access to buy-in nodes — owners can cancel you; runs past htc's 4 h | +| `grp_*` | up to 30 days | your group's owned nodes (if you're in one) | +| `class` | 1 day | course users; GPU-minute caps | + +**Routing in one line:** ≤4 h (incl. GPU) → `htc` · ≤15 min & urgent → +`-p public -q debug` · >4 h → `-p public` · >4 h preemptible → `-p general +-q private`. Never `-p htc -q debug` (invalid). 
+ +--- + +## SLURM basics + +```shell +sbatch job.sh # submit a batch script +squeue --me # your jobs (alias: myjobs; bare `sq` = whole cluster) +scancel <jobid> # cancel +scontrol show job <jobid> # full detail / why pending +sbatch --test-only job.sh # validate partition/QOS/time/gres WITHOUT submitting +interactive # quick shell; defaults to -p htc -q public -c 1 -t 0-4 +``` + +`#SBATCH` header skeleton (time format is `D-HH:MM:SS`): + +```bash +#!/bin/bash +#SBATCH -p htc # partition (htc = ≤4h, has GPUs) +#SBATCH -q public # QOS +#SBATCH -t 0-04:00:00 # wall-time +#SBATCH -c 8 # cores +#SBATCH --gres=gpu:a100:1 # GPU(s) +#SBATCH --mem=64G +#SBATCH -o slurm.%j.out +``` + +> Start from Sol's templates, don't hand-roll headers: +> `/packages/public/sol-sbatch-templates/templates/`. + +--- + +## `solx` ↔ raw SLURM + +`solx` owns the interactive-allocation lifecycle; raw SLURM is the +equivalent fallback for one-off reads. + +| `solx` | raw SLURM equivalent | +|------------------------------|----------------------| +| `solx job start [TEMPLATE]` | `salloc` / `interactive` from a config template, *waits for the grant* | +| `solx job jump` | `srun --pty $SHELL` onto the compute node | +| `solx job list` | `squeue --me` | +| `solx job time` | `squeue -h -j "$SLURM_JOB_ID" -o %L` | +| `solx job stop` | `scancel <jobid>` | +| `solx keep` | renew the mtime on `/scratch` files Sol flagged (filtered by `[keep]`) | +| `solx job start gpu -- …` | anything after `--` is appended to `salloc` (last flag wins) | + +Config lives at `~/.config/solx/config.toml` (`solx config edit`). Add +`--json` for machine output; `-n` to preview; `-y` to skip prompts.
+ +--- + +## Sol's own `my*` / `show*` wrappers + +| You want | Command | +|----------|---------| +| Your fairshare / priority | `myfairshare` | +| Your `/scratch` quota | `beegfs-ctl --getquota --uid $USER` | +| Your jobs right now | `myjobs` (or `squeue --me`) | +| Estimated start of a pending job | `thisjob <jobid>` | +| Efficiency of a finished job | `seff <jobid>` | +| Free capacity / partitions | `sinfo`, `showparts` | +| Free GPUs per node | `showgpus` | + +--- + +## Reaching a compute-node service from your laptop + +```shell +# VS Code: from a Sol login node, register a tunnel (wraps srun on lightwork) +vscode # then open the tunnel named sol_$USER + +# Manual port-forward (e.g. Jupyter on $NODE:8888), run from your LAPTOP: +ssh -N -L 8888:localhost:8888 -J $USER@login.sol.rc.asu.edu $USER@$NODE +``` + +`$NODE` is the compute node your allocation landed on (`squeue --me` → +NODELIST). Bind services to `localhost`, never `0.0.0.0`, on shared nodes. + +--- + +## Storage & caches + +| Path | For | Lifetime | +|------|-----|----------| +| `/scratch/$USER` | datasets, model caches, run outputs | **purged after inactivity** — renew with `solx keep` | +| `/home/$USER` | code, configs, `~/.local` installs | persistent, small quota | + +Point heavyweight caches at `/scratch`, not `/home`: + +```shell +export HF_HOME=/scratch/$USER/.cache/huggingface +export UV_CACHE_DIR=/scratch/$USER/.cache/uv +``` + +--- + +## Heavy I/O — where to run it + +Login nodes are throttled. For a big metadata pass (e.g. touching +hundreds of thousands of files) use the **DTN** (`ssh soldtn`), a +**compute node** (`interactive`), or a short **`htc` batch job** — never +the login node. diff --git a/skills/sol-skill/references/scratch.md b/skills/sol-skill/references/scratch.md index 6a111de..ee7e098 100644 --- a/skills/sol-skill/references/scratch.md +++ b/skills/sol-skill/references/scratch.md @@ -29,8 +29,8 @@ run already refreshed but Sol hasn't dropped from the CSV yet; that's expected.
## What the keep-list matches `solx keep` renews a directory only when it is **both** flagged by Sol **and** -matched by your keep-list (`[keep]` in the config; a legacy `~/.solkeep` is read -as a deprecated fallback). Patterns are gitignore-style and match the +matched by your keep-list (the `[keep]` block in the config). Patterns are +gitignore-style and match the **directory paths** in the CSVs — so matching decides which *whole flagged directories* get touched, not individual files within them. A bare path matches that directory and everything under it; `**` matches any depth. Carve out @@ -53,12 +53,6 @@ not have on `PATH`, so prepend it: ssh soldtn 'export PATH=$HOME/.local/bin:$PATH; solx keep --stage inactive -j 24 -y' ``` -## Migrating off a legacy `~/.solkeep` - -The old `sol_renew.py` script (removed) and its `~/.solkeep` keep-list are -deprecated; `solx keep` still reads `~/.solkeep` (with a notice) until support -drops in **1.0.0**. Migrate once with `solx config import-solkeep`. - ## Emergency single-path touch (no `solx`) To refresh one path outside the CSV workflow — the same primitive `solx keep` diff --git a/skills/sol-skill/references/sessions.md b/skills/sol-skill/references/sessions.md index 1f291ee..54123b0 100644 --- a/skills/sol-skill/references/sessions.md +++ b/skills/sol-skill/references/sessions.md @@ -56,29 +56,34 @@ From the laptop, SSH to the login node: ssh $ME@$SOL ``` -On the login node, request an allocation. **Match the partition to -the workload size**, not the request size — Sol's `htc` partition is -the right home for short, lightweight, debug-class shells (anything -under ~1 hour, modest CPU/RAM, no GPU). Save `public` for real -work that genuinely needs the larger nodes. Picking `public` for a -30-minute debug shell wastes capacity that someone else is queued for. +On the login node, request an allocation. 
**Match the partition to the +job's wall-time**, not the request size — Sol's `htc` partition is the +right home for anything that fits its 4-hour wall (debug, smoke-tests, +short GPU runs). `htc` carries a large GPU pool (hundreds of A100s, +plus H100 / L40 / A30), so a short *GPU* shell belongs there too — not +just CPU work. Save `public` (7-day wall) for runs that genuinely need +more than 4 hours. Picking `public` for a 30-minute shell — GPU or not +— wastes capacity that someone else is queued for. ```shell -# Lightweight debug — short shell, no GPU, modest resources. -# Use this for "I just want to test a command" or "I need to inspect -# something on a compute node for a few minutes." +# Lightweight debug — short shell, modest resources. +# "I just want to test a command" / "inspect a compute node briefly." interactive -p htc -t 0-01:00 -c 4 --mem=16G -# General-purpose interactive shell — hours of CPU work -interactive -p public -t 0-04:00 -c 8 --mem=32G +# Short GPU shell — an A100 for a quick test/ablation; fits htc's 4h. +interactive -p htc -t 0-04:00 -c 8 --mem=64G -G a100:1 -# GPU -interactive -p public -t 0-04:00 -c 8 --mem=64G -G a100:1 +# Longer run that needs more than htc's 4-hour wall → public (7-day). +interactive -p public -t 1-00:00 -c 8 --mem=32G ``` If the user describes the work as "quick", "debug", "lightweight", -"just need to check", or specifies under an hour with no GPU — that's -an `htc` request. Don't default to `public` in those cases. +"just need to check", or names a wall-time at or under 4 hours — that's +an `htc` request, GPU or not. Don't default to `public` in those cases; +only a need for more than 4 hours (or a node shape htc lacks) does. For +a ≤15-minute test that needs to start *now*, the `debug` QOS (`-p public +-q debug`, very high priority, GPUs allowed — but not valid on `htc`) is +the fast lane. When the allocation lands, the prompt changes and you are now on a compute node. 
**Capture the node hostname** — you will need it from diff --git a/skills/sol-skill/references/slurm.md b/skills/sol-skill/references/slurm.md index 309b913..9dffc4d 100644 --- a/skills/sol-skill/references/slurm.md +++ b/skills/sol-skill/references/slurm.md @@ -272,7 +272,7 @@ doesn't (and shouldn't) wrap them. | `thisjob ` | Job info including the estimated start time. | | `seff ` | Slurm efficiency (CPU + memory used) for a finished job. | | `myfairshare` | Your real fairshare score. | -| `myquota` | Your `$SCRATCH` quota. | +| `beegfs-ctl --getquota --uid $USER` | Your `/scratch` (BeeGFS) quota — there is no `myquota` wrapper. | | `sinfo` / `showparts` | Cluster / partition capacity (`showparts` is color-coded). | | `showgpus` | Free GPUs per node (color-coded). | | `ns` | Command-line version of the cluster status page. | diff --git a/skills/sol-skill/references/solx.md b/skills/sol-skill/references/solx.md index c5ef32c..302fb29 100644 --- a/skills/sol-skill/references/solx.md +++ b/skills/sol-skill/references/solx.md @@ -11,11 +11,13 @@ Sol and reported on stdout (results) / stderr (diagnostics). ## Install + first run +`solx` is one static binary — no Python, no `uv`, no toolchain. 
Install +is a download and a `chmod`: + ```shell -# Recommended on Sol — single-file install (fast cold start on the NFS home): -curl -fsSL https://github.com/Shu-Wan/solx/releases/latest/download/install.sh | sh -# Alternative — as a uv tool (isolated venv on $PATH): -uv tool install git+https://github.com/Shu-Wan/solx.git#subdirectory=solx +mkdir -p ~/.local/bin +curl -fLo ~/.local/bin/solx https://github.com/Shu-Wan/solx/releases/latest/download/solx-x86_64-unknown-linux-musl +chmod +x ~/.local/bin/solx solx --version solx init # writes ~/.config/solx/config.toml (mode 0600) @@ -23,18 +25,19 @@ solx config edit # fill in templates + [keep] paths solx config show # sanity-check the resolved config ``` -Both paths use [`uv`](https://docs.astral.sh/uv/) to provision a Python -≥ 3.11 (Sol's system `python3` is too old). Installing reaches the -network and writes `~/.local/bin/solx` — propose it and get the user's -OK rather than installing silently. +The binary is fully static (musl), so it runs on Sol's RHEL 8 as-is. +Installing reaches the network and writes `~/.local/bin/solx` (make sure +that's on `$PATH`) — propose it and get the user's OK rather than +installing silently. ## When to use `solx` vs raw SLURM For one-off reads the two are equivalent — use either. A warm `solx job` -read runs in ~0.13s on Sol with the single-file install, vs ~0.08s for -raw `squeue` (measured — `evals/runner/bench_solx_latency.sh`); a venv -install on the NFS home is slower (~1s warm), so prefer the single-file -channel. The raw equivalents, for when `solx` isn't installed: +read runs in ~0.12s on Sol, vs ~0.08s for raw `squeue` (measured — +`evals/runner/bench_solx_latency.sh`); the residual over `squeue` is just +the `squeue` subprocess `solx` spawns, and the native binary's startup +doesn't degrade under node load or a cold NFS cache. 
The raw equivalents, +for when `solx` isn't installed: `squeue --me` (= `job list`), `squeue -h -j "$SLURM_JOB_ID" -o %L` (= `job time`), `scancel ` (= `job stop -y `). `solx` adds the most on the multi-step ops: `job start` (templated allocation that @@ -53,7 +56,6 @@ renewal). | `solx job time [JOBID]` | Print remaining wall-time (`D-HH:MM:SS`). | | `solx keep` | Renew `/scratch` files Sol flagged, filtered by `[keep]` (prompts unless `-y`). | | `solx config show` / `edit` | Show / edit the config. | -| `solx config import-solkeep` | Migrate a legacy `~/.solkeep` into the `[keep]` block. | | `solx completions ` | Print a shell-completion script. | | `solx version` / `--version`, `solx help` / `--help` | Version / help. | @@ -96,7 +98,7 @@ partition = "htc" # the fast queue — good for quick tests time = "0-1" [jobs.gpu] -partition = "public" +partition = "htc" # htc carries A100s; a 4h GPU run fits its wall gres = "gpu:a100:1" time = "0-4" extra_args = ["--mem=64G", "--cpus-per-task=8"] @@ -111,7 +113,8 @@ exclude = ["**/.venv", "**/.git", "**/__pycache__", "**/node_modules"] `default_shell`, `default_template`, and ≥1 `[jobs.]` are required; `qos`/`gres`/`extra_args`/`[keep]` are optional. Anything after `--` on `solx job start` is appended to `salloc` (last flag wins): -`solx job start gpu -- --mem=128G --time=8:00:00`. +`solx job start gpu -- -p public --time=8:00:00` jumps to `public` when a +run needs more than htc's 4-hour wall. ## Leaving out the job id (verb-aware resolution) @@ -174,13 +177,8 @@ keep-list, and `touch`es them. It only ever touches directories that are **both** flagged by Sol **and** in your keep-list — nothing to do until Sol flags something, and it never walks `/scratch` wholesale. -Keep-list source, in precedence order: - -1. `--solkeep ` — a specific gitignore-style keep-list, if passed. -2. the `[keep]` block in the config (`include` / `exclude`). **Preferred.** -3. `~/.solkeep` — the **deprecated** legacy keep-list. 
Still read if - present (so existing setups keep working), but `solx keep` prints a - deprecation notice and **support is removed in solx 1.0.0**. +The keep-list is the `[keep]` block in the config (`include` / `exclude`), +matched gitignore-style. It's the only keep-list source. ```shell solx keep --dry-run -v # preview which directories would be renewed @@ -188,29 +186,14 @@ solx keep # renew them (prompts; -y to skip) solx keep --stage pending # only the most-urgent CSV ``` -Flags: `--solkeep FILE`, `--stage {pending,over90,inactive,all}`, -`--csv-dir DIR` (default `$HOME`), `-j N` (parallel workers — default -small on purpose; `/scratch` is networked storage), `-y` / `-n` / `-v`. +Flags: `--stage {pending,over90,inactive,all}`, `--csv-dir DIR` (default +`$HOME`), `-j N` (parallel workers — default small on purpose; `/scratch` +is networked storage), `-y` / `-n` / `-v`. This is metadata-heavy NFS I/O, which login nodes throttle — run a big pass on a compute node or the DTN (`ssh soldtn`). See [scratch.md](scratch.md) for the CSV schema and performance notes. -## Migrating off `~/.solkeep` - -The old standalone `sol_renew.py` and the `~/.solkeep` keep-list are -deprecated. Migrate an existing `~/.solkeep` into the config once: - -```shell -solx config import-solkeep # folds ~/.solkeep into the [keep] block -solx config show # review the result -``` - -It appends a `[keep]` block to `config.toml` (refusing if one already -exists, since a second `[keep]` table is invalid TOML — merge by hand -with `solx config edit` in that case). After migrating, `solx keep` uses -`[keep]` and the deprecation notice goes away. 
- ## Shell completion `solx completions ` prints a fully static completion script — diff --git a/solx/.gitignore b/solx/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/solx/.gitignore @@ -0,0 +1 @@ +/target diff --git a/solx/Cargo.lock b/solx/Cargo.lock new file mode 100644 index 0000000..0a2b04d --- /dev/null +++ b/solx/Cargo.lock @@ -0,0 +1,912 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + 
"once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "assert_cmd" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2aa3a22042e45de04255c7bf3626e239f450200fd0493c1e382263544b20aea6" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = 
"4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "filetime" +version = "0.2.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759" +dependencies = [ + "cfg-if", + "libc", +] + +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] + +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "hashbrown" 
+version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ignore" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b915661dd01db3f05050265b2477bcc6527b3792388e2749b41623cc592be67d" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.1", + "serde", + "serde_core", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "log" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" + +[[package]] +name = "memchr" +version = "2.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8" + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "predicates" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada8f2932f28a27ee7b70dd6c1c39ea0675c55a36879ab92f3a715eaa1e63cfe" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", 
+ "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cad38746f3166b4031b1a0d39ad9f954dd291e7854fcc0eed52ee41a0b50d144" + +[[package]] +name = "predicates-tree" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0de1b847b39c8131db0467e9df1ff60e6d0562ab8e9a16e568ad0fdb372e2f2" +dependencies = [ + "predicates-core", + "termtree", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] 
+name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "solx" +version = "1.0.0" +dependencies = [ + "assert_cmd", + "clap", + "csv", + "filetime", + "ignore", + "libc", + "predicates", + "regex", + "serde", + "serde_json", + "shlex", + "tempfile", + "toml", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "indexmap", 
+ "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen 0.57.1", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen 0.51.0", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "winnow" +version = 
"0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + 
+[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/solx/Cargo.toml b/solx/Cargo.toml new file mode 100644 index 0000000..50d01e7 --- /dev/null +++ b/solx/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "solx" +version = "1.0.0" +edition = "2021" +description = "CLI for ASU's Sol supercomputer." +license = "MIT" + +[[bin]] +name = "solx" +path = "src/main.rs" + +[dependencies] +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1", features = ["preserve_order"] } +toml = { version = "0.8", features = ["preserve_order"] } +csv = "1" +ignore = "0.4" +filetime = "0.2" +shlex = "1" +regex = "1" +libc = "0.2" +tempfile = "3" + +[dev-dependencies] +assert_cmd = "2" +predicates = "3" diff --git a/solx/DEVELOPMENT.md b/solx/DEVELOPMENT.md index 1d1c3fb..59a3e9d 100644 --- a/solx/DEVELOPMENT.md +++ b/solx/DEVELOPMENT.md @@ -1,340 +1,110 @@ -# solx — development - -Contributor + maintainer guide for the `solx` CLI. End-user docs live -in [`README.md`](README.md). The agent skill at `../skills/sol-skill/` -drives `solx` and ships on the same version line; see -[`../DEVELOPMENT.md`](../DEVELOPMENT.md) for the skill + eval harness. 
- -## Architecture - -Small Python modules, each with one job: - -``` -solx/src/solx/ -├── __init__.py # version constant -├── __main__.py # `python -m solx` entry -├── main.py # entry point (solx.main:main): argparse tree + dispatch -├── _completions.py # static bash/zsh/fish completion scripts, rendered -│ # from one description of the command surface -├── config.py # XDG TOML loader + dataclasses + pathspec compilation -├── output.py # Out: JSON-vs-Rich auto-detect + stdout/stderr split -├── side.py # Sol-vs-not-Sol guard (each subcommand asks require_sol) -├── slurm.py # squeue/scancel/salloc/srun wrappers + verb-aware resolution -├── jobs.py # `solx job *` command bodies -├── keep.py # `solx keep` (CSV-driven renewal, file-level sharded) -└── init.py # `solx init` (write starter config.toml) +# Developing solx + +solx is a single native binary (Rust): the CLI for interactive Slurm jobs +and scratch renewal on Sol. + +## Behavior contract + +These invariants are load-bearing — preserve them when you touch any +user-visible output. The crate's own suite ([Tests](#tests)) is what +locks them. + +* JSON renders like `json.dumps(obj, indent=2)`: two-space indent, + `\uXXXX` escapes for non-ASCII, insertion-ordered keys. +* Results go to stdout, diagnostics to stderr as single plain lines (no + markup, no color). +* Exit codes: 0 success, 1 runtime failure, 2 usage error / missing + config / refused action. +* Help/usage and completion scripts are clap-rendered — assert on content, + not exact wording. 
+ +## Module map + +| Module | Contents | +| ----------------- | ----------------------------------------------------- | +| `src/main.rs` | clap command tree, dispatch, `config show`/`edit` | +| `src/side.rs` | Sol host detection (`hostname -a` + kernel hostname) | +| `src/slurm.rs` | squeue/scancel/salloc/srun wrappers, jobid resolution | +| `src/config.rs` | TOML config parsing, `[keep]` rules | +| `src/jobs.rs` | job list/start/stop/jump/time bodies | +| `src/keep.rs` | CSV plan, enumeration, touch pipeline | +| `src/init.rs` | `solx init` starter config + walkthrough | +| `src/output.rs` | TTY detection, JSON writer, plain-text diagnostics | +| `src/completions.rs` | embedded static completion scripts | + +Notable design decisions: + +* **CLI parsing.** clap handles the tree; two paths are parsed by hand + because their semantics predate clap conventions: the leading global + flags (`--json`, eager `--version`) and the whole `job start` tail, where + the first unconsumed bare token — even after `--` — is the template and + every other leftover token passes through to salloc in order. +* **`[keep]` matching.** `ignore::gitignore::Gitignore` rooted at `/` with + `matched_path_or_any_parents`, so a bare path pattern matches the + directory and everything under it and `!` negations win last-match style. + The keep-matching vectors live as unit tests in `config.rs` and + `keep.rs`; run them before touching matcher code. +* **Enumeration.** `ignore::WalkBuilder` with every ignore facility off + (`hidden(false)`, `ignore(false)`, `git_*(false)`, `parents(false)`, + `follow_links(false)`), files only — semantics equal `find DIR -type f`, + hidden files included. +* **Touch.** `filetime::set_file_times` to now; a missing path is a silent + skip and nothing is ever created (`touch -c` semantics). +* **Completion scripts.** `assets/` holds the static bash/zsh/fish scripts, + embedded via `include_str!`. 
Edit them as a set so the three shells stay + in sync with the command surface; `tests/cli.rs` smoke-checks that each + emits without error. + +## Tests + +Toolchain setup on Sol (rustup user-install, `CARGO_TARGET_DIR`, crates.io +access, glibc vs musl) is covered in +[`README.md` → Toolchain on Sol](README.md#toolchain-on-sol). + +```console +$ export CARGO_TARGET_DIR=/tmp/solx-target +$ cargo fmt --all --check +$ cargo clippy --all-targets -- -D warnings +$ cargo test ``` -Runtime dependencies: `rich` (human tables and prompts only) and -`pathspec` (keep-list globs), plus the `tomli` backport on Python 3.10. -The dispatch layer itself is stdlib `argparse`. Both entry points — -`[project.scripts] solx = "solx.main:main"` and the zipapp's -`-m "solx.main:main"` — go through `main.py`. +* Unit tests live next to each module (slurm parsing, config, keep + matching/planning, JSON formatting). +* `tests/cli.rs` drives the compiled binary end-to-end with the SLURM mocks + in `tests/mocks/bin` and a tempdir HOME/XDG, asserting stdout, stderr, + and exit codes for the core flows. -### Design notes worth knowing about +CI (`.github/workflows/ci.yml`) runs the same three commands (`check` +job; clippy and test add `--locked`) on every push to main and every PR, plus a `build` job that +compiles the portable binary and uploads it (see below). -- **Startup latency is a budget.** `solx`'s home is NFS, where every - module import is a network round-trip, so the import graph on the hot - path is deliberately tiny: importing `main.py` loads nothing beyond - the interpreter baseline, `--version`/`version` return before the - argparse tree is built, and each handler imports its own command body - (with its `rich`/`pathspec` trees) on demand. `--json` and piped runs - never import `rich`. `tests/test_main.py` guards the budget (e.g. a - fresh-interpreter check that dispatch loads no third-party CLI - framework); `evals/runner/bench_solx_latency.sh` measures the result - on a real Sol node. 
-- **Completions are static, generated from one table.** - `_completions.py::COMMANDS` describes the command surface (commands, - flags, choices, help strings) once; `bash_script()` / `zsh_script()` / - `fish_script()` render it into scripts that never exec `solx` at - completion time. The zsh script's footer keys on `$zsh_eval_context` - so the same output works both eval/sourced and autoloaded from - `fpath`. When you add or rename a command or flag in `main.py`, update - `COMMANDS` to match — `tests/test_completions.py` checks the surface. -- **No persistent state.** `solx` queries `squeue -u $USER` whenever it - needs to know what jobs you have. There's no `session.json`, no - stale-state class of bugs. Cost: one squeue call per command — fine - on a login node. -- **Slurm is the source of truth.** Job-id resolution - (`stop`/`jump`/`time`) reads `$SLURM_JOB_ID` if set (compute-node - default), then asks squeue. It's **verb-aware** (`slurm.Resolution`): - with ≥2 jobs, `time`/`jump` auto-pick the most recent (highest job id, - `most_recent()`), while `stop` never guesses and exits 2 with the - candidate list — a wrong cancel is irreversible. Acting from inside an - allocation triggers a nesting heads-up (`jump`) or self-cancel confirm - (`stop`). Rationale lives in the design panel synthesis; summary in - [`../docs/solx.md`](../docs/solx.md#leaving-out-the-job-id). -- **Output is `Out` (`output.py`), not bare `print`/`Console`.** Each - command body takes an `Out` that decides JSON vs Rich (auto: JSON when - stdout isn't a TTY; global `--json` forces it) and splits streams — - results to stdout, every diagnostic to stderr. Destructive commands - refuse (`exit 2`) in a non-interactive session rather than hang on a - prompt. Tests build an `Out` over `StringIO` consoles with an explicit - mode (see `make_out` in `tests/test_jobs.py` / `test_keep.py`). -- **`Runner` injection** in `slurm.py`. 
Every subprocess call goes - through a `Runner` callable that takes argv and returns - `(returncode, stdout, stderr)`. Tests pass synthetic runners that - return canned output without spawning subprocesses. The real runner - is `slurm.real_runner`. -- **`salloc --no-shell`, not `sbatch --wrap='sleep infinity'`.** Sol - has Slurm 25.x; the native primitive is available. Cleaner `seff` - output, no `sleep` process billed against the allocation. Jobid is - parsed from salloc's stderr (`Granted job allocation N`) — well-known - Slurm output that's been stable for years. -- **No `[shared]` merge in config.** Each `[jobs.]` is - self-contained. The trade: simpler schema, slightly more typing if - you want a flag in every template. Worth it; merge logic was - contributing more confusion than savings. -- **`keep`** renews CSV-flagged scratch dirs filtered by the keep-list - (`--stage`, `--csv-dir`, `-j`, `-n`, `-v`); it only renews what Sol has - flagged. The keep-list lives in the `[keep]` config block; a legacy - `~/.solkeep` is read as a **deprecated** fallback (warned, removed in - 1.0.0 — see `keep.SOLKEEP_REMOVED_IN`; `solx config import-solkeep` - migrates it). Execution is **file-level sharded** (PR #18): - `_pick_lister` (fd/rg/find) → `enumerate_dir` → `shard` → `touch_files` - on a bounded streaming window, so `-j` scales the biggest single - directory, not just the directory count. `_execute` has a serial - `jobs_n<=1` fast path (no process pool) used by tests and the - end-to-end real-touch test. -- **Top-level shortcut for `jump`.** `solx jump` and `solx job jump` - both work. The verb you reach for most earns the shortcut. No other - verbs get this treatment; it'd make help-text noisy. +## Building and installing -### Aliases — what's wired +A native development build, for running on the same machine: -- `solx jobs *` → `solx job *` and `solx job ls` → `solx job list`: - `main()` rewrites the tokens before parsing, so the aliases never - appear in `--help`. 
-- `solx jump` (top-level) and `solx job jump` are separate subparsers - sharing the same handler (`_cmd_jump`). -- `solx version` / `solx help` are subcommands aliasing `--version` / - `--help`; `version` and `--version` short-circuit in `main()` before - the parser tree is built. -- All exercised by `tests/test_main.py` (`test_*alias*`, - `test_top_level_jump_*`, `test_version_*`) — if you change a command - name, those tests fail loudly. - -## Testing - -```shell -cd solx -uv sync # one-time -uv run pytest # full suite -uv run pytest -v # verbose -uv run pytest tests/test_jobs.py::test_start_passthrough_appended -v +```console +$ export CARGO_TARGET_DIR=/tmp/solx-target # keep artifacts off the NFS home +$ cargo build --release # -> $CARGO_TARGET_DIR/release/solx +$ cp "$CARGO_TARGET_DIR/release/solx" ~/.local/bin-test/solx ``` -The whole suite runs in a few seconds — the only subprocesses are the -shell syntax checks and the fresh-interpreter import-budget guards; -everything else stays in-process with no real disk other than -`tmp_path`. - -For black-box regression of the whole CLI surface (stdout/stderr/exit -codes against deterministic SLURM mocks), see the parity matrix at -[`../evals/parity/`](../evals/parity/README.md). - -### Coverage targets +This links the host's glibc, so it runs on the box it was built on (Sol +included). For a binary that runs anywhere — the form CI uploads and a +release ships — build the statically linked musl target: -| Module | What's tested | -|---|---| -| `side.py` | `detect()` parsing branches (Sol login, Sol compute, not-Sol, FQDN-only fallback). | -| `config.py` | TOML schema parse, every required-key error, type errors, `pathspec` glob compilation, `parse_duration`, XDG fallback, **starter config round-trips through `load()`** (so `solx init` output is always valid), **starter config has no maintainer name baked in** (`sparky` only). 
| -| `output.py` | `Out.auto` force/auto-detect, stdout/stderr split, clean JSON emission, `emit` json-vs-human branch. | -| `slurm.py` | `squeue` row parsing; verb-aware `resolve_jobid` (arg / env / single / zero, stop-ambiguous-no-autopick, time-most-recent, jump-running-only + no-running); `most_recent` (highest id, array ids); every argv builder; `parse_granted_jobid`; `run_salloc` success + failure. | -| `jobs.py` | `cmd_list` (empty / populated / squeue-fail / **JSON**), `cmd_start` (default template, dry-run, passthrough, salloc failure, unknown template, **JSON jobid**), `cmd_stop` (`-y`/`-n` mutex, dry-run, prompt proceed/abort, **non-interactive refuse**, ambiguous-no-autopick + JSON candidates, self-cancel warning, JSON), `cmd_jump` (arg, **inside warn-and-proceed**, `-q` suppress, most-recent, no-running), `cmd_time` (arg, JSON, most-recent). | -| `keep.py` | CSV parsing, `build_plan` filter + dedup + exclude carve-out, `shard`/`enumerate_dir`/`touch_files` units, `cmd_keep` `-y`/`-n` mutex, no-`[keep]` exit 2, dry-run no-execute, prompt branches, **non-interactive refuse**, single-stage filter, failure propagation, JSON summary + dry-run plan, **end-to-end real-touch** (recursion + carve-out + non-kept). | -| `init.py` | Fresh write, parent-dir creation, mode 0600, refuse-existing-without-force, `--force` overwrite, prompt-and-confirm. | -| `main.py` | Every command + alias path dispatches (bodies mocked — wiring, not behavior); `--json` in both positions; the `job start` tail parser (passthrough, `--timeout`, template after `--`, trailing `--json` stays passthrough); no option abbreviation; import-budget guards (entry module stays lean, dispatch loads no third-party CLI framework). | -| `_completions.py` | Every command/flag appears in each shell's script; zsh dual-mode footer; path-valued flags complete files/dirs; each script passes its shell's syntax check (`zsh -n` / `bash -n` / `fish --no-execute`, skipped when the shell is absent). 
| - -### Test fixtures - -- `tests/conftest.py::_isolate_slurm_env` (autouse) clears `SLURM_*` - env vars before each test. The dev machine is Sol itself; pytest - may be invoked from inside an allocation. Tests that *want* - `$SLURM_JOB_ID` set must `monkeypatch.setenv` it explicitly. -- `config_path` / `write_config` for TOML round-trip tests. -- `SAMPLE_CONFIG_TOML` exports a known-good full config. - -## Building and installing locally - -Build the single-file zipapp from the worktree and install it under a -throwaway prefix, so it never shadows the `solx` you already have on -`PATH`: - -```shell -cd solx -bash scripts/build-pyz.sh # -> dist/solx.pyz -SOLX_INSTALL_DIR="$HOME/.local/bin-test" bash scripts/install.sh dist/solx.pyz -"$HOME/.local/bin-test/solx" --version +```console +$ rustup target add x86_64-unknown-linux-musl # one-time; no musl-gcc needed +$ cargo build --release --target x86_64-unknown-linux-musl ``` -`install.sh` re-stamps the shebang with the destination machine's -interpreter, so always install the `.pyz` through it — running the raw -`dist/solx.pyz` relies on whatever interpreter path was baked in at build -time. +The result is a self-contained executable (`ldd` reports "statically +linked") with no libc-version dependency. -**From a PR, without building.** Every push and pull request runs the -`build` job in `.github/workflows/ci.yml`, which attaches the zipapp as -the `solx-pyz` artifact. Download it from the PR's *Checks → Artifacts*, -then install it the same way (the CI build's shebang points at a runner -path, so `install.sh` re-stamping it is required, not optional): +**From a PR, without a toolchain.** The `build` job attaches the musl +binary as the `solx-x86_64-linux-musl` artifact on every push/PR. 
+Download it from the PR's *Checks → Artifacts*, `chmod +x`, and run it +on Sol as-is — no install step, no toolchain: -```shell -SOLX_INSTALL_DIR="$HOME/.local/bin-test" bash solx/scripts/install.sh ~/Downloads/solx.pyz +```console +$ chmod +x solx && ./solx --version ``` - -## Manual smoke on Sol - -The unit tests cover every code path that doesn't require real -cluster round-trips. The smoke checklist below validates the round -trips. `htc`/`debug` queues in seconds, so a full lifecycle takes -under two minutes. - -After `ssh sparky@sol.asu.edu` (with your ASURITE): - -1. **Install fresh**: - ```shell - uv tool install --reinstall git+https://github.com/Shu-Wan/solx.git#subdirectory=solx - # or the single-file channel: curl -fsSL .../releases/latest/download/install.sh | sh - solx --version - ``` - -2. **Init + show**: - ```shell - solx init - solx config show - solx config show --json | jq . - ``` - -3. **Edit config** to add a real `[keep]` include path you actually - own: - ```shell - solx config edit - ``` - -4. **Dry-run before any live allocation**: - ```shell - solx job start debug --dry-run - # prints the salloc argv; verify partition/time/qos look right - ``` - -5. **Live allocation lifecycle**: - ```shell - solx job start debug - # waits a few seconds for queue grant, prints "allocated job N" - solx job list - # table shows the new job, state RUNNING - solx job time - # prints D-HH:MM:SS remaining (no-arg path: sole running job) - solx job jump - # drops into your default_shell on the compute node - exit - # back to login shell; allocation still alive - solx job stop - # prompts "Cancel job N? [y/N]" — type y - solx job list - # the job is gone - ``` - -6. **`-y` skip + `-n` preview** for `solx job stop`: - ```shell - solx job start debug - jid=$(solx job list | awk 'NR==2 {print $1}') # or eyeball it - solx job stop "$jid" -n - # prints scancel argv; nothing happens - solx job stop "$jid" -y - # cancels without prompting - ``` - -7. 
**Verb-aware job-id resolution edge cases**: - ```shell - # no jobs: solx job time → exit 1, "no jobs found" - # start two debug jobs, then with NO arg: - # solx job time → picks the most recent (higher jobid), note on stderr, exit 0 - # solx job jump → attaches to the most recent running job, exit 0 - # solx job stop → prints the candidate table, exit 2 (never guesses) - # inside an allocation (after `solx job jump`): - # solx job stop → "Cancel job N (the one you're inside)?" self-cancel confirm - # solx job jump → warns about nesting, still attaches (-q silences) - ``` - -7a. **Agent / non-interactive behavior** (no TTY): - ```shell - solx job list | jq . # JSON array (auto-detected off-TTY) - solx job time /dev/null - # if any exist: - solx keep --dry-run -v - # plan summary; verify the kept list looks right - solx keep - # prompts "Touch mtimes on N directories? [y/N]" — type y - ``` - -10. **Wrong-side guard** (run on a laptop, not Sol): - ```shell - solx --version # works - solx --help # works - solx job list # exit 2 with "solx is Sol-only — SSH first" - solx keep # same - ``` - -11. **Aliases**: - ```shell - solx jobs list # same as solx job list - solx job ls # same - solx jump 12345 # same as solx job jump 12345 - ``` - -12. **Completions**: - ```shell - solx completions zsh > /tmp/solx.zsh - source /tmp/solx.zsh - solx # subcommands appear - solx job s # start, stop appear - ``` - -## Releasing - -The CLI and the skill share one version line, and CI publishes the -release. To cut `vX.Y.Z`: - -1. Bump `solx/src/solx/__init__.py::__version__`, - `solx/pyproject.toml::version`, and the `version:` field in - `../skills/sol-skill/SKILL.md` (keep all three matched), then refresh - the lock (`uv lock`). -2. Move the `[Unreleased]` notes under a `## [X.Y.Z]` heading in - `../CHANGELOG.md`; update `../docs/coverage.md`. -3. Run the full test suite + at least the smoke flow above. -4. Tag `vX.Y.Z` and push it. 
`.github/workflows/release.yml` verifies the - tag matches `solx --version`, builds `solx.pyz`, and publishes a - GitHub Release with `solx.pyz` + `install.sh` attached. - -## When in doubt - -- The user-facing behavior of `solx` lives in the manual - [`../docs/solx.md`](../docs/solx.md); the roadmap and design decisions are - in [`../docs/ROADMAP.md`](../docs/ROADMAP.md). When code and docs disagree, raise - it — usually the code is right and the doc needs an update, but check. -- The agent skill at `../skills/sol-skill/` drives `solx` - (`references/solx.md` is its CLI reference). Keep the skill's user - guidance there and `solx` architecture/test detail here. -- The repo root `README.md` and `DEVELOPMENT.md` cover the whole project - (CLI + skill + evals); this file is the `solx` package's internals. diff --git a/solx/README.md b/solx/README.md index 3fd5c5f..b759b08 100644 --- a/solx/README.md +++ b/solx/README.md @@ -1,250 +1,108 @@ -# ☀️ solx +# solx -A command-line tool for daily work on ASU's -[Sol supercomputer](https://docs.rc.asu.edu/). `solx` wraps the -handful of Slurm operations a terminal-driven user actually does: list -jobs, request an interactive allocation, drop into a shell on the -compute node, cancel, query remaining time, and renew `/scratch` files -that Sol has flagged for deletion. +The `solx` CLI for ASU's Sol supercomputer: interactive Slurm job +management (`solx job start/stop/jump/time/list`), scratch renewal +(`solx keep`), and a single TOML config (`solx config`, `solx init`). -SSH to Sol, then run `solx` from a login or compute node. - -## Status - -This is a personal toolkit. Active development; expect breaking changes -between minor versions until 1.0. The project is **not affiliated with -or endorsed by ASU Research Computing**. The authoritative docs for Sol -are at . +A single native binary (Rust). 
The command surface, JSON output, +diagnostics, and exit codes are locked by the crate's end-to-end and unit +tests; see [`../docs/solx.md`](../docs/solx.md) for the full command +reference. One binary, no interpreter or virtualenv on the critical path — +a cold start from NFS home is a single exec. ## Install -`solx` provisions its own Python via [`uv`](https://docs.astral.sh/uv/) -(Sol's system `python3` is older than the Python ≥ 3.10 `solx` needs). -Install `uv` from [astral.sh/uv](https://docs.astral.sh/uv/) first if it -isn't on your `$PATH`. +The supported install is a prebuilt single binary from a CI release: no +Rust toolchain, no Python, no `uv` on the box. Download it, make it +executable, and put it anywhere on `PATH`: -```shell -# Recommended on Sol: single-file install — one file open at cold start on -# the NFS home, so startup stays fast. Re-run it to upgrade. -curl -fsSL https://github.com/Shu-Wan/solx/releases/latest/download/install.sh | sh +```console +$ mkdir -p ~/.local/bin +$ curl -fLo solx https://github.com/Shu-Wan/solx/releases/latest/download/solx-x86_64-unknown-linux-musl +$ chmod +x solx +$ mv solx ~/.local/bin/ +``` -# Alternative: as a uv tool — isolated venv, on $PATH automatically. -uv tool install git+https://github.com/Shu-Wan/solx.git#subdirectory=solx +Then set up as usual: -solx --version -solx init # writes ~/.config/solx/config.toml -solx config edit # tune partitions, [keep] paths, etc. -solx config show # sanity-check +```console +$ solx init # write the starter config +$ mkdir -p ~/.zfunc && solx completions zsh > ~/.zfunc/_solx # optional tab completion; put ~/.zfunc on fpath before compinit ``` -### Shell completion +## Toolchain on Sol -`solx completions ` prints a fully static completion script — -completing never runs `solx`, so the first Tab is instant. Add it to your -shell's startup file, then restart your shell: +Contributor setup for building from source — users installing a release +binary never need any of this. None of it requires sudo.
-```shell -# bash — add to ~/.bashrc -eval "$(solx completions bash)" +* **Rust via rustup, user-install.** -# zsh — add to ~/.zshrc (after compinit) -eval "$(solx completions zsh)" + ```console + $ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal + ``` -# fish — add to ~/.config/fish/config.fish -solx completions fish | source -``` + Installs to `~/.cargo` and works on both login and compute nodes. + `rust-toolchain.toml` pins the channel; rustup fetches it on first build. -For zsh the same script also works installed on `fpath` -(`mkdir -p ~/.zfunc && solx completions zsh > ~/.zfunc/_solx`, then -`fpath+=(~/.zfunc)` before `compinit` in `~/.zshrc`). Regenerate installed -scripts after upgrading so new commands and flags complete. - -## Quick start - -```shell -solx init # one-time: write ~/.config/solx/config.toml -solx config edit # tune templates + [keep] paths -solx job start debug # request an interactive allocation -solx job list # see it (RUNNING) -solx job time # how much time is left -solx job jump # drop into a shell on the compute node -# ... do work ... -exit # back to login node; allocation still alive -solx job stop # cancel (prompts; -y to skip) -solx keep --dry-run # preview which scratch files would be renewed -solx keep # renew them (prompts) -``` +* **Build artifacts on node-local storage.** Build artifacts on the NFS + home are painfully slow; point `CARGO_TARGET_DIR` at node-local storage. + The `~/.cargo` registry cache staying on NFS is a one-time acceptable + cost. + + ```console + $ export CARGO_TARGET_DIR=/tmp/solx-target + ``` + +* **crates.io connectivity.** crates.io is reachable from compute nodes + but rejects UA-less HEAD probes with 403, so `curl -I` reports failure + on a working connection. Verify with a real GET: -## Design philosophy - -`solx` is designed to be usable by both a person at a terminal and an agent -running shell commands on their behalf.
The CLI keeps behavior explicit and -machine-readable without hiding Slurm as the source of truth. - -- **Run on Sol.** `solx` is a Sol-side tool. It does not construct SSH chains, - read `~/.ssh/*`, or manage laptop state. -- **Prefer declared state.** One TOML config defines shells, job templates, and - scratch keep paths. Job state comes from Slurm, not a persistent session file. -- **Expose parseable output.** TTY output is human-readable; piped output or - `--json` is JSON. Results go to stdout, diagnostics to stderr, and exit codes - distinguish success, operational no-op, and under-specified input. -- **Make destructive operations explicit.** `job stop` and `keep` show the plan - first, support `--dry-run`, prompt by default, and refuse non-interactive runs - unless `--yes` or `--dry-run` is supplied. -- **Bound filesystem changes.** `keep` only updates timestamps for directories - that are both configured by the user and flagged by Sol's warning CSVs. It - never blanket-touches `/scratch`, and it never reads, moves, or deletes file - contents. -- **Do not replace every Slurm command.** `solx` wraps repeated interactive - workflows. For one-off status reads or known-job cancellation, raw Slurm can - still be the right tool. - -## Command reference - -`solx` is a flat-ish CLI. Common ergonomics: noun-verb subgroups for -related operations, top-level shortcuts where they earn it. - -| Command | What it does | -|---|---| -| `solx init [-f]` | Write a starter `config.toml`. On a terminal, offers a short walkthrough — pick your shell and (if present) import your `~/.solkeep` into `[keep]`. Refuses to overwrite without `-f` (or interactive `y`). | -| `solx job list` | List my Sol jobs (Rich table on a TTY, JSON when piped). Aliases: `solx jobs list`, `solx job ls`, `solx jobs ls`. | -| `solx job start [TEMPLATE] [-n] [--timeout T] [-- ...]` | Request an interactive allocation via `salloc --no-shell`. 
`TEMPLATE` defaults to `default_template`; tail after `--` is appended verbatim to `salloc`. | -| `solx job stop [JOBID] [-y] [-n]` | Cancel a job. Prompts unless `-y`; `-n` previews the `scancel` invocation. | -| `solx job jump [JOBID] [-q]` | Drop into `default_shell` on the compute node via `srun --pty`. Also reachable as `solx jump [JOBID]`. `-q/--quiet` silences the nesting / most-recent heads-up. | -| `solx job time [JOBID]` | Print remaining time in Slurm's `D-HH:MM:SS` format. | -| `solx keep [--solkeep F] [--stage S] [--csv-dir D] [-j N] [-y] [-n] [-v]` | Renew CSV-flagged scratch files. Keep-list source: `--solkeep` > the `[keep]` config block > `~/.solkeep` (auto-detected, so an existing `.solkeep` from the skill just works). | -| `solx config show [--json]` | Print the resolved config. | -| `solx config edit` | Open `config.toml` in `$EDITOR`. | -| `solx config import-solkeep` | Migrate a legacy `~/.solkeep` into the config's `[keep]` block. | -| `solx completions ` | Emit a shell completion script. | -| `solx --version`, `--help` | — | - -`--json` is accepted before the subcommand (`solx --json job list`) or -after it (`solx job list --json`) — except after `job start`, where -post-command tokens pass through to `salloc`. See the full manual at -[`docs/solx.md`](../docs/solx.md). - -### Aliases - -- The `job` subgroup is also reachable as `jobs`. Both `solx job list` - and `solx jobs list` work. -- The `list` verb is also reachable as `ls`. -- `solx jump` is shorthand for `solx job jump`. The verb you reach for - most often earns the top-level slot. - -### Default-jobid resolution (verb-aware) - -When you omit `[JOBID]`: an explicit arg wins, else `$SLURM_JOB_ID` (you're -inside an allocation), else `squeue -u $USER`. With **≥2 matching jobs** the -verbs differ — `time`/`jump` auto-pick the **most recent** (highest job id), -while `stop` **never** guesses and exits 2 to disambiguate. 
Acting from inside -an allocation warns about nesting (`jump`, `-q` to silence) or self-cancel -(`stop`). Full rules: [`docs/solx.md`](../docs/solx.md#leaving-out-the-job-id). - -### Destructive-command confirmation contract - -`solx job stop` and `solx keep` mutate state — cancel a running -allocation, or `touch` mtimes under `/scratch`. Both follow: - -| Flag | Behavior | -|---|---| -| (none) | Print what's about to happen, then prompt `Proceed? [y/N]`. Default no. | -| `-y`/`--yes` (or `-f`/`--force`) | Skip the prompt and execute. For scripts. | -| `-n`, `--dry-run` | Print the plan without executing. **No prompt** — nothing destructive is about to happen. | - -`-y` and `-n` together exit 2 (mutually exclusive). In a **non-interactive -session** (no stdin TTY) without `-y`/`-n`, both commands **refuse with exit 2** -rather than hang on a prompt — safe to drive from an agent or cron. - -### Output: human or CLI agent - -Output auto-detects — **JSON when stdout is not a TTY**, Rich tables on a -terminal; `--json` (before or after the subcommand) forces JSON anywhere. A -human at a terminal gets tables with no flag. Results go to stdout, all -diagnostics to stderr, so `solx --json job list | jq …` and `solx job time` -(bare duration) both pipe cleanly. Exit codes: `0` success, -`1` operational/nothing-to-do, `2` under-specified or unconfirmed. This is the -[issue #16](https://github.com/Shu-Wan/solx/issues/16) "design for -agents" behavior; details in [`docs/solx.md`](../docs/solx.md#output-for-scripts). - -Other commands (`init`, `job start`, `job list`, `job jump`, `job time`, -`config show`, `config edit`) don't prompt. `solx init` has its own -overwrite prompt for an existing `config.toml`. - -## Configuration - -A single TOML file at `$XDG_CONFIG_HOME/solx/config.toml` (fallback -`~/.config/solx/config.toml`), created mode `0600` by `solx init`. 
- -```toml -default_shell = "bash" -default_template = "default" -start_timeout = "10m" # cap on `job start` polling; --timeout overrides - -[jobs.default] -partition = "lightwork" -time = "1-0" -qos = "public" - -[jobs.debug] -partition = "htc" -time = "0-1" - -[jobs.gpu] -partition = "public" -gres = "gpu:a100:1" -time = "0-4" -extra_args = ["--mem=64G", "--cpus-per-task=8"] - -# Scratch paths to keep alive when Sol flags them in a warning CSV -# *and* `solx keep` runs. Replace `sparky` with your ASURITE. -[keep] -include = ["/scratch/sparky/your-project", "/scratch/sparky/experiments/**"] -exclude = ["**/__pycache__", "**/.venv"] + ```console + $ curl -fsS https://index.crates.io/config.json + ``` + +* **glibc.** A binary built on Sol links against RHEL 8's glibc 2.28 and + runs on Sol. CI releases target `x86_64-unknown-linux-musl` (fully + static) for portability. + +With the toolchain in place: + +```console +$ cd solx +$ cargo build --release +$ "${CARGO_TARGET_DIR:-target}/release/solx" --version ``` -### Schema - -| Key | Type | Required | Notes | -|---|---|---|---| -| `default_shell` | string | yes | Used by `solx job jump` when dropping into the compute node. | -| `default_template` | string | yes | Template name for `solx job start` when invoked without an argument. Must match one of `[jobs.*]`. | -| `start_timeout` | string (e.g. `"10m"`) | no, default `"10m"` | Cap on how long `solx job start` waits for the queue. CLI flag `--timeout` overrides per-run. | -| `[jobs.]` | table | yes (≥1) | Interactive job templates. | -| `[jobs.].partition` | string | yes | `-p` | -| `[jobs.].time` | string | yes | `-t` | -| `[jobs.].qos` | string | no | `-q` | -| `[jobs.].gres` | string | no | `--gres=` | -| `[jobs.].extra_args` | array of strings | no | Verbatim Slurm flags passed to `salloc` (e.g. `["--mem=64G", "--mail-type=END"]`). | -| `[keep]` | table | no | Scratch renewal config. If absent, `solx keep` exits 2 with a redirect message. 
| -| `[keep].include` | array of glob strings | yes when `[keep]` present | Recursive globs (`**` supported via `pathspec`). Gitignore-style. | -| `[keep].exclude` | array of glob strings | no | Carve-outs from `include` (e.g. `**/__pycache__`). | - -There is no `[shared]` merge — each `[jobs.]` table is -self-contained. Repeat flags across templates if you need them in -multiple places. Trade: simpler config; slightly more typing. - -CLI passthrough: anything after `--` on `solx job start` is appended to -the underlying `salloc` command after `extra_args`. Slurm's -last-flag-wins lets the tail override template defaults for one run: - -```shell -solx job start gpu -- --mem=128G --time=8:00:00 +To run a local build, copy it onto `PATH`: + +```console +$ install -m 755 "${CARGO_TARGET_DIR:-target}/release/solx" ~/.local/bin/solx ``` -## Under the hood +## Output contract + +* stdout is the data channel: JSON when piped or under `--json`, a plain + table on a terminal. +* All diagnostics, progress, and prompts go to stderr. +* Exit codes: 0 success, 1 runtime failure, 2 usage error / missing config / + refused action. -The headless-allocation model behind `solx job start` / `jump` and the -CSV-∩-keep-list mechanism behind `solx keep` are documented in the manual: -[`docs/solx.md`](../docs/solx.md#under-the-hood). A legacy `~/.solkeep` still -works but is deprecated (support removed in 1.0.0) — migrate with -`solx config import-solkeep`. +## UX notes -## Contributing / development +* Human tables are plain aligned columns; nothing emits color. +* Confirmation prompts are plain `[y/N]` lines on stderr (TTY only; + non-interactive sessions require `-y` or `-n`). +* `solx completions` emits static scripts (no runtime completion callback + into the binary). -See [`DEVELOPMENT.md`](DEVELOPMENT.md) for architecture, testing -approach, and the manual smoke checklist. 
+## Development -## License +```console +$ cargo fmt --all +$ cargo clippy --all-targets -- -D warnings +$ cargo test +``` -MIT. See repo root. +`cargo test` runs the unit suites plus end-to-end tests that drive the real +binary against deterministic SLURM mocks in `tests/mocks/bin`. See +`DEVELOPMENT.md` for the module map and the behavior contract. diff --git a/solx/assets/_solx.zsh b/solx/assets/_solx.zsh new file mode 100644 index 0000000..aece276 --- /dev/null +++ b/solx/assets/_solx.zsh @@ -0,0 +1,185 @@ +#compdef solx + +_solx_job() { + local curcontext="$curcontext" state line + typeset -A opt_args + + _arguments -C \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1: :->subcommand' \ + '*:: :->subargs' + + case $state in + (subcommand) + local -a subcommands + subcommands=( + 'list:Print my Sol jobs.' + 'start:Start an interactive allocation from a config template.' + 'stop:Cancel a job (prompts unless -y).' + 'jump:Drop into a shell on the job'\''s compute node.' + 'time:Print remaining time (D-HH\:MM\:SS).' + ) + _describe -t commands 'solx job command' subcommands + ;; + (subargs) + case $words[1] in + (list) + _arguments \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + (start) + _arguments \ + '(-n --dry-run)'{-n,--dry-run}'[Print salloc argv without submitting.]' \ + '--timeout[Override start_timeout (e.g. 
"5m", "1h").]:value:' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:template:' + ;; + (stop) + _arguments \ + '(-y --yes -f --force)'{-y,--yes,-f,--force}'[Skip confirmation prompt.]' \ + '(-n --dry-run)'{-n,--dry-run}'[Print scancel argv without executing.]' \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:jobid:' + ;; + (jump) + _arguments \ + '(-q --quiet)'{-q,--quiet}'[Suppress the nesting / most-recent heads-up.]' \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:jobid:' + ;; + (time) + _arguments \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:jobid:' + ;; + esac + ;; + esac +} + +_solx_config() { + local curcontext="$curcontext" state line + typeset -A opt_args + + _arguments -C \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1: :->subcommand' \ + '*:: :->subargs' + + case $state in + (subcommand) + local -a subcommands + subcommands=( + 'show:Print the resolved config.' + 'edit:Open the config in $EDITOR.' + ) + _describe -t commands 'solx config command' subcommands + ;; + (subargs) + case $words[1] in + (show) + _arguments \ + '--json[Emit JSON.]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + (edit) + _arguments \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + esac + ;; + esac +} + +_solx() { + local curcontext="$curcontext" state line + typeset -A opt_args + + _arguments -C \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '--version[Show version and exit.]' \ + '--json[Force JSON output (machine-readable).]' \ + '1: :->command' \ + '*:: :->args' + + case $state in + (command) + local -a commands + commands=( + 'init:Write a starter config.toml.' 
+ 'keep:Renew CSV-flagged scratch files filtered by the keep block in config.' + 'jump:Drop into a shell on the job'\''s compute node (= solx job jump).' + 'job:Manage interactive Slurm jobs on Sol (alias\: jobs).' + 'config:Inspect and edit the solx config.' + 'completions:Emit a shell completion script (bash, zsh, or fish).' + 'cheatsheet:Print the Sol cheat sheet (SLURM + solx quick reference) as text.' + 'version:Show version and exit (alias of --version).' + 'help:Show help and exit (alias of --help).' + ) + _describe -t commands 'solx command' commands + ;; + (args) + case $words[1] in + (init) + _arguments \ + '(-f --force -y --yes)'{-f,--force,-y,--yes}'[Overwrite without prompting.]' \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + (keep) + _arguments \ + '--stage[Which warning CSVs to read.]:value:(all pending over90 inactive)' \ + '--csv-dir[Directory holding Sol'\''s warning CSVs.]:directory:_files -/' \ + '(-j --jobs)'{-j,--jobs}'[Parallel touch workers.]:value:' \ + '(-y --yes -f --force)'{-y,--yes,-f,--force}'[Skip confirmation prompt.]' \ + '(-n --dry-run)'{-n,--dry-run}'[Print plan without executing.]' \ + '(-v --verbose)'{-v,--verbose}'[Verbose plan + progress.]' \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + (jump) + _arguments \ + '(-q --quiet)'{-q,--quiet}'[Suppress the nesting / most-recent heads-up.]' \ + '--json[Force JSON output (machine-readable).]' \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:jobid:' + ;; + (job|jobs) + _solx_job + ;; + (config) + _solx_config + ;; + (completions) + _arguments \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' \ + '1:shell:(bash zsh fish)' + ;; + (cheatsheet) + _arguments \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + (version) + _arguments \ + '(-h --help)'{-h,--help}'[Show this help 
message and exit.]' + ;; + (help) + _arguments \ + '(-h --help)'{-h,--help}'[Show this help message and exit.]' + ;; + esac + ;; + esac +} + +if [[ $zsh_eval_context[-1] == loadautofunc ]]; then + # autoload from fpath, call function directly + _solx "$@" +else + # eval/source/. command, register function for later + compdef _solx solx +fi diff --git a/solx/assets/solx.bash b/solx/assets/solx.bash new file mode 100644 index 0000000..d61047c --- /dev/null +++ b/solx/assets/solx.bash @@ -0,0 +1,125 @@ +# bash completion for solx +_solx() { + local cur prev + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + + # On a mid-word Tab, COMP_WORDS carries the whole word; complete against + # only the part left of the cursor. + if [[ -n "${COMP_LINE-}" ]]; then + local left="${COMP_LINE:0:COMP_POINT}" + while [[ -n "$cur" && "${left%"$cur"}" == "$left" ]]; do + cur="${cur%?}" + done + fi + + # First two non-flag words decide the (sub)command context. + local i word cmd="" sub="" + for ((i = 1; i < COMP_CWORD; i++)); do + word="${COMP_WORDS[i]}" + [[ "$word" == -* ]] && continue + if [[ -z "$cmd" ]]; then + cmd="$word" + elif [[ -z "$sub" ]]; then + sub="$word" + fi + done + + # Option values. Path candidates go through mapfile (no word splitting, + # no glob expansion — spaces and metacharacters survive) and `compopt -o + # filenames` (where available) so readline escapes what it inserts. 
+ case "$prev" in + --csv-dir) + type compopt &> /dev/null && compopt -o filenames 2> /dev/null + mapfile -t COMPREPLY < <(compgen -d -- "$cur") + return + ;; + --stage) + mapfile -t COMPREPLY < <(compgen -W "all pending over90 inactive" -- "$cur") + return + ;; + -j|--jobs|--timeout) + return + ;; + esac + + if [[ -z "$cmd" ]]; then + if [[ "$cur" == -* ]]; then + mapfile -t COMPREPLY < <(compgen -W "-h --help --version --json" -- "$cur") + else + mapfile -t COMPREPLY < <(compgen -W "init keep jump job config completions cheatsheet version help" -- "$cur") + fi + return + fi + + local flags="" words="" + case "$cmd" in + init) + flags="-f --force -y --yes --json -h --help" + words="" + ;; + keep) + flags="--stage --csv-dir -j --jobs -y --yes -f --force -n --dry-run -v --verbose --json -h --help" + words="" + ;; + jump) + flags="-q --quiet --json -h --help" + words="" + ;; + completions) + flags="-h --help" + words="bash zsh fish" + ;; + cheatsheet) + flags="-h --help" + words="" + ;; + version) + flags="-h --help" + words="" + ;; + help) + flags="-h --help" + words="" + ;; + job|jobs) + if [[ -z "$sub" ]]; then + if [[ "$cur" != -* ]]; then + mapfile -t COMPREPLY < <(compgen -W "list start stop jump time" -- "$cur") + return + fi + flags="-h --help" + fi + case "$sub" in + list) flags="--json -h --help" ;; + start) flags="-n --dry-run --timeout -h --help" ;; + stop) flags="-y --yes -f --force -n --dry-run --json -h --help" ;; + jump) flags="-q --quiet --json -h --help" ;; + time) flags="--json -h --help" ;; + esac + ;; + config) + if [[ -z "$sub" ]]; then + if [[ "$cur" != -* ]]; then + mapfile -t COMPREPLY < <(compgen -W "show edit" -- "$cur") + return + fi + flags="-h --help" + fi + case "$sub" in + show) flags="--json -h --help" ;; + edit) flags="-h --help" ;; + esac + ;; + esac + if [[ "$cur" == -* ]]; then + mapfile -t COMPREPLY < <(compgen -W "$flags" -- "$cur") + elif [[ -n "$words" && -z "$sub" ]]; then + # $words holds positional choices; offer them only 
until the + # positional is filled. + mapfile -t COMPREPLY < <(compgen -W "$words" -- "$cur") + fi +} + +complete -F _solx solx diff --git a/solx/assets/solx.fish b/solx/assets/solx.fish new file mode 100644 index 0000000..411b186 --- /dev/null +++ b/solx/assets/solx.fish @@ -0,0 +1,59 @@ +# fish completion for solx +complete -c solx -f +complete -c solx -n __fish_use_subcommand -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -l version -d 'Show version and exit.' +complete -c solx -n __fish_use_subcommand -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n __fish_use_subcommand -a init -d 'Write a starter config.toml.' +complete -c solx -n '__fish_seen_subcommand_from init' -s f -l force -s y -l yes -d 'Overwrite without prompting.' +complete -c solx -n '__fish_seen_subcommand_from init' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from init' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a keep -d 'Renew CSV-flagged scratch files filtered by the keep block in config.' +complete -c solx -n '__fish_seen_subcommand_from keep' -l stage -x -a 'all pending over90 inactive' -d 'Which warning CSVs to read.' +complete -c solx -n '__fish_seen_subcommand_from keep' -l csv-dir -r -F -d 'Directory holding Sol\'s warning CSVs.' +complete -c solx -n '__fish_seen_subcommand_from keep' -s j -l jobs -x -d 'Parallel touch workers.' +complete -c solx -n '__fish_seen_subcommand_from keep' -s y -l yes -s f -l force -d 'Skip confirmation prompt.' +complete -c solx -n '__fish_seen_subcommand_from keep' -s n -l dry-run -d 'Print plan without executing.' +complete -c solx -n '__fish_seen_subcommand_from keep' -s v -l verbose -d 'Verbose plan + progress.' +complete -c solx -n '__fish_seen_subcommand_from keep' -l json -d 'Force JSON output (machine-readable).' 
+complete -c solx -n '__fish_seen_subcommand_from keep' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a jump -d 'Drop into a shell on the job\'s compute node (= solx job jump).' +complete -c solx -n '__fish_seen_subcommand_from jump' -s q -l quiet -d 'Suppress the nesting / most-recent heads-up.' +complete -c solx -n '__fish_seen_subcommand_from jump' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from jump' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a job -d 'Manage interactive Slurm jobs on Sol (alias: jobs).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -a list -d 'Print my Sol jobs.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from list' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from list' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -a start -d 'Start an interactive allocation from a config template.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from start' -s n -l dry-run -d 'Print salloc argv without submitting.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from start' -l timeout -x -d 'Override start_timeout (e.g. "5m", "1h").' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from start' -s h -l help -d 'Show this help message and exit.' 
+complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -a stop -d 'Cancel a job (prompts unless -y).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from stop' -s y -l yes -s f -l force -d 'Skip confirmation prompt.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from stop' -s n -l dry-run -d 'Print scancel argv without executing.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from stop' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from stop' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -a jump -d 'Drop into a shell on the job\'s compute node.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from jump' -s q -l quiet -d 'Suppress the nesting / most-recent heads-up.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from jump' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from jump' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and not __fish_seen_subcommand_from list start stop jump time' -a time -d 'Print remaining time (D-HH:MM:SS).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from time' -l json -d 'Force JSON output (machine-readable).' +complete -c solx -n '__fish_seen_subcommand_from job jobs; and __fish_seen_subcommand_from time' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a config -d 'Inspect and edit the solx config.' 
+complete -c solx -n '__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show edit' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show edit' -a show -d 'Print the resolved config.' +complete -c solx -n '__fish_seen_subcommand_from config; and __fish_seen_subcommand_from show' -l json -d 'Emit JSON.' +complete -c solx -n '__fish_seen_subcommand_from config; and __fish_seen_subcommand_from show' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from config; and not __fish_seen_subcommand_from show edit' -a edit -d 'Open the config in $EDITOR.' +complete -c solx -n '__fish_seen_subcommand_from config; and __fish_seen_subcommand_from edit' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a completions -d 'Emit a shell completion script (bash, zsh, or fish).' +complete -c solx -n '__fish_seen_subcommand_from completions' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n '__fish_seen_subcommand_from completions; and not __fish_seen_subcommand_from bash zsh fish' -a 'bash zsh fish' +complete -c solx -n __fish_use_subcommand -a cheatsheet -d 'Print the Sol cheat sheet (SLURM + solx quick reference) as text.' +complete -c solx -n '__fish_seen_subcommand_from cheatsheet' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a version -d 'Show version and exit (alias of --version).' +complete -c solx -n '__fish_seen_subcommand_from version' -s h -l help -d 'Show this help message and exit.' +complete -c solx -n __fish_use_subcommand -a help -d 'Show help and exit (alias of --help).' +complete -c solx -n '__fish_seen_subcommand_from help' -s h -l help -d 'Show this help message and exit.' 
diff --git a/solx/pyproject.toml b/solx/pyproject.toml deleted file mode 100644 index d18903b..0000000 --- a/solx/pyproject.toml +++ /dev/null @@ -1,52 +0,0 @@ -[project] -name = "solx" -version = "0.5.1" -description = "CLI for ASU's Sol supercomputer." -readme = "README.md" -requires-python = ">=3.10" -license = { text = "MIT" } -authors = [ - { name = "Shu Wan" }, -] -keywords = ["asu", "sol", "hpc", "slurm", "cli"] -classifiers = [ - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Topic :: System :: Distributed Computing", -] -dependencies = [ - "rich>=13", - "pathspec>=0.12", - "tomli>=2.0; python_version < '3.11'", # tomllib backport for 3.10 -] - -[project.scripts] -solx = "solx.main:main" - -[project.urls] -Homepage = "https://github.com/Shu-Wan/solx" -Issues = "https://github.com/Shu-Wan/solx/issues" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/solx"] - -[dependency-groups] -dev = [ - "pytest>=8", - "ruff>=0.13", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] -addopts = ["-ra", "--strict-markers"] diff --git a/solx/rust-toolchain.toml b/solx/rust-toolchain.toml new file mode 100644 index 0000000..292fe49 --- /dev/null +++ b/solx/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "stable" diff --git a/solx/scripts/build-pyz.sh b/solx/scripts/build-pyz.sh deleted file mode 100755 index ef6b3b6..0000000 --- a/solx/scripts/build-pyz.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# Build dist/solx.pyz — solx and its dependencies as a single-file zipapp. 
-# -# Why a zipapp: on an NFS home (Sol), a venv install pays one network -# round-trip per module file at cold start; a .pyz is one file open, so a -# cold `solx` start stays fast no matter how many modules are inside. -# -# Bytecode is precompiled in legacy layout (compileall -b puts `mod.pyc` -# beside `mod.py`) because that is the layout zipimport loads — it never -# writes a bytecode cache of its own. The .pyc format is interpreter- -# specific, so the version here must match the shebang install.sh stamps: -# both default to PYVER below and read SOLX_PYTHON to override together. -# 3.11 is the floor with native tomllib (solx supports 3.10+ via the tomli -# backport, but the artifact targets one interpreter). -set -euo pipefail - -PYVER="${SOLX_PYTHON:-3.11}" -ROOT="$(cd "$(dirname "$0")/.." && pwd)" -STAGE="$ROOT/build/pyz" - -uv python find "$PYVER" >/dev/null 2>&1 || uv python install "$PYVER" -PY="$(uv python find "$PYVER")" - -rm -rf "$STAGE" -mkdir -p "$STAGE" "$ROOT/dist" - -# Install the LOCKED dependency set so the shipped artifact matches the -# environment CI tested (`uv run --frozen`), not whatever the resolver picks -# today — `uv pip install "$ROOT"` re-resolves and can drift. Export the -# locked deps, install those, then add solx itself with --no-deps so nothing -# re-resolves. -uv export --frozen --no-dev --no-emit-project --project "$ROOT" -o "$STAGE/requirements.txt" -uv pip install --python "$PY" --target "$STAGE" --quiet -r "$STAGE/requirements.txt" -uv pip install --python "$PY" --target "$STAGE" --quiet --no-deps "$ROOT" -rm -f "$STAGE/requirements.txt" -rm -rf "$STAGE/bin" # entry-point scripts; the zipapp __main__ replaces them - -"$PY" -m compileall -b -q "$STAGE" -# -p stamps the build interpreter's absolute path as the shebang, so the -# artifact is directly executable on the build machine (./dist/solx.pyz). 
-# install.sh rebuilds the archive around the destination machine's interpreter -# (the shebang can't be swapped in place — the offsets are absolute). -"$PY" -m zipapp "$STAGE" -o "$ROOT/dist/solx.pyz" -m "solx.main:main" -c -p "$PY" - -echo "built $ROOT/dist/solx.pyz ($(du -h "$ROOT/dist/solx.pyz" | cut -f1), python $PYVER)" diff --git a/solx/scripts/install.sh b/solx/scripts/install.sh deleted file mode 100755 index 44d3493..0000000 --- a/solx/scripts/install.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/sh -# Install solx as a single-file zipapp at ~/.local/bin/solx. -# -# Usage: -# install.sh # download the latest release artifact -# install.sh path/to/solx.pyz # install a local build (testing) -# -# Environment: -# SOLX_INSTALL_DIR install location (default: $XDG_BIN_HOME, falling -# back to ~/.local/bin) -# SOLX_PYTHON interpreter to bind the zipapp to. A version (e.g. -# 3.11, the default) is resolved — and installed if -# missing — via uv; a path (anything with a slash) is -# used as-is, so uv is not required. Either way it must -# match the version build-pyz.sh compiled with: the -# embedded bytecode is interpreter-specific. -# -# Sol's system python3 is older than solx supports, so by default the script -# resolves a uv-managed interpreter and binds the .pyz to it via an absolute -# shebang. uv is only needed at install time, not at runtime. -set -eu - -PYREQ="${SOLX_PYTHON:-3.11}" -BIN="${SOLX_INSTALL_DIR:-${XDG_BIN_HOME:-$HOME/.local/bin}}" -SRC="${1:-https://github.com/Shu-Wan/solx/releases/latest/download/solx.pyz}" - -case "$PYREQ" in - */*) - # An explicit interpreter path — used as given; no uv needed. - PY="$PYREQ" - [ -x "$PY" ] || { - echo "solx install: SOLX_PYTHON=$PY is not an executable interpreter." >&2 - exit 1 - } - ;; - *) - command -v uv >/dev/null 2>&1 || { - echo "solx install: uv is required to provision Python $PYREQ" >&2 - echo "(or set SOLX_PYTHON to an existing interpreter path)." 
>&2 - echo "Install uv first: https://docs.astral.sh/uv/" >&2 - exit 1 - } - uv python find "$PYREQ" >/dev/null 2>&1 || uv python install "$PYREQ" - PY="$(uv python find "$PYREQ")" - ;; -esac - -TMP="$(mktemp)" -STAGE="$(mktemp -d)" -trap 'rm -rf "$TMP" "$STAGE"' EXIT -case "$SRC" in - http://* | https://*) curl -fsSL "$SRC" -o "$TMP" ;; - *) cp "$SRC" "$TMP" ;; -esac - -# A zipapp records its central-directory offsets as absolute file positions -# that include the shebang line, so the interpreter cannot be rebound by -# swapping the shebang bytes — a different-length path shifts every offset and -# zipimport (which runs the archive) refuses it with "bad central directory". -# Extract the payload and rebuild the archive around this machine's -# interpreter instead, which regenerates the offsets. zipfile reads the -# build machine's shebang prefix fine; only zipimport is strict. -"$PY" -m zipfile -e "$TMP" "$STAGE" - -mkdir -p "$BIN" -# Remove first: a previous `uv tool install` leaves a symlink here, and -# writing through it would clobber the tool venv's entry point instead. -rm -f "$BIN/solx" - -# Rebuild the zipapp around $1 and confirm it actually runs. A correctly -# built archive runs under any matching interpreter; the smoke test is the -# guard that we never install a solx that can't start. -build_solx() { - "$1" -m zipapp "$STAGE" -o "$BIN/solx" -p "$1" || return 1 - chmod +x "$BIN/solx" - "$BIN/solx" --version >/dev/null 2>&1 -} - -if ! build_solx "$PY"; then - # The resolved interpreter can't run the archive (a system python may be - # built without working zipapp support). Provision a uv-managed one of the - # same version and retry — that is what `uv python install` guarantees. - PYVER="$("$PY" -c 'import sys; print("%d.%d" % sys.version_info[:2])')" - command -v uv >/dev/null 2>&1 || { - echo "solx install: $PY can't run a zipapp, and uv is not available to" >&2 - echo "provision one. Set SOLX_PYTHON to a Python $PYVER that can." 
>&2 - exit 1 - } - echo "solx install: $PY can't run a zipapp; provisioning a uv-managed Python $PYVER." >&2 - UV_PYTHON_PREFERENCE=only-managed uv python install "$PYVER" >/dev/null 2>&1 || true - PY="$(UV_PYTHON_PREFERENCE=only-managed uv python find "$PYVER")" - build_solx "$PY" || { - echo "solx install: could not produce a working solx with $PY." >&2 - exit 1 - } -fi - -echo "installed $BIN/solx (solx $("$BIN/solx" --version))" -case ":$PATH:" in - *":$BIN:"*) ;; - *) echo "note: $BIN is not on your PATH" >&2 ;; -esac diff --git a/solx/src/cheatsheet.rs b/solx/src/cheatsheet.rs new file mode 100644 index 0000000..a9d11fe --- /dev/null +++ b/solx/src/cheatsheet.rs @@ -0,0 +1,36 @@ +//! `solx cheatsheet` — print the Sol quick-reference as text. +//! +//! Single source of truth: the skill's `references/cheatsheet.md`, embedded +//! at build time. The CLI text, the rendered PDF +//! (`scripts/build-cheatsheet.sh`), and the skill reference all read the same +//! file, so they cannot drift. `solx` is always built from the repo (users +//! get prebuilt binaries, never `cargo publish`), so the relative path +//! resolves. + +/// The cheat sheet, embedded from the skill's markdown source. +pub const CHEATSHEET: &str = include_str!("../../skills/sol-skill/references/cheatsheet.md"); + +/// Print the cheat sheet to stdout. Works anywhere — no Sol required. +pub fn cmd_cheatsheet() -> i32 { + print!("{CHEATSHEET}"); + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cheatsheet_has_the_key_sections() { + for needle in [ + "Know your access", + "Partition", + "QOS", + "debug", + "htc", + "solx", + ] { + assert!(CHEATSHEET.contains(needle), "cheatsheet missing {needle:?}"); + } + } +} diff --git a/solx/src/completions.rs b/solx/src/completions.rs new file mode 100644 index 0000000..ae7789e --- /dev/null +++ b/solx/src/completions.rs @@ -0,0 +1,56 @@ +//! `solx completions ` — emit a static shell completion script. +//! +//! 
The scripts live under `assets/` and are embedded at build time; they are +//! synced from the Python package's completion generator so both +//! implementations install the same scripts. + +use crate::output::py_repr; + +const BASH: &str = include_str!("../assets/solx.bash"); +const ZSH: &str = include_str!("../assets/_solx.zsh"); +const FISH: &str = include_str!("../assets/solx.fish"); + +/// Print the completion script for `shell`; unknown shells exit 2. +pub fn cmd_completions(shell: &str) -> i32 { + let shell = shell.to_lowercase(); + let script = match shell.as_str() { + "bash" => BASH, + "zsh" => ZSH, + "fish" => FISH, + _ => { + eprintln!( + "unknown shell {}; choose bash, zsh, or fish.", + py_repr(&shell) + ); + return 2; + } + }; + print!("{script}"); + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn scripts_embed_the_command_tree() { + for script in [BASH, ZSH, FISH] { + for needle in [ + "solx", + "init", + "keep", + "jump", + "completions", + "cheatsheet", + "config", + ] { + assert!(script.contains(needle), "missing {needle}"); + } + } + assert!(ZSH.starts_with("#compdef")); + // fpath/autoload installs need the dual-mode footer. + assert!(ZSH.contains("loadautofunc")); + assert!(ZSH.contains("compdef _solx solx")); + } +} diff --git a/solx/src/config.rs b/solx/src/config.rs new file mode 100644 index 0000000..046120f --- /dev/null +++ b/solx/src/config.rs @@ -0,0 +1,675 @@ +//! Single-file config under `$XDG_CONFIG_HOME/solx/config.toml`. +//! +//! The user runs `solx init` to write a starter file; everything else just +//! reads it. No `[shared]` merge — each `[jobs.]` table is +//! self-contained, which keeps the schema obvious at the cost of repeating +//! a flag across templates if someone really wants that. 
+ +use std::fmt; +use std::path::{Path, PathBuf}; + +use crate::gitwild::GitIgnoreSpec; +use crate::output::{py_repr, strip_markup}; + +pub const CONFIG_FILENAME: &str = "config.toml"; +pub const DEFAULT_START_TIMEOUT: &str = "10m"; + +/// Any user-facing config problem (missing file, bad schema). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ConfigError(pub String); + +impl ConfigError { + /// Build a config error whose message renders the way the plain + /// (non-TTY) diagnostic channel does: bracketed style-tag lookalikes + /// such as `[jobs.default]` / `[keep]` are stripped from the text (see + /// [`strip_markup`]). + fn new(msg: String) -> Self { + ConfigError(strip_markup(&msg)) + } +} + +impl fmt::Display for ConfigError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl std::error::Error for ConfigError {} + +/// One `[jobs.]` table. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct JobTemplate { + pub name: String, + pub partition: String, + pub time: String, + pub qos: Option, + pub gres: Option, + pub extra_args: Vec, +} + +/// Resolved `[keep]` include/exclude as compiled gitignore matchers +/// (see [`crate::gitwild`] for the dialect). +pub struct KeepRules { + include: GitIgnoreSpec, + exclude: GitIgnoreSpec, + pub raw_include: Vec, + pub raw_exclude: Vec, +} + +impl std::fmt::Debug for KeepRules { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("KeepRules") + .field("raw_include", &self.raw_include) + .field("raw_exclude", &self.raw_exclude) + .finish() + } +} + +impl KeepRules { + pub fn new(include: &[String], exclude: &[String]) -> Self { + KeepRules { + include: GitIgnoreSpec::from_lines(include), + exclude: GitIgnoreSpec::from_lines(exclude), + raw_include: include.to_vec(), + raw_exclude: exclude.to_vec(), + } + } + + /// Return `true` if `path` is included and not excluded. 
+ /// + /// Matching follows gitignore semantics on absolute paths: a bare path + /// pattern matches that directory and everything under it (including a + /// path written with a trailing slash), and a `!` negation flips the + /// most specific / latest match. + pub fn matches(&self, path: &str) -> bool { + if !self.include.match_file(path) { + return false; + } + !self.exclude.match_file(path) + } +} + +#[derive(Debug)] +pub struct Config { + pub default_shell: String, + pub default_template: String, + pub start_timeout_seconds: i64, + /// `[jobs.]` tables in file order. + pub templates: Vec<(String, JobTemplate)>, + pub keep: Option, +} + +impl Config { + /// Look up a template by name; `ConfigError` if missing. + pub fn template(&self, name: &str) -> Result<&JobTemplate, ConfigError> { + if let Some((_, t)) = self.templates.iter().find(|(n, _)| n == name) { + return Ok(t); + } + let mut names: Vec<&str> = self.templates.iter().map(|(n, _)| n.as_str()).collect(); + names.sort_unstable(); + let available = if names.is_empty() { + "(none)".to_string() + } else { + names.join(", ") + }; + Err(ConfigError::new(format!( + "unknown job template {}. defined: {available}", + py_repr(name) + ))) + } +} + +/// The user's home directory (`$HOME`). +pub fn home_dir() -> PathBuf { + PathBuf::from(std::env::var("HOME").unwrap_or_else(|_| "/".to_string())) +} + +/// Resolve the config path honoring `XDG_CONFIG_HOME` with the usual fallback. +pub fn config_path() -> PathBuf { + let base = match std::env::var("XDG_CONFIG_HOME") { + Ok(v) if !v.is_empty() => PathBuf::from(v), + _ => home_dir().join(".config"), + }; + base.join("solx").join(CONFIG_FILENAME) +} + +/// Load and validate the config from `path`. +pub fn load(path: &Path) -> Result { + if !path.exists() { + return Err(ConfigError::new(format!( + "no config at {}. 
run `solx init` to write a starter file.", + path.display() + ))); + } + let text = std::fs::read_to_string(path).map_err(|e| { + ConfigError::new(format!("unable to read config at {}: {e}", path.display())) + })?; + let raw: toml::Table = text.parse().map_err(|e| { + ConfigError::new(format!( + "invalid TOML in {}: {}", + path.display(), + toml_error_line(&e) + )) + })?; + parse(&raw, &path.display().to_string()) +} + +/// Render a TOML parse error as one line: the message text plus its +/// location, e.g. `invalid array (at line 1, column 18)`. Every solx +/// diagnostic is a single stderr line, so the multi-line annotated form the +/// TOML library renders is collapsed. +pub fn toml_error_line(e: &toml::de::Error) -> String { + let msg = e.message().split('\n').collect::>().join("; "); + let first = e.to_string(); + let first = first.lines().next().unwrap_or_default().to_string(); + match first.strip_prefix("TOML parse error ") { + Some(loc) if !loc.is_empty() => format!("{msg} ({loc})"), + _ => msg, + } +} + +fn parse(raw: &toml::Table, source: &str) -> Result { + let default_shell = require_str(raw, "default_shell", source)?; + let default_template = require_str(raw, "default_template", source)?; + let timeout_str = match raw.get("start_timeout") { + None => DEFAULT_START_TIMEOUT.to_string(), + Some(toml::Value::String(s)) => s.clone(), + Some(_) => { + return Err(ConfigError::new(format!( + "{source}: `start_timeout` must be a string like \"10m\"" + ))) + } + }; + let start_timeout_seconds = parse_duration(&timeout_str)?; + + let jobs_raw = match raw.get("jobs") { + Some(toml::Value::Table(t)) if !t.is_empty() => t, + _ => { + return Err(ConfigError::new(format!( + "{source}: at least one [jobs.] 
table is required" + ))) + } + }; + let mut templates = Vec::new(); + for (name, body) in jobs_raw { + templates.push((name.clone(), parse_template(name, body, source)?)); + } + if !templates.iter().any(|(n, _)| n == &default_template) { + return Err(ConfigError::new(format!( + "{source}: default_template={} is not defined under [jobs.*]", + py_repr(&default_template) + ))); + } + + let keep = parse_keep(raw.get("keep"), source)?; + + Ok(Config { + default_shell, + default_template, + start_timeout_seconds, + templates, + keep, + }) +} + +fn parse_template( + name: &str, + body: &toml::Value, + source: &str, +) -> Result { + let body = match body { + toml::Value::Table(t) => t, + _ => { + return Err(ConfigError::new(format!( + "{source}: [jobs.{name}] must be a table" + ))) + } + }; + let ctx = format!("{source}:[jobs.{name}]"); + Ok(JobTemplate { + name: name.to_string(), + partition: require_str(body, "partition", &ctx)?, + time: require_str(body, "time", &ctx)?, + qos: optional_str(body, "qos", &ctx)?, + gres: optional_str(body, "gres", &ctx)?, + extra_args: optional_str_list(body, "extra_args", &ctx)?, + }) +} + +pub fn parse_keep( + body: Option<&toml::Value>, + source: &str, +) -> Result, ConfigError> { + let body = match body { + None => return Ok(None), + Some(toml::Value::Table(t)) => t, + Some(_) => { + return Err(ConfigError::new(format!( + "{source}: [keep] must be a table" + ))) + } + }; + let ctx = format!("{source}:[keep]"); + let include = optional_str_list(body, "include", &ctx)?; + let exclude = optional_str_list(body, "exclude", &ctx)?; + if include.is_empty() { + return Err(ConfigError::new(format!( + "{source}: [keep].include must be a non-empty array" + ))); + } + Ok(Some(KeepRules::new(&include, &exclude))) +} + +fn require_str(body: &toml::Table, key: &str, ctx: &str) -> Result { + match body.get(key) { + None => Err(ConfigError::new(format!( + "{ctx}: required key `{key}` is missing" + ))), + Some(toml::Value::String(s)) if !s.is_empty() => 
Ok(s.clone()),
+        Some(_) => Err(ConfigError::new(format!(
+            "{ctx}: `{key}` must be a non-empty string"
+        ))),
+    }
+}
+
+/// Read the optional string `key` from `body`.
+///
+/// Returns `Ok(None)` when the key is absent. A key that is present but is
+/// not a non-empty string is an error whose message is prefixed with `ctx`.
+fn optional_str(body: &toml::Table, key: &str, ctx: &str) -> Result<Option<String>, ConfigError> {
+    match body.get(key) {
+        None => Ok(None),
+        Some(toml::Value::String(s)) if !s.is_empty() => Ok(Some(s.clone())),
+        Some(_) => Err(ConfigError::new(format!(
+            "{ctx}: `{key}` must be a non-empty string"
+        ))),
+    }
+}
+
+/// Read the optional array-of-strings `key` from `body`.
+///
+/// Returns an empty `Vec` when the key is absent. A value that is not an
+/// array, or an array holding any non-string element, is an error whose
+/// message is prefixed with `ctx`.
+fn optional_str_list(body: &toml::Table, key: &str, ctx: &str) -> Result<Vec<String>, ConfigError> {
+    let err = || ConfigError::new(format!("{ctx}: `{key}` must be an array of strings"));
+    match body.get(key) {
+        None => Ok(Vec::new()),
+        Some(toml::Value::Array(items)) => items
+            .iter()
+            .map(|v| match v {
+                toml::Value::String(s) => Ok(s.clone()),
+                _ => Err(err()),
+            })
+            .collect(),
+        Some(_) => Err(err()),
+    }
+}
+
+/// Parse a string like `"10m"` / `"30s"` / `"1h"` into seconds.
+///
+/// Accepted form: one or more ASCII digits, optional whitespace, then a
+/// single unit letter (`s`, `m`, or `h`, case-insensitive). Leading and
+/// trailing whitespace is ignored.
+///
+/// # Errors
+///
+/// Returns a [`ConfigError`] when the digits or the unit are missing, the
+/// unit is not `s`/`m`/`h`, anything follows the unit, or the value does
+/// not fit in an `i64` count of seconds.
+pub fn parse_duration(text: &str) -> Result<i64, ConfigError> {
+    let invalid = || {
+        ConfigError::new(format!(
+            "invalid duration {}; use forms like \"30s\", \"10m\", \"1h\"",
+            py_repr(text)
+        ))
+    };
+    let t = text.trim();
+    // No non-digit at all means the unit letter is missing (e.g. "10").
+    let digits_end = t.find(|c: char| !c.is_ascii_digit()).ok_or_else(invalid)?;
+    if digits_end == 0 {
+        return Err(invalid());
+    }
+    let (digits, rest) = t.split_at(digits_end);
+    let rest = rest.trim_start();
+    let mut chars = rest.chars();
+    let unit = chars.next().ok_or_else(invalid)?;
+    // Exactly one unit character is allowed; reject forms like "10min".
+    if !chars.as_str().trim().is_empty() {
+        return Err(invalid());
+    }
+    let n: i64 = digits.parse().map_err(|_| invalid())?;
+    let mult = match unit.to_ascii_lowercase() {
+        's' => 1,
+        'm' => 60,
+        'h' => 3600,
+        _ => return Err(invalid()),
+    };
+    // A count that parses as i64 can still overflow once scaled to seconds
+    // (e.g. "9223372036854775807h"); report that as an invalid duration
+    // rather than panicking (debug) or wrapping to a bogus value (release).
+    n.checked_mul(mult).ok_or_else(invalid)
+}
+
+/// The text that `solx init` writes to a fresh config.toml.
+///
+/// The `[keep]` block is a commented placeholder using the `sparky`
+/// placeholder. `default_shell` sets the `default_shell` value (the
+/// `solx init` walkthrough can pick it).
+pub fn starter_config_text(default_shell: &str) -> String { + let base = STARTER_CONFIG_BASE.replace( + "default_shell = \"bash\"", + &format!("default_shell = {}", toml_str(default_shell)), + ); + base + KEEP_PLACEHOLDER +} + +/// Render `s` as a TOML basic string, escaping every char TOML forbids. +/// +/// Besides backslash and double-quote, control characters (other than tab) +/// are illegal in a TOML basic string and must be `\uXXXX`-escaped — +/// otherwise a keep pattern carrying a stray control byte would render an +/// unparseable config. Tab is emitted as `\t`. +pub fn toml_str(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 2); + out.push('"'); + for ch in s.chars() { + match ch { + '\\' => out.push_str("\\\\"), + '"' => out.push_str("\\\""), + '\t' => out.push_str("\\t"), + c if (c as u32) < 0x20 || c as u32 == 0x7f => { + out.push_str(&format!("\\u{:04x}", c as u32)); + } + c => out.push(c), + } + } + out.push('"'); + out +} + +const STARTER_CONFIG_BASE: &str = r#"# solx config — see https://github.com/Shu-Wan/solx/blob/main/solx/README.md +# +# Used by `solx job jump` when dropping into a shell on a compute node. +default_shell = "bash" + +# Default template for `solx job start` when invoked without an argument. +default_template = "default" + +# Cap on how long `solx job start` waits for the queue. CLI flag --timeout +# overrides per-run. +start_timeout = "10m" + + +# Job templates. Run `solx job start ` to allocate one. +# Each table is self-contained; repeat flags across templates if needed. + +[jobs.default] +partition = "lightwork" +time = "1-0" +qos = "public" + +[jobs.debug] +partition = "htc" +time = "0-1" + + +"#; + +const KEEP_PLACEHOLDER: &str = r#"# Scratch paths to keep alive when Sol flags them in a warning CSV +# *and* `solx keep` runs. Replace `sparky` with your ASURITE. +# Patterns use gitignore-style globs (** for recursion). 
+# Uncomment + edit to enable: +# +# [keep] +# include = ["/scratch/sparky/your-project", "/scratch/sparky/experiments/**"] +# exclude = ["**/__pycache__", "**/.venv"] +"#; + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + pub const SAMPLE_CONFIG_TOML: &str = r#"default_shell = "zsh" +default_template = "default" +start_timeout = "5m" + +[jobs.default] +partition = "lightwork" +time = "1-0" +qos = "public" + +[jobs.debug] +partition = "htc" +time = "0-1" + +[jobs.gpu] +partition = "public" +gres = "gpu:a100:1" +time = "0-4" +extra_args = ["--mem=64G", "--cpus-per-task=8"] + +[keep] +include = ["/scratch/sparky/proj-a", "/scratch/sparky/proj-b/**"] +exclude = ["**/__pycache__", "**/.venv"] +"#; + + fn write_config(dir: &Path, text: &str) -> PathBuf { + let p = dir.join("config.toml"); + fs::write(&p, text).unwrap(); + p + } + + #[test] + fn load_full_config() { + let dir = tempfile::tempdir().unwrap(); + let c = load(&write_config(dir.path(), SAMPLE_CONFIG_TOML)).unwrap(); + assert_eq!(c.default_shell, "zsh"); + assert_eq!(c.default_template, "default"); + assert_eq!(c.start_timeout_seconds, 300); + let names: Vec<&str> = c.templates.iter().map(|(n, _)| n.as_str()).collect(); + assert_eq!(names, ["default", "debug", "gpu"]); // file order preserved + + let gpu = c.template("gpu").unwrap(); + assert_eq!(gpu.partition, "public"); + assert_eq!(gpu.gres.as_deref(), Some("gpu:a100:1")); + assert_eq!(gpu.time, "0-4"); + assert_eq!(gpu.qos, None); + assert_eq!(gpu.extra_args, ["--mem=64G", "--cpus-per-task=8"]); + } + + #[test] + fn template_lookup_missing_errors() { + let dir = tempfile::tempdir().unwrap(); + let c = load(&write_config(dir.path(), SAMPLE_CONFIG_TOML)).unwrap(); + let err = c.template("nonexistent").unwrap_err(); + assert_eq!( + err.0, + "unknown job template 'nonexistent'. 
defined: debug, default, gpu" + ); + } + + #[test] + fn load_missing_file() { + let dir = tempfile::tempdir().unwrap(); + let err = load(&dir.path().join("absent.toml")).unwrap_err(); + assert!(err.0.contains("run `solx init`")); + } + + #[test] + fn invalid_toml() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config(dir.path(), "default_shell = [unclosed array"); + let err = load(&p).unwrap_err(); + assert!(err.0.contains("invalid TOML")); + } + + #[test] + fn required_default_shell() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_template = \"default\"\n[jobs.default]\npartition = \"x\"\ntime = \"1-0\"\n", + ); + let err = load(&p).unwrap_err(); + assert!(err.0.contains("default_shell")); + assert!(err.0.contains("required key")); + } + + #[test] + fn required_default_template() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_shell = \"bash\"\n[jobs.default]\npartition = \"x\"\ntime = \"1-0\"\n", + ); + let err = load(&p).unwrap_err(); + assert!(err.0.contains("default_template")); + } + + #[test] + fn at_least_one_jobs_table() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_shell = \"bash\"\ndefault_template = \"x\"\n", + ); + let err = load(&p).unwrap_err(); + assert!(err + .0 + .contains("at least one [jobs.] 
table is required")); + } + + #[test] + fn default_template_must_exist() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_shell = \"bash\"\ndefault_template = \"missing\"\n\n[jobs.default]\npartition = \"x\"\ntime = \"1-0\"\n", + ); + let err = load(&p).unwrap_err(); + assert!(err + .0 + .contains("default_template='missing' is not defined under [jobs.*]")); + } + + #[test] + fn template_required_keys() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_shell = \"bash\"\ndefault_template = \"default\"\n\n[jobs.default]\npartition = \"x\"\n", + ); + let err = load(&p).unwrap_err(); + assert!(err.0.contains("`time`")); + } + + #[test] + fn extra_args_must_be_string_array() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config( + dir.path(), + "default_shell = \"bash\"\ndefault_template = \"default\"\n\n[jobs.default]\npartition = \"x\"\ntime = \"1-0\"\nextra_args = [1, 2]\n", + ); + let err = load(&p).unwrap_err(); + assert!(err.0.contains("extra_args")); + } + + #[test] + fn keep_match_include_only() { + let keep = KeepRules::new(&["/scratch/sparky/proj-a/**".to_string()], &[]); + assert!(keep.matches("/scratch/sparky/proj-a/data.csv")); + assert!(!keep.matches("/scratch/sparky/proj-b/data.csv")); + } + + #[test] + fn keep_exclude_carve_out() { + let keep = KeepRules::new( + &["/scratch/sparky/proj-a/**".to_string()], + &["**/__pycache__/**".to_string(), "**/.venv/**".to_string()], + ); + assert!(keep.matches("/scratch/sparky/proj-a/run/data.csv")); + assert!(!keep.matches("/scratch/sparky/proj-a/run/__pycache__/x.pyc")); + assert!(!keep.matches("/scratch/sparky/proj-a/.venv/lib/x.py")); + } + + #[test] + fn keep_bare_path_matches_dir_and_descendants() { + let keep = KeepRules::new(&["/scratch/sparky/proj-a".to_string()], &[]); + assert!(keep.matches("/scratch/sparky/proj-a")); + assert!(keep.matches("/scratch/sparky/proj-a/deep/file.bin")); + 
assert!(!keep.matches("/scratch/sparky/proj-ab")); + } + + #[test] + fn keep_exclude_dir_pattern_matches_descendant_dirs() { + // The config-sample shape: exclude ["**/__pycache__", "**/.venv"] + // must filter a flagged __pycache__ leaf directory. + let keep = KeepRules::new( + &["/scratch/sparky/proj/**".to_string()], + &["**/__pycache__".to_string(), "**/.venv".to_string()], + ); + assert!(keep.matches("/scratch/sparky/proj/run-1")); + assert!(!keep.matches("/scratch/sparky/proj/__pycache__")); + assert!(!keep.matches("/scratch/sparky/proj/sub/.venv")); + } + + #[test] + fn keep_requires_include() { + let mut table = toml::Table::new(); + table.insert( + "exclude".to_string(), + toml::Value::Array(vec![toml::Value::String("x".to_string())]), + ); + let err = parse_keep(Some(&toml::Value::Table(table)), "t").unwrap_err(); + assert!(err.0.contains("non-empty array")); + } + + #[test] + fn keep_absent_is_none() { + assert!(parse_keep(None, "t").unwrap().is_none()); + } + + #[test] + fn parse_duration_forms() { + assert_eq!(parse_duration("30s").unwrap(), 30); + assert_eq!(parse_duration("10m").unwrap(), 600); + assert_eq!(parse_duration("1h").unwrap(), 3600); + assert_eq!(parse_duration(" 5M ").unwrap(), 300); + } + + #[test] + fn parse_duration_invalid() { + let err = parse_duration("never").unwrap_err(); + assert_eq!( + err.0, + "invalid duration 'never'; use forms like \"30s\", \"10m\", \"1h\"" + ); + assert!(parse_duration("10x").is_err()); + assert!(parse_duration("m").is_err()); + assert!(parse_duration("10m extra").is_err()); + } + + #[test] + fn config_path_honors_xdg() { + // Avoid mutating process env in-test; exercise via integration tests. + // Here just confirm the suffix shape. 
+ let p = config_path(); + assert!(p.ends_with("solx/config.toml")); + } + + #[test] + fn starter_config_loads_clean() { + let dir = tempfile::tempdir().unwrap(); + let p = write_config(dir.path(), &starter_config_text("bash")); + let c = load(&p).unwrap(); + assert_eq!(c.default_shell, "bash"); + assert_eq!(c.default_template, "default"); + assert!(c.template("default").is_ok()); + assert!(c.template("debug").is_ok()); + assert!(c.keep.is_none()); // commented out in starter; user uncomments + } + + #[test] + fn starter_config_no_maintainer_name() { + let text = starter_config_text("bash"); + assert!(!text.contains("swan16")); + assert!(!text.contains("")); + assert!(text.contains("sparky")); // in the commented [keep] example + } + + #[test] + fn load_unreadable_is_config_error() { + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("config.toml"); + fs::create_dir(&p).unwrap(); // exists, but reading a directory fails + let err = load(&p).unwrap_err(); + assert!(err.0.contains("unable to read")); + } + + #[test] + fn toml_str_escapes() { + assert_eq!(toml_str("plain"), "\"plain\""); + assert_eq!(toml_str("a\"b"), "\"a\\\"b\""); + assert_eq!(toml_str("a\\b"), "\"a\\\\b\""); + assert_eq!(toml_str("a\tb"), "\"a\\tb\""); + assert_eq!(toml_str("a\u{1}b"), "\"a\\u0001b\""); + } +} diff --git a/solx/src/gitwild.rs b/solx/src/gitwild.rs new file mode 100644 index 0000000..10f83f5 --- /dev/null +++ b/solx/src/gitwild.rs @@ -0,0 +1,458 @@ +//! Gitignore-style pattern matching for the config `[keep]` block. +//! +//! This is a port of Python `pathspec`'s `GitIgnoreSpec` (the matcher the +//! Python solx compiles keep rules with), so include/exclude decisions are +//! byte-identical between the two implementations. The dialect is gitignore, +//! not a general glob language: +//! +//! * `*` and `?` never cross a `/`; `**` spans directories. +//! * `[...]` is a character class; a class with no closing `]` makes the +//! 
whole pattern invalid, and an invalid pattern is discarded (it matches +//! nothing) — git's behavior. +//! * `{a,b}` braces are literal characters, not alternation. +//! * A pattern of exactly `/` matches nothing. +//! * A pattern with no `/` (or only a trailing one) matches at any depth; +//! one with an internal `/` is anchored to the root. +//! * `!` negates. The last matching pattern decides, except that an exact +//! (non-ancestor) match takes precedence over ancestor-directory matches — +//! git's re-include-from-excluded-directory edge case. +//! +//! Paths are matched as strings: one leading `/` (or a leading `./`) is +//! stripped, nothing else is canonicalized, and a pattern for a directory +//! also matches any path under it (including forms with a trailing slash). + +use regex::Regex; + +/// One compiled pattern line: negated or not, plus its anchored regex. +/// +/// The regex carries a `ps_d` capture group on the slash that separates the +/// matched directory from a descendant path; a match where `ps_d` +/// participates is an ancestor-directory match (lower precedence), one +/// without it is an exact match. +struct CompiledPattern { + include: bool, + regex: Regex, +} + +/// An ordered set of gitignore pattern lines compiled for matching. +pub struct GitIgnoreSpec { + patterns: Vec, +} + +/// The regex group name marking an ancestor-directory match. +const DIR_MARK: &str = "ps_d"; + +impl GitIgnoreSpec { + /// Compile pattern lines. Blank lines, comments, and invalid patterns + /// are no-ops. + pub fn from_lines(lines: I) -> Self + where + I: IntoIterator, + S: AsRef, + { + let mut patterns = Vec::new(); + for line in lines { + if let Some((raw_regex, include)) = pattern_to_regex(line.as_ref()) { + if let Ok(regex) = Regex::new(&raw_regex) { + patterns.push(CompiledPattern { include, regex }); + } + } + } + GitIgnoreSpec { patterns } + } + + /// Whether `path` is matched (included) by this spec. 
+ /// + /// Patterns are checked last-to-first; the first exact match decides, + /// and an ancestor-directory match is used only when no pattern matches + /// exactly. + pub fn match_file(&self, path: &str) -> bool { + let norm = normalize_file(path); + let mut dir_match: Option = None; + for pat in self.patterns.iter().rev() { + let Some(caps) = pat.regex.captures(norm) else { + continue; + }; + if caps.name(DIR_MARK).is_some() { + if dir_match.is_none() { + dir_match = Some(pat.include); + } + } else { + return pat.include; + } + } + dir_match.unwrap_or(false) + } +} + +/// Strip one leading `/` (absolute paths match root-anchored patterns) or a +/// leading `./`. +fn normalize_file(path: &str) -> &str { + if let Some(rest) = path.strip_prefix('/') { + rest + } else if let Some(rest) = path.strip_prefix("./") { + rest + } else { + path + } +} + +/// Translate one gitignore pattern line into `(regex, include)`. +/// `None` for a no-op line: blank, comment, the bare `/` pattern, or a +/// pattern with invalid range notation (discarded, like git). +fn pattern_to_regex(pattern: &str) -> Option<(String, bool)> { + // Trailing whitespace is stripped unless escaped (`\ ` at end). + let pattern = if pattern.ends_with("\\ ") { + pattern + } else { + pattern.trim_end() + }; + + if pattern.is_empty() || pattern.starts_with('#') || pattern == "/" { + return None; + } + + let (include, pattern) = match pattern.strip_prefix('!') { + Some(rest) => (false, rest), + None => (true, pattern), + }; + + let mut segs: Vec<&str> = pattern.split('/').collect(); + let is_dir_pattern = segs.last() == Some(&""); + + // Normalize the segments. + if segs[0].is_empty() { + // Leading slash: anchored to the root. + segs.remove(0); + } else if segs.len() == 1 || (segs.len() == 2 && segs[1].is_empty()) { + // Single segment (with or without trailing slash): match at any + // depth, i.e. `**/{pattern}`. 
+ if segs[0] != "**" { + segs.insert(0, "**"); + } + } + if segs.is_empty() { + return None; + } + if segs.last() == Some(&"") { + // Trailing slash: match everything under the directory. + *segs.last_mut().unwrap() = "**"; + } + // Collapse consecutive `**` segments. + segs.dedup_by(|a, b| *a == "**" && *b == "**"); + + let dir_mark_cg = format!("(?P<{DIR_MARK}>/)"); + + // Whole-pattern special cases. + if segs == ["**"] { + return Some(( + if is_dir_pattern { + dir_mark_cg + } else { + ".".into() + }, + include, + )); + } + if segs == ["**", "*"] { + return Some((".".to_string(), include)); + } + if segs == ["**", "*", "**"] { + return Some(( + if is_dir_pattern { + dir_mark_cg + } else { + "/".into() + }, + include, + )); + } + + // Translate segment by segment. + let mut regex = String::new(); + let mut need_slash = false; + let end = segs.len() - 1; + for (i, seg) in segs.iter().enumerate() { + if *seg == "**" { + if i == 0 { + regex.push_str("^(?:.+/)?"); + } else if i < end { + regex.push_str("(?:/.+)?"); + need_slash = true; + } else { + // Trailing `**`: any descendant (dir patterns mark the + // separating slash). + if is_dir_pattern { + regex.push_str(&dir_mark_cg); + } else { + regex.push('/'); + } + } + } else { + if i == 0 { + regex.push('^'); + } + if need_slash { + regex.push('/'); + } + if *seg == "*" { + regex.push_str("[^/]+"); + } else { + regex.push_str(&translate_segment_glob(seg)?); + } + if i == end { + // Match the path itself, or anything under it. + regex.push_str(&format!("(?:{dir_mark_cg}|$)")); + } + need_slash = true; + } + } + Some((regex, include)) +} + +/// Translate one path-segment glob to a regex fragment. `None` when the +/// segment carries invalid range notation (an unclosed `[`), which discards +/// the whole pattern. 
+fn translate_segment_glob(seg: &str) -> Option { + let chars: Vec = seg.chars().collect(); + let mut regex = String::new(); + let mut escape = false; + let mut i = 0; + while i < chars.len() { + let c = chars[i]; + i += 1; + + if escape { + escape = false; + push_literal(&mut regex, c); + } else if c == '\\' { + escape = true; + } else if c == '*' { + regex.push_str("[^/]*"); + } else if c == '?' { + regex.push_str("[^/]"); + } else if c == '[' { + // Character class: find the closing bracket. A leading `!`/`^` + // negates; a `]` right after the (optional) negation is literal. + let mut j = i; + if j < chars.len() && (chars[j] == '!' || chars[j] == '^') { + j += 1; + } + if j < chars.len() && chars[j] == ']' { + j += 1; + } + while j < chars.len() && chars[j] != ']' { + j += 1; + } + if j >= chars.len() { + // Unclosed class: invalid range notation, discard pattern. + return None; + } + j += 1; // one past the closing bracket + regex.push('['); + if chars[i] == '!' || chars[i] == '^' { + regex.push('^'); + i += 1; + } + // Copy the class body. Backslashes are literal characters here; + // characters this regex dialect treats specially inside a class + // (`]` at the start, `[`, `&`, `~`) are escaped so the class + // keeps plain gitignore semantics (ranges via `-` still work). + for (k, &b) in chars[i..j].iter().enumerate() { + match b { + '\\' => regex.push_str("\\\\"), + ']' if k + 1 < j - i => regex.push_str("\\]"), + '[' | '&' | '~' => { + regex.push('\\'); + regex.push(b); + } + _ => regex.push(b), + } + } + i = j; + } else { + push_literal(&mut regex, c); + } + } + if escape { + // Trailing bare backslash: invalid pattern. + return None; + } + Some(regex) +} + +/// Append `c` to `regex` as a literal character. +fn push_literal(regex: &mut String, c: char) { + if matches!( + c, + '\\' | '.' | '+' | '*' | '?' 
| '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' + ) { + regex.push('\\'); + } + regex.push(c); +} + +#[cfg(test)] +mod tests { + use super::*; + + fn spec(lines: &[&str]) -> GitIgnoreSpec { + GitIgnoreSpec::from_lines(lines) + } + + // Vectors generated from Python pathspec's GitIgnoreSpec — the + // reference implementation this module must agree with. + // Each row: (pattern lines, path, expected match). + const VECTORS: &[(&[&str], &str, bool)] = &[ + // Braces are literal characters, never alternation. + (&["/scratch/sparky/run{1,2}"], "/scratch/sparky/run1", false), + (&["/scratch/sparky/run{1,2}"], "/scratch/sparky/run2", false), + ( + &["/scratch/sparky/run{1,2}"], + "/scratch/sparky/run1/sub", + false, + ), + ( + &["/scratch/sparky/run{1,2}"], + "/scratch/sparky/run{1,2}", + true, + ), + ( + &["/scratch/sparky/run{1,2}"], + "/scratch/sparky/run{1,2}/sub", + true, + ), + // A pattern of exactly `/` matches nothing. + (&["/"], "/scratch/sparky/anything", false), + (&["/"], "/x", false), + (&["/"], "/", false), + // Unclosed `[` discards the pattern entirely. + (&["/scratch/sparky/run[1"], "/scratch/sparky/run[1", false), + (&["/scratch/sparky/run[1"], "/scratch/sparky/run1", false), + (&["/scratch/sparky/run[1"], "/scratch/sparky/run[", false), + // Trailing-slash path forms still match their include root. + (&["/scratch/sparky/proj-a"], "/scratch/sparky/proj-a", true), + (&["/scratch/sparky/proj-a"], "/scratch/sparky/proj-a/", true), + ( + &["/scratch/sparky/proj-a"], + "/scratch/sparky/proj-a//", + true, + ), + ( + &["/scratch/sparky/proj-a"], + "/scratch/sparky/proj-a/.", + true, + ), + ( + &["/scratch/sparky/proj-a"], + "/scratch/sparky/proj-a/deep/file.bin", + true, + ), + // Only one leading slash is stripped; `//x` is not `/x`. + (&["/scratch/sparky/proj-a"], "//scratch/sparky/proj", false), + ( + &["/scratch/sparky/proj-a"], + "/scratch/sparky/proj-ab", + false, + ), + // `dir/**` matches strict descendants, not the directory itself. 
+ ( + &["/scratch/sparky/proj-b/**"], + "/scratch/sparky/proj-b", + false, + ), + ( + &["/scratch/sparky/proj-b/**"], + "/scratch/sparky/proj-b/x", + true, + ), + // Negation: last match wins, exact beats ancestor-directory. + ( + &["/scratch/sparky/proj", "!**/__pycache__"], + "/scratch/sparky/proj/run", + true, + ), + ( + &["/scratch/sparky/proj", "!**/__pycache__"], + "/scratch/sparky/proj/__pycache__", + false, + ), + ( + &["/scratch/sparky/proj", "!**/__pycache__"], + "/scratch/sparky/proj/a/__pycache__", + false, + ), + ( + &["/scratch/sparky/proj", "!**/__pycache__"], + "/scratch/sparky/x", + false, + ), + (&["/a", "!/a/tmp", "/a/tmp/keepme"], "/a/x", true), + (&["/a", "!/a/tmp", "/a/tmp/keepme"], "/a/tmp", false), + (&["/a", "!/a/tmp", "/a/tmp/keepme"], "/a/tmp/other", false), + (&["/a", "!/a/tmp", "/a/tmp/keepme"], "/a/tmp/keepme", true), + ( + &["/a", "!/a/tmp", "/a/tmp/keepme"], + "/a/tmp/keepme/sub", + true, + ), + // Directory-only patterns (trailing slash) skip the bare path. + (&["/scratch/sparky/exp*/"], "/scratch/sparky/exp1", false), + (&["/scratch/sparky/exp*/"], "/scratch/sparky/exp1/f", true), + (&["/scratch/sparky/exp*/"], "/scratch/sparky/exp", false), + // Character classes. + (&["/scratch/sparky/run[12]"], "/scratch/sparky/run1", true), + (&["/scratch/sparky/run[12]"], "/scratch/sparky/run2", true), + (&["/scratch/sparky/run[12]"], "/scratch/sparky/run3", false), + (&["/scratch/sparky/run[12]"], "/scratch/sparky/run12", false), + (&["/scratch/sparky/run[!1]"], "/scratch/sparky/run1", false), + (&["/scratch/sparky/run[!1]"], "/scratch/sparky/run2", true), + // `?` matches exactly one non-slash character. + (&["/scratch/sparky/r?n"], "/scratch/sparky/run", true), + (&["/scratch/sparky/r?n"], "/scratch/sparky/rn", false), + (&["/scratch/sparky/r?n"], "/scratch/sparky/r/n", false), + // `**` / `*` whole-pattern forms. 
+ (&["**"], "/anything", true), + (&["**"], "/a/b", true), + (&["*"], "/anything", true), + (&["*"], "/a/b", true), + (&["/scratch/**/deep"], "/scratch/deep", true), + (&["/scratch/**/deep"], "/scratch/a/deep", true), + (&["/scratch/**/deep"], "/scratch/a/b/deep", true), + (&["/scratch/**/deep"], "/scratchdeep", false), + // Anchoring rules for slash-less vs slash-ful patterns. + (&["bare-name"], "/scratch/sparky/bare-name", true), + (&["bare-name"], "/bare-name", true), + (&["bare-name"], "/x/bare-name/y", true), + (&["dir/sub"], "/dir/sub", true), + (&["dir/sub"], "/x/dir/sub", false), + (&["dir/sub"], "/dir/sub/y", true), + // Spaces and other shell-special characters are plain literals. + (&["/scratch/sparky/a b/c*"], "/scratch/sparky/a b/cx", true), + (&["/scratch/sparky/a b/c*"], "/scratch/sparky/a b/d", false), + // Comments and blanks are no-ops. + ( + &["# comment", "", "/scratch/sparky/p"], + "/scratch/sparky/p", + true, + ), + ]; + + #[test] + fn matches_pathspec_reference_vectors() { + for (lines, path, expected) in VECTORS { + let got = spec(lines).match_file(path); + assert_eq!( + got, *expected, + "patterns {lines:?} vs path {path:?}: got {got}, want {expected}" + ); + } + } + + #[test] + fn unclosed_class_in_negation_is_discarded() { + // The discarded `!`-pattern carves nothing out. + let s = spec(&["/scratch/sparky", "!/scratch/sparky/skip[1"]); + assert!(s.match_file("/scratch/sparky/skip[1")); + } +} diff --git a/solx/src/init.rs b/solx/src/init.rs new file mode 100644 index 0000000..b9b1503 --- /dev/null +++ b/solx/src/init.rs @@ -0,0 +1,99 @@ +//! `solx init` — write a starter `config.toml`. + +use std::io::Write; +use std::os::unix::fs::PermissionsExt; + +use serde_json::json; + +use crate::config as cfg; +use crate::output::{confirm, Out}; + +pub const SHELLS: [&str; 3] = ["bash", "zsh", "fish"]; + +/// Interactive first-run walkthrough. Returns the chosen `default_shell`, or +/// `None` if declined. TTY-only; plain-text prompts on stderr. 
+fn walkthrough(out: &Out) -> Option { + if !confirm("Walk through a quick setup?", false) { + return None; + } + + out.status("\nShell"); + let shell = loop { + eprint!( + "Which shell should `solx job jump` open on the compute node? \ + ({}) [bash] ", + SHELLS.join("/") + ); + let _ = std::io::stderr().flush(); + let mut line = String::new(); + if std::io::stdin().read_line(&mut line).is_err() { + break "bash".to_string(); + } + let answer = line.trim().to_string(); + if answer.is_empty() { + break "bash".to_string(); + } + if SHELLS.contains(&answer.as_str()) { + break answer; + } + out.status(&format!("please pick one of: {}", SHELLS.join(", "))); + }; + + Some(shell) +} + +pub fn cmd_init(force: bool, out: &Out) -> i32 { + let p = cfg::config_path(); + + if p.exists() && !force { + // Never block on the overwrite prompt in a non-interactive session. + if !out.interactive { + out.error(&format!( + "error: {} already exists. pass -f to overwrite.", + p.display() + )); + return 2; + } + if !confirm( + &format!("{} already exists. Overwrite?", p.display()), + false, + ) { + out.status("aborted"); + return 1; + } + } + + // Optional interactive walkthrough — skipped entirely in a + // non-interactive session (an agent/cron just gets the defaults, never a + // hung prompt). + let mut default_shell = "bash".to_string(); + if out.interactive { + if let Some(shell) = walkthrough(out) { + default_shell = shell; + } + } + + if let Some(parent) = p.parent() { + if let Err(e) = std::fs::create_dir_all(parent) { + out.error(&format!( + "error: unable to create {}: {e}", + parent.display() + )); + return 1; + } + } + let text = cfg::starter_config_text(&default_shell); + if let Err(e) = std::fs::write(&p, text) { + out.error(&format!("error: unable to write {}: {e}", p.display())); + return 1; + } + // Mode 0600 — config may eventually contain user-specific paths or + // mail-user etc.; keep it readable only by the owner. 
+ let _ = std::fs::set_permissions(&p, std::fs::Permissions::from_mode(0o600)); + + out.status("edit it with `solx config edit`, then `solx job start`."); + out.emit(&json!({"wrote": p.display().to_string()}), || { + Some(format!("wrote {}", p.display())) + }); + 0 +} diff --git a/solx/src/jobs.rs b/solx/src/jobs.rs new file mode 100644 index 0000000..173e2ef --- /dev/null +++ b/solx/src/jobs.rs @@ -0,0 +1,636 @@ +//! `solx job` subcommands: list, start, stop, jump, time. +//! +//! Output obeys [`crate::output::Out`]: JSON on a non-TTY stdout, plain +//! tables on a TTY, all diagnostics on stderr. Jobid resolution is +//! verb-aware (see [`crate::slurm::resolve_jobid`]): read/attach verbs +//! auto-pick the most recent job, the destructive `stop` never does, and +//! acting from inside an allocation carries a nesting / self-cancel guard. + +use serde_json::{json, Value}; + +use crate::config::Config; +use crate::output::Out; +use crate::slurm::{self, Job, Runner, Verb}; + +// --- shared rendering ------------------------------------------------------- + +const JOB_COLUMNS: [&str; 7] = [ + "JOBID", + "NAME", + "STATE", + "TIME", + "LEFT", + "PARTITION", + "NODE / REASON", +]; + +/// Render jobs as plain aligned columns (kubectl-style) for a TTY. 
+fn jobs_table(jobs: &[Job]) -> String { + let rows: Vec<[&str; 7]> = jobs + .iter() + .map(|j| { + [ + j.job_id.as_str(), + j.name.as_str(), + j.state.as_str(), + j.time_used.as_str(), + j.time_left.as_str(), + j.partition.as_str(), + j.node_list.as_str(), + ] + }) + .collect(); + let mut widths: Vec = JOB_COLUMNS.iter().map(|c| c.len()).collect(); + for row in &rows { + for (i, cell) in row.iter().enumerate() { + widths[i] = widths[i].max(cell.chars().count()); + } + } + let render = |cells: &[&str; 7]| -> String { + let mut line = String::new(); + for (i, cell) in cells.iter().enumerate() { + if i > 0 { + line.push_str(" "); + } + line.push_str(cell); + if i + 1 < cells.len() { + for _ in cell.chars().count()..widths[i] { + line.push(' '); + } + } + } + line.trim_end().to_string() + }; + let mut out = vec![render(&JOB_COLUMNS)]; + out.extend(rows.iter().map(render)); + out.join("\n") +} + +fn jobs_payload(jobs: &[Job]) -> Value { + Value::Array( + jobs.iter() + .map(|j| { + json!({ + "job_id": j.job_id, + "name": j.name, + "state": j.state, + "time_used": j.time_used, + "time_left": j.time_left, + "partition": j.partition, + "node_list": j.node_list, + }) + }) + .collect(), + ) +} + +/// Surface a candidate set for a verb that won't auto-pick (stop). 
+fn print_candidates(out: &Out, jobs: &[Job], reason: &str) { + if out.json_mode { + out.json(&json!({"error": reason, "jobs": jobs_payload(jobs)})); + } else { + out.error(&format!("{reason} — specify a JOBID:")); + out.error(&jobs_table(jobs)); + } +} + +// --- list -------------------------------------------------------------------- + +pub fn cmd_list(runner: Runner, out: &Out) -> i32 { + let jobs = match slurm::squeue_user_jobs(None, runner) { + Ok(jobs) => jobs, + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + }; + out.emit(&jobs_payload(&jobs), || { + Some(if jobs.is_empty() { + "no jobs in queue".to_string() + } else { + jobs_table(&jobs) + }) + }); + 0 +} + +// --- start --------------------------------------------------------------------- + +/// The `job start` tail, parsed Click-style (see [`parse_start_tail`]). +#[derive(Debug, Default, PartialEq, Eq)] +pub struct StartTail { + pub template: Option, + pub dry_run: bool, + pub timeout: Option, + pub passthrough: Vec, + pub help: bool, +} + +/// Parse everything after `job start`. +/// +/// The grammar matches an ignore-unknown-options + allow-extra-args command: +/// +/// * `-n` / `--dry-run`, `-h` / `--help`, and `--timeout VALUE` (or +/// `--timeout=VALUE`) are consumed wherever they appear before the first +/// `--`. An explicit value on the flag form (`--dry-run=...`) is a usage +/// error. +/// * The first `--` is dropped; everything after it is treated as bare +/// tokens (no option parsing). +/// * The first unconsumed bare token — even one after `--` that looks like +/// a flag — becomes the TEMPLATE; every other leftover token is salloc +/// passthrough, in original order. 
pub fn parse_start_tail(args: &[String]) -> Result {
    let mut tail = StartTail::default();
    // Tokens not consumed as solx options, kept in their original order.
    let mut leftovers: Vec = Vec::new();
    // Set once the first `--` is seen; from then on every token is a leftover.
    let mut after_dashdash = false;
    let mut i = 0;
    while i < args.len() {
        let tok = &args[i];
        if after_dashdash {
            leftovers.push(tok.clone());
            i += 1;
            continue;
        }
        if tok == "--" {
            // The separator itself is dropped, not forwarded.
            after_dashdash = true;
            i += 1;
            continue;
        }
        if tok == "-n" || tok == "--dry-run" {
            tail.dry_run = true;
            i += 1;
            continue;
        }
        if tok.starts_with("--dry-run=") {
            // A flag takes no value: reject the `=` form rather than forward it.
            return Err("Option '--dry-run' does not take a value.".to_string());
        }
        if tok == "-h" || tok == "--help" {
            tail.help = true;
            i += 1;
            continue;
        }
        if tok == "--timeout" {
            // The value is the next token, taken as-is; `?` returns the usage
            // error when `--timeout` is the last token.
            let value = args
                .get(i + 1)
                .ok_or("Option '--timeout' requires an argument.")?;
            tail.timeout = Some(value.clone());
            i += 2; // skip the flag and its value
            continue;
        }
        if let Some(v) = tok.strip_prefix("--timeout=") {
            tail.timeout = Some(v.to_string());
            i += 1;
            continue;
        }
        if tok.starts_with('-') && tok.len() > 1 && !tok.starts_with("--") {
            // A short-option cluster: peel known shorts, keep the rest.
            // Only `n` is recognized inside a cluster; every other letter
            // (including `h`) is kept and re-emitted as one `-…` leftover.
            let mut unknown = String::new();
            for c in tok.chars().skip(1) {
                if c == 'n' {
                    tail.dry_run = true;
                } else {
                    unknown.push(c);
                }
            }
            if !unknown.is_empty() {
                leftovers.push(format!("-{unknown}"));
            }
            i += 1;
            continue;
        }
        // Unknown long option or bare token: leave it for template/passthrough.
        leftovers.push(tok.clone());
        i += 1;
    }
    // The first leftover becomes the template; the remainder is passthrough.
    let mut it = leftovers.into_iter();
    tail.template = it.next();
    tail.passthrough = it.collect();
    Ok(tail)
}

/// `solx job start`: request an allocation from a config template, or
/// preview the command that would be run.
///
/// `template_name` falls back to `config.default_template`, and
/// `timeout_override` falls back to `config.start_timeout_seconds`. With
/// `dry_run` the argv built by `slurm::salloc_argv` is reported and nothing
/// is submitted.
///
/// Returns the exit code: 0 on success or dry-run, 1 when the template
/// lookup or `slurm::run_salloc` fails.
pub fn cmd_start(
    config: &Config,
    template_name: Option<&str>,
    dry_run: bool,
    timeout_override: Option,
    passthrough: &[String],
    salloc_runner: Option,
    out: &Out,
) -> i32 {
    let name = template_name
        .unwrap_or(&config.default_template)
        .to_string();
    let template = match config.template(&name) {
        Ok(t) => t,
        Err(e) => {
            out.error(&format!("error: {e}"));
            return 1;
        }
    };

    let argv = slurm::salloc_argv(template, passthrough);

    if dry_run {
        // Preview only: report the argv and return before anything is run.
        out.status("dry-run — would run:");
        out.emit(
            &json!({"dry_run": true, "template": name, "argv": argv}),
            || Some(format!(" {}", slurm::shell_join(&argv))),
        );
        return 0;
    }

    // An explicit override wins over the configured wait cap.
    let timeout = timeout_override.unwrap_or(config.start_timeout_seconds);
    out.status(&format!("submitting: {}", slurm::shell_join(&argv)));
    out.status(&format!(
        "waiting up to {timeout}s for the queue to grant the allocation…"
    ));
    let jobid = match slurm::run_salloc(&argv, timeout, salloc_runner) {
        Ok(j) => j,
        Err(e) => {
            out.error(&format!("error: {e}"));
            return 1;
        }
    };

    out.status(&format!("allocated job {jobid}"));
    out.status(&format!(
        "attach: solx job jump {jobid} (or: srun --jobid={jobid} --overlap --pty {})",
        config.default_shell
    ));
    // JSON mode additionally gets a machine-readable record of the allocation.
    if out.json_mode {
        out.json(&json!({"jobid": jobid, "template": name}));
    }
    0
}

// --- stop ----------------------------------------------------------------------

/// `solx job stop`: cancel a job after resolving which job is meant.
///
/// Returns the exit code:
///
/// * 0 — the job was cancelled, or the `dry_run` preview was printed;
/// * 1 — job resolution failed, the confirmation prompt was declined, or
///   the cancel command exited non-zero;
/// * 2 — `yes` and `dry_run` were both set, several candidate jobs exist
///   (they are listed), or the session is non-interactive and `yes` was not
///   given.
pub fn cmd_stop(
    jobid_arg: Option<&str>,
    yes: bool,
    dry_run: bool,
    runner: Runner,
    out: &Out,
) -> i32 {
    if yes && dry_run {
        out.error("error: --yes and --dry-run are mutually exclusive");
        return 2;
    }

    let res = match slurm::resolve_jobid(jobid_arg, Verb::Stop, None, None, runner) {
        Ok(r) => r,
        Err(e) => {
            out.error(&format!("error: {e}"));
            return 1;
        }
    };
    if let Some(err) = &res.error {
        out.error(&format!("error: {err}"));
        return 1;
    }
    if res.ambiguous {
        // Stop never picks for the user: show the candidates and bail out.
        print_candidates(out, &res.candidates, "multiple jobs running");
        return 2;
    }

    // Panics if the resolver reported neither an error, an ambiguity, nor a
    // job id.
    let jid = res.job_id.clone().expect("resolved job id");
    let argv = slurm::scancel_argv(&jid);

    // Acting on the job you're sitting inside ends this session — surface it
    // in every path, including a dry-run preview, so the resolver's decision
    // is never a surprise.
    let self_cancel = res.acting_on_current();
    if self_cancel {
        out.status(&format!(
            "warning: job {jid} is the allocation you're inside ($SLURM_JOB_ID); \
             cancelling it will end this session."
        ));
    }

    if dry_run {
        out.status("dry-run — would run:");
        out.emit(
            &json!({
                "dry_run": true,
                "jobid": jid,
                "argv": argv,
                "inside_allocation": self_cancel,
            }),
            || Some(format!(" {}", slurm::shell_join(&argv))),
        );
        return 0;
    }

    if !yes {
        // Without `yes`, a confirmation is required; never block on a prompt
        // when the session is not interactive.
        if !out.interactive {
            out.error(&format!(
                "error: non-interactive session — pass -y to cancel job {jid}, \
                 or -n to preview."
            ));
            return 2;
        }
        let prompt = if self_cancel {
            format!("Cancel job {jid} (the one you're inside)?")
        } else {
            format!("Cancel job {jid}?")
        };
        if !crate::output::confirm(&prompt, false) {
            out.status("aborted");
            return 1;
        }
    }

    // The runner yields a triple; only the exit code and the error text are
    // used here, the middle element is ignored.
    let (code, _, err) = runner(&argv);
    if code != 0 {
        out.error(&format!("scancel failed: {}", err.trim()));
        return 1;
    }
    out.status(&format!("cancelled job {jid}"));
    if out.json_mode {
        out.json(&json!({"cancelled": jid}));
    }
    0
}

// --- jump ----------------------------------------------------------------------

/// Drop the user into a shell on the job's compute node.
///
/// Exec-replaces the current process with `srun --pty` so the user's shell
/// history and signal handling are clean.
///
/// Nesting heads-up: attaching from *inside* an allocation ($SLURM_JOB_ID
/// set) spawns a nested step. Unlike `stop`, attach is non-destructive and
/// Ctrl-D-recoverable, so the command WARNS-AND-PROCEEDS (not refuses) —
/// `-q/--quiet` silences the heads-up.
///
/// Returns 1 when job resolution fails; otherwise returns whatever
/// `exec_replace` returns.
pub fn cmd_jump(
    config: &Config,
    jobid_arg: Option<&str>,
    quiet: bool,
    runner: Runner,
    out: &Out,
) -> i32 {
    let res = match slurm::resolve_jobid(jobid_arg, Verb::Jump, None, None, runner) {
        Ok(r) => r,
        Err(e) => {
            out.error(&format!("error: {e}"));
            return 1;
        }
    };
    if let Some(err) = &res.error {
        out.error(&format!("error: {err}"));
        return 1;
    }

    // All heads-up messages below are advisory; none of them stops the attach.
    if !quiet {
        if res.acting_on_current() {
            out.status(&format!(
                "already inside job {} — opening a nested srun step here burns \
                 extra resources. `exit` to leave, or pass another JOBID. \
                 Attaching anyway.",
                res.inside_job_id.as_deref().unwrap_or("")
            ));
        } else if res.inside {
            out.status(&format!(
                "nesting: you're inside job {}; attaching to job {} opens a \
                 step on another allocation. Proceeding.",
                res.inside_job_id.as_deref().unwrap_or(""),
                res.job_id.as_deref().unwrap_or("")
            ));
        }
        if res.source == "most-recent" {
            out.status(&format!(
                "multiple running jobs; attaching to most recent {} \
                 (pass JOBID to choose another)",
                res.job_id.as_deref().unwrap_or("")
            ));
        }
    }

    // Panics if the resolver reported no error and yet no job id.
    let jid = res.job_id.expect("resolved job id");
    let argv = slurm::srun_pty_argv(&jid, &config.default_shell);
    exec_replace(&argv, out)
}

/// Replace the current process with `argv` (returns only on failure).
+fn exec_replace(argv: &[String], out: &Out) -> i32 { + use std::os::unix::process::CommandExt; + + let err = std::process::Command::new(&argv[0]).args(&argv[1..]).exec(); + out.error(&format!("error: failed to exec {}: {err}", argv[0])); + 1 +} + +// --- time ---------------------------------------------------------------------- + +pub fn cmd_time(jobid_arg: Option<&str>, runner: Runner, out: &Out) -> i32 { + let res = match slurm::resolve_jobid(jobid_arg, Verb::Time, None, None, runner) { + Ok(r) => r, + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + }; + if let Some(err) = &res.error { + out.error(&format!("error: {err}")); + return 1; + } + if res.source == "most-recent" { + out.status(&format!( + "multiple jobs; showing most recent {} (pass JOBID to choose another)", + res.job_id.as_deref().unwrap_or("") + )); + } + + let jid = res.job_id.expect("resolved job id"); + let argv = slurm::squeue_time_left_argv(&jid); + let (code, out_text, err) = runner(&argv); + if code != 0 || out_text.trim().is_empty() { + let detail = if err.trim().is_empty() { + "(empty output)".to_string() + } else { + err.trim().to_string() + }; + out.error(&format!("squeue failed for jobid {jid}: {detail}")); + return 1; + } + let time_left = out_text.trim().to_string(); + out.emit(&json!({"jobid": jid, "time_left": time_left}), || { + Some(time_left.clone()) + }); + 0 +} + +#[cfg(test)] +mod tests { + use super::*; + + fn strs(items: &[&str]) -> Vec { + items.iter().map(|s| s.to_string()).collect() + } + + // ---- start-tail parsing (the Click-faithful algorithm) ------------------ + + #[test] + fn start_tail_empty() { + let t = parse_start_tail(&[]).unwrap(); + assert_eq!(t, StartTail::default()); + } + + #[test] + fn start_tail_dry_run_only() { + let t = parse_start_tail(&strs(&["-n"])).unwrap(); + assert!(t.dry_run); + assert_eq!(t.template, None); + assert!(t.passthrough.is_empty()); + } + + #[test] + fn start_tail_template_and_flags() { + let t = 
parse_start_tail(&strs(&["gpu", "-n"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert!(t.dry_run); + } + + #[test] + fn start_tail_dashdash_passthrough() { + let t = parse_start_tail(&strs(&["gpu", "-n", "--", "--mem=128G"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert_eq!(t.passthrough, ["--mem=128G"]); + } + + #[test] + fn start_tail_unknown_options_interleaved() { + let t = parse_start_tail(&strs(&["gpu", "-n", "--mem=128G", "-c", "8"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert!(t.dry_run); + assert_eq!(t.passthrough, ["--mem=128G", "-c", "8"]); + } + + #[test] + fn start_tail_first_token_after_dashdash_is_template() { + // Even an option-looking token becomes the template after `--`. + let t = parse_start_tail(&strs(&["-n", "--", "--mem=128G"])).unwrap(); + assert!(t.dry_run); + assert_eq!(t.template.as_deref(), Some("--mem=128G")); + assert!(t.passthrough.is_empty()); + } + + #[test] + fn start_tail_timeout_separate_and_equals() { + let t = parse_start_tail(&strs(&["--timeout", "30s", "-n"])).unwrap(); + assert_eq!(t.timeout.as_deref(), Some("30s")); + assert!(t.dry_run); + let t = parse_start_tail(&strs(&["--timeout=1h", "gpu"])).unwrap(); + assert_eq!(t.timeout.as_deref(), Some("1h")); + assert_eq!(t.template.as_deref(), Some("gpu")); + } + + #[test] + fn start_tail_timeout_missing_value() { + let err = parse_start_tail(&strs(&["--timeout"])).unwrap_err(); + assert_eq!(err, "Option '--timeout' requires an argument."); + } + + #[test] + fn start_tail_second_dashdash_is_literal() { + let t = parse_start_tail(&strs(&["--", "gpu", "--", "-x"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert_eq!(t.passthrough, ["--", "-x"]); + } + + #[test] + fn start_tail_options_after_dashdash_not_consumed() { + let t = parse_start_tail(&strs(&["gpu", "--", "-n", "--timeout", "5m"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert!(!t.dry_run); + 
assert_eq!(t.timeout, None); + assert_eq!(t.passthrough, ["-n", "--timeout", "5m"]); + } + + #[test] + fn start_tail_short_cluster_peels_known() { + let t = parse_start_tail(&strs(&["-nc"])).unwrap(); + assert!(t.dry_run); + assert_eq!(t.template.as_deref(), Some("-c")); + } + + #[test] + fn start_tail_bundled_dry_run_shorts() { + // `-nn` unbundles to two dry-run flags (golden js-bundled-shorts). + let t = parse_start_tail(&strs(&["-nn"])).unwrap(); + assert!(t.dry_run); + assert_eq!(t.template, None); + assert!(t.passthrough.is_empty()); + } + + #[test] + fn start_tail_dashdash_shields_dry_run_for_salloc() { + // golden js-dd-shield-n / js-dd-shield-n4: with the template slot + // filled, `--` forwards -n (and its value) to salloc. + let t = parse_start_tail(&strs(&["gpu", "--", "-n"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert!(!t.dry_run); + assert_eq!(t.passthrough, ["-n"]); + + let t = parse_start_tail(&strs(&["gpu", "--", "-n", "4"])).unwrap(); + assert_eq!(t.passthrough, ["-n", "4"]); + } + + #[test] + fn start_tail_dashdash_option_fills_template_slot() { + // golden js-dd-shield-timeout: the first token after `--` becomes + // the template even when it looks like a flag. + let t = parse_start_tail(&strs(&["--", "--timeout", "30s"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("--timeout")); + assert_eq!(t.timeout, None); + assert_eq!(t.passthrough, ["30s"]); + } + + #[test] + fn start_tail_double_dashdash_forwards_literal() { + // golden js-dd-dd: only the first `--` is consumed. 
+ let t = parse_start_tail(&strs(&["gpu", "-n", "--", "--mem=1G", "--", "-c", "2"])).unwrap(); + assert_eq!(t.template.as_deref(), Some("gpu")); + assert!(t.dry_run); + assert_eq!(t.passthrough, ["--mem=1G", "--", "-c", "2"]); + } + + #[test] + fn start_tail_dry_run_with_value_is_usage_error() { + let err = parse_start_tail(&strs(&["--dry-run=true"])).unwrap_err(); + assert_eq!(err, "Option '--dry-run' does not take a value."); + let err = parse_start_tail(&strs(&["gpu", "--dry-run="])).unwrap_err(); + assert_eq!(err, "Option '--dry-run' does not take a value."); + } + + #[test] + fn start_tail_short_help_token() { + let t = parse_start_tail(&strs(&["-h"])).unwrap(); + assert!(t.help); + // After `--`, -h is passthrough-bound, not help. + let t = parse_start_tail(&strs(&["gpu", "--", "-h"])).unwrap(); + assert!(!t.help); + assert_eq!(t.passthrough, ["-h"]); + } + + // ---- table rendering ---------------------------------------------------- + + #[test] + fn jobs_table_aligns_columns() { + let jobs = vec![Job { + job_id: "54800001".to_string(), + name: "solx-default".to_string(), + state: "RUNNING".to_string(), + time_used: "1:23".to_string(), + time_left: "2-03:04:05".to_string(), + partition: "general".to_string(), + node_list: "sc042".to_string(), + }]; + let table = jobs_table(&jobs); + let lines: Vec<&str> = table.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].starts_with("JOBID")); + assert!(lines[1].starts_with("54800001 solx-default")); + } +} diff --git a/solx/src/keep.rs b/solx/src/keep.rs new file mode 100644 index 0000000..0663337 --- /dev/null +++ b/solx/src/keep.rs @@ -0,0 +1,910 @@ +//! `solx keep` — renew scratch files Sol has flagged, filtered by `[keep]`. +//! +//! Read Sol's warning CSVs from `--csv-dir`, intersect the flagged +//! directories with the `[keep]` include/exclude globs from config, and +//! refresh timestamps (`touch -a -m -c` semantics) on only the intersection. +//! 
Only what Sol has explicitly flagged is renewed — never a wholesale +//! `/scratch` walk. +//! +//! Execution is file-level-sharded: a streaming pipeline over one worker +//! pool — enumerate a kept directory, split its files into evenly-sized +//! batches, and touch the batches across the pool. A single huge directory +//! fans out into many batches, so `-j` scales the parallelism of the whole +//! run including its largest directory, not just the count of directories. +//! +//! This is metadata-heavy NFS I/O. On Sol run it on a compute node or the +//! DTN (`ssh soldtn`), not a throttled login node. + +use std::collections::HashSet; +use std::collections::VecDeque; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::{Condvar, Mutex}; + +use filetime::FileTime; +use serde_json::{json, Value}; + +use crate::config::KeepRules; +use crate::output::{confirm, to_python_json, Out}; + +pub const STAGE_ORDER: [&str; 3] = ["pending", "over90", "inactive"]; +pub const STAGES_ALL: &str = "all"; + +pub fn stage_file(stage: &str) -> &'static str { + match stage { + "pending" => "scratch-dirs-pending-removal.csv", + "over90" => "scratch-dirs-over-90days.csv", + "inactive" => "scratch-dirs-inactive.csv", + _ => unreachable!("stage validated by the caller"), + } +} + +/// Files per touch shard. Big enough that per-batch overhead is negligible, +/// small enough that one huge directory fans out into many batches and +/// keeps every worker busy. +pub const BATCH: usize = 2000; + +/// Cap on how many dirs are inlined into a JSON payload. Sol's warning CSVs +/// can list thousands of flagged dirs; emitting them all makes a +/// multi-megabyte document that blows an agent's context. The inlined +/// sample is capped and the true totals + a `*_truncated` flag are always +/// reported. Counts are always exact; the lists are a sample. +pub const JSON_LIST_CAP: usize = 100; + +/// The default `-j` worker count: `max(1, min(8, ncpus / 4))`. 
+/// +/// `ncpus` is the count of ONLINE system CPUs (`sysconf(_SC_NPROCESSORS_ONLN)`, +/// i.e. Python `os.cpu_count()` semantics), NOT the cgroup/affinity-limited +/// parallelism of the current process — inside a 4-core Slurm allocation on a +/// 128-CPU node the default is still 8. +pub fn default_jobs() -> u64 { + let n = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) }; + let cpus = if n > 0 { n as u64 } else { 2 }; + (cpus / 4).clamp(1, 8) +} + +/// The directories `solx keep` would touch (`kept`) vs filter out (`skipped`), +/// each tagged with the warning stage that flagged it. +#[derive(Debug, Default, Clone)] +pub struct Plan { + pub kept: Vec<(String, String)>, + pub skipped: Vec<(String, String)>, +} + +// --- planning ---------------------------------------------------------------- + +/// Return the `Directory` column from one of Sol's warning CSVs. +/// +/// A missing file is fine — Sol only drops the CSV when there's something +/// to flag. An empty result means nothing to do for that stage. An existing +/// file that can't be read or decoded is a hard error (the command must +/// fail loudly rather than treat the stage as "nothing flagged"). +/// +/// A UTF-8 BOM is treated as part of the first header cell's name (so a +/// BOM'd `Directory` header is not the `Directory` column and the file +/// yields no directories). 
+pub fn load_csv_dirs(csv_path: &Path) -> Result, String> { + if !csv_path.exists() { + return Ok(Vec::new()); + } + let read_err = + |e: &dyn std::fmt::Display| format!("unable to read {}: {e}", csv_path.display()); + let has_bom = std::fs::File::open(csv_path) + .and_then(|mut f| { + use std::io::Read; + let mut head = [0u8; 3]; + let n = f.read(&mut head)?; + Ok(n == 3 && head == [0xEF, 0xBB, 0xBF]) + }) + .map_err(|e| read_err(&e))?; + let mut reader = csv::ReaderBuilder::new() + .flexible(true) + .from_path(csv_path) + .map_err(|e| read_err(&e))?; + let headers = reader.headers().map_err(|e| read_err(&e))?; + let dir_idx = match headers + .iter() + .enumerate() + .position(|(i, name)| name == "Directory" && !(i == 0 && has_bom)) + { + Some(i) => i, + None => return Ok(Vec::new()), + }; + let mut dirs = Vec::new(); + for record in reader.records() { + let record = record.map_err(|e| read_err(&e))?; + if let Some(d) = record.get(dir_idx) { + let d = d.trim(); + if !d.is_empty() { + dirs.push(d.to_string()); + } + } + } + Ok(dirs) +} + +/// Walk the chosen stages' CSVs and split flagged dirs into kept/skipped. +pub fn build_plan(csv_dir: &Path, stages: &[String], keep: &KeepRules) -> Result { + let mut plan = Plan::default(); + let mut seen: HashSet = HashSet::new(); + for stage in stages { + for d in load_csv_dirs(&csv_dir.join(stage_file(stage)))? { + if !seen.insert(d.clone()) { + continue; + } + let entry = (stage.clone(), d.clone()); + if keep.matches(&d) { + plan.kept.push(entry); + } else { + plan.skipped.push(entry); + } + } + } + Ok(plan) +} + +// --- enumeration + touching --------------------------------------------------- +// +// Two task kinds run on one worker pool: +// enumerate_dir -- walk a kept directory, return its files +// touch_files -- refresh timestamps on a batch of those files +// touch is the expensive half (one metadata write per file), so it is +// sharded into file batches and spread across the pool. 
+ +/// List every regular file under `directory` in one walk. +/// +/// Matches `find DIR -type f`: hidden files included, no ignore files +/// honored, symlinks not followed. Returns `(directory, files, message)`. +/// A path that isn't a directory (e.g. flagged then removed) is reported as +/// a benign skip, not an error. +pub fn enumerate_dir(directory: &str) -> (String, Vec, String) { + if !Path::new(directory).is_dir() { + return ( + directory.to_string(), + Vec::new(), + "skipped: not a directory".to_string(), + ); + } + let walker = ignore::WalkBuilder::new(directory) + .hidden(false) + .ignore(false) + .git_ignore(false) + .git_global(false) + .git_exclude(false) + .parents(false) + .follow_links(false) + .build(); + let mut files = Vec::new(); + let mut walk_error: Option = None; + for entry in walker { + match entry { + Ok(e) => { + if e.file_type().is_some_and(|t| t.is_file()) { + files.push(e.into_path()); + } + } + Err(e) => walk_error = Some(e.to_string()), + } + } + if let Some(msg) = walk_error { + return (directory.to_string(), Vec::new(), msg); + } + (directory.to_string(), files, "ok".to_string()) +} + +/// Refresh atime+mtime on a batch of files (`touch -a -m -c` semantics). +/// +/// Returns `(files_attempted, errors, message)`. A file deleted between +/// enumeration and touch is silently skipped, not an error, and nothing is +/// ever created. A real failure (permission, I/O) is counted and surfaced. +pub fn touch_files(paths: &[PathBuf]) -> (usize, usize, String) { + if paths.is_empty() { + return (0, 0, "ok".to_string()); + } + let now = FileTime::now(); + let mut errors = 0; + let mut msg = "ok".to_string(); + for p in paths { + match filetime::set_file_times(p, now, now) { + Ok(()) => {} + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + errors = 1; + msg = format!("touch {}: {e}", p.display()); + } + } + } + (paths.len(), errors, msg) +} + +/// Split a flat file list into evenly-sized batches for the touch pool. 
+pub fn shard(files: Vec, batch_size: usize) -> Vec> { + if files.is_empty() { + return Vec::new(); + } + let mut batches = Vec::with_capacity(files.len().div_ceil(batch_size)); + let mut current = Vec::with_capacity(batch_size.min(files.len())); + for f in files { + current.push(f); + if current.len() == batch_size { + batches.push(std::mem::take(&mut current)); + } + } + if !current.is_empty() { + batches.push(current); + } + batches +} + +// --- command ------------------------------------------------------------------- + +pub struct KeepOptions<'a> { + pub csv_dir: Option, + pub stage: String, + pub jobs_n: u64, + pub yes: bool, + pub dry_run: bool, + pub verbose: bool, + pub config_keep: Option<&'a KeepRules>, +} + +pub fn cmd_keep(opts: &KeepOptions, out: &Out) -> i32 { + if opts.yes && opts.dry_run { + out.error("error: --yes and --dry-run are mutually exclusive"); + return 2; + } + + // The keep-list comes from the config `[keep]` block — the single source + // of truth. + let keep_rules: &KeepRules = match opts.config_keep { + Some(rules) => rules, + None => { + out.error("error: no [keep] block in config. 
add one with `solx config edit`."); + return 2; + } + }; + + let csv_dir = opts.csv_dir.clone().unwrap_or_else(crate::config::home_dir); + if !csv_dir.is_dir() { + out.error(&format!( + "error: --csv-dir {} is not a directory \ + (Sol drops the warning CSVs in $HOME).", + csv_dir.display() + )); + return 2; + } + let stages: Vec = if opts.stage == STAGES_ALL { + STAGE_ORDER.iter().map(|s| s.to_string()).collect() + } else { + vec![opts.stage.clone()] + }; + + let plan = match build_plan(&csv_dir, &stages, keep_rules) { + Ok(p) => p, + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + }; + if let Err(e) = report_plan(out, &plan, &csv_dir, &stages, opts.verbose) { + out.error(&format!("error: {e}")); + return 1; + } + + if plan.kept.is_empty() { + if out.json_mode { + // Still emit a document so an agent gets structured output, not + // empty stdout, when nothing is flagged. + match plan_json(&plan, &csv_dir, &stages, opts.dry_run) { + Ok(doc) => out.json(&doc), + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + } + } else { + out.status("no flagged directories matched — nothing to do."); + } + return 0; + } + + if opts.dry_run { + if out.json_mode { + match plan_json(&plan, &csv_dir, &stages, true) { + Ok(doc) => out.json(&doc), + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + } + } + return 0; + } + + if !opts.yes { + // Destructive: never block on a prompt in a non-interactive session. 
+ if !out.interactive { + out.error(&format!( + "error: non-interactive session — pass -y to renew {} \ + directories, or -n to preview.", + plan.kept.len() + )); + return 2; + } + if !confirm( + &format!("Touch mtimes on {} directories?", plan.kept.len()), + false, + ) { + out.status("aborted"); + return 1; + } + } + + let (total_files, failures) = execute(&plan, opts.jobs_n, out); + + if out.json_mode { + let kept_truncated = plan.kept.len() > JSON_LIST_CAP; + let mut summary = json!({ + "renewed": true, + "dirs": plan.kept.len(), + "files_touched": total_files, + "failures": failures, + "kept_truncated": kept_truncated, + "kept": plan.kept.iter().take(JSON_LIST_CAP).map(|(_, d)| d.clone()).collect::>(), + }); + if kept_truncated { + match dump_full_plan(&plan, &csv_dir, &stages) { + Ok(path) => summary["full_plan_path"] = json!(path), + Err(e) => { + out.error(&format!("error: {e}")); + return 1; + } + } + } + out.json(&summary); + } else { + let failed = if failures > 0 { + format!(" · {failures} failed") + } else { + String::new() + }; + out.status(&format!( + "done {} dirs · {total_files} files touched{failed}", + plan.kept.len() + )); + } + if failures > 0 { + 1 + } else { + 0 + } +} + +/// Print the plan summary to stderr (human) — stdout stays the data channel. 
+fn report_plan( + out: &Out, + plan: &Plan, + csv_dir: &Path, + stages: &[String], + verbose: bool, +) -> Result<(), String> { + if out.json_mode { + return Ok(()); + } + out.status(&format!( + "csv-dir: {} stages: {}", + csv_dir.display(), + stages.join(", ") + )); + out.status(&format!( + "plan: {} kept, {} skipped", + plan.kept.len(), + plan.skipped.len() + )); + if plan.kept.len() > JSON_LIST_CAP || plan.skipped.len() > JSON_LIST_CAP { + let path = dump_full_plan(plan, csv_dir, stages)?; + out.status(&format!( + "full plan ({} dirs): {path}", + plan.kept.len() + plan.skipped.len() + )); + } + if verbose { + if !plan.kept.is_empty() { + out.status("kept:"); + for (stage, d) in plan.kept.iter().take(20) { + out.status(&format!(" {stage:>9} {d}")); + } + if plan.kept.len() > 20 { + out.status(&format!(" … and {} more", plan.kept.len() - 20)); + } + } + if !plan.skipped.is_empty() { + out.status("skipped (flagged by Sol but not in [keep]):"); + for (stage, d) in plan.skipped.iter().take(20) { + out.status(&format!(" {stage:>9} {d}")); + } + } + } + Ok(()) +} + +/// Bounded plan document: exact counts, a capped sample of each list. +/// +/// When either list is truncated, the COMPLETE plan is spilled to a temp +/// file and its path returned under `full_plan_path` — so the response +/// stays small enough for an agent's context while the full detail is one +/// `cat` away. 
+fn plan_json( + plan: &Plan, + csv_dir: &Path, + stages: &[String], + dry_run: bool, +) -> Result { + let entry = |(stage, dir): &(String, String)| json!({"stage": stage, "dir": dir}); + let kept_truncated = plan.kept.len() > JSON_LIST_CAP; + let skipped_truncated = plan.skipped.len() > JSON_LIST_CAP; + let mut doc = json!({ + "dry_run": dry_run, + "csv_dir": csv_dir.display().to_string(), + "stages": stages, + "kept_count": plan.kept.len(), + "skipped_count": plan.skipped.len(), + "kept_truncated": kept_truncated, + "skipped_truncated": skipped_truncated, + "kept": plan.kept.iter().take(JSON_LIST_CAP).map(entry).collect::>(), + "skipped": plan.skipped.iter().take(JSON_LIST_CAP).map(entry).collect::>(), + }); + if kept_truncated || skipped_truncated { + doc["full_plan_path"] = json!(dump_full_plan(plan, csv_dir, stages)?); + } + Ok(doc) +} + +/// Write the complete (untruncated) plan to `solx-keep-plan-*.json` in the +/// system temp dir; return its path. +/// +/// The file is created owner-only (0600) with bounded name-collision +/// retries, and stays on disk after the run. A creation or write failure is +/// an error (the document enumerates the user's scratch layout, so a +/// truncated or missing spill must never be advertised as complete). 
+fn dump_full_plan(plan: &Plan, csv_dir: &Path, stages: &[String]) -> Result { + let entry = |(stage, dir): &(String, String)| json!({"stage": stage, "dir": dir}); + let doc = json!({ + "csv_dir": csv_dir.display().to_string(), + "stages": stages, + "kept": plan.kept.iter().map(entry).collect::>(), + "skipped": plan.skipped.iter().map(entry).collect::>(), + }); + let temp = tempfile::Builder::new() + .prefix("solx-keep-plan-") + .suffix(".json") + .tempfile() + .map_err(|e| format!("unable to create the full-plan temp file: {e}"))?; + let (mut file, path) = temp + .keep() + .map_err(|e| format!("unable to keep the full-plan temp file: {e}"))?; + file.write_all(to_python_json(&doc).as_bytes()) + .map_err(|e| format!("unable to write {}: {e}", path.display()))?; + Ok(path.display().to_string()) +} + +// --- execution ------------------------------------------------------------------- + +enum Task { + Enumerate(String), + Touch(String, Vec), +} + +struct PoolState { + queue: VecDeque, + in_flight: usize, + total_files: usize, + failures: usize, +} + +/// Renew `plan.kept`. Returns `(files_touched, failures)`. +/// +/// With `jobs_n <= 1` runs serially (no pool — fast and deterministic for +/// small runs). Otherwise one worker pool runs both halves: enumerate a +/// directory, shard its files, and queue the batches as touch tasks, so a +/// single huge directory spreads its batches over every worker. 
+pub fn execute(plan: &Plan, jobs_n: u64, out: &Out) -> (usize, usize) { + if jobs_n <= 1 { + return execute_serial(plan, out); + } + + let state = Mutex::new(PoolState { + queue: plan + .kept + .iter() + .map(|(_, d)| Task::Enumerate(d.clone())) + .collect(), + in_flight: 0, + total_files: 0, + failures: 0, + }); + let ready = Condvar::new(); + let out = *out; + + std::thread::scope(|scope| { + for _ in 0..jobs_n { + scope.spawn(|| worker(&state, &ready, &out)); + } + }); + + let final_state = state.into_inner().expect("pool lock"); + (final_state.total_files, final_state.failures) +} + +fn worker(state: &Mutex, ready: &Condvar, out: &Out) { + loop { + let task = { + let mut s = state.lock().expect("pool lock"); + loop { + if let Some(task) = s.queue.pop_front() { + s.in_flight += 1; + break task; + } + if s.in_flight == 0 { + // Nothing queued and nothing running: the pipeline drained. + ready.notify_all(); + return; + } + s = ready.wait(s).expect("pool lock"); + } + }; + + match task { + Task::Enumerate(d) => { + let (_, files, msg) = enumerate_dir(&d); + let mut s = state.lock().expect("pool lock"); + if msg == "ok" { + for batch in shard(files, BATCH) { + s.queue.push_back(Task::Touch(d.clone(), batch)); + } + } else if !msg.starts_with("skipped") { + s.failures += 1; + out.error(&format!("FAIL enumerate {d} :: {msg}")); + } + s.in_flight -= 1; + ready.notify_all(); + } + Task::Touch(d, batch) => { + let (n, errs, msg) = touch_files(&batch); + let mut s = state.lock().expect("pool lock"); + s.total_files += n; + if errs > 0 { + s.failures += 1; + out.error(&format!("FAIL touch {d} :: {msg}")); + } + s.in_flight -= 1; + ready.notify_all(); + } + } + } +} + +fn execute_serial(plan: &Plan, out: &Out) -> (usize, usize) { + let mut total_files = 0; + let mut failures = 0; + for (_, d) in &plan.kept { + let (_, files, msg) = enumerate_dir(d); + if msg != "ok" && !msg.starts_with("skipped") { + failures += 1; + out.error(&format!("FAIL enumerate {d} :: {msg}")); + 
continue; + } + let count = files.len(); + for batch in shard(files, BATCH) { + let (n, errs, tmsg) = touch_files(&batch); + total_files += n; + if errs > 0 { + failures += 1; + out.error(&format!("FAIL touch {d} :: {tmsg}")); + } + } + if msg == "ok" && !out.json_mode { + out.status(&format!(" ok {count:>7} files {d}")); + } + } + (total_files, failures) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + fn keep(include: &[&str], exclude: &[&str]) -> KeepRules { + KeepRules::new( + &include.iter().map(|s| s.to_string()).collect::>(), + &exclude.iter().map(|s| s.to_string()).collect::>(), + ) + } + + fn write_csv(path: &Path, dirs: &[&str]) { + let mut lines = vec!["Directory,LastAccess,Size".to_string()]; + lines.extend(dirs.iter().map(|d| format!("{d},2026-01-01,1G"))); + fs::write(path, lines.join("\n") + "\n").unwrap(); + } + + fn stages_all() -> Vec { + STAGE_ORDER.iter().map(|s| s.to_string()).collect() + } + + // ---- planning ------------------------------------------------------------ + + #[test] + fn load_csv_dirs_reads_directory_column() { + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("scratch-dirs-pending-removal.csv"); + write_csv(&p, &["/scratch/sparky/a", "/scratch/sparky/b"]); + assert_eq!( + load_csv_dirs(&p).unwrap(), + ["/scratch/sparky/a", "/scratch/sparky/b"] + ); + } + + #[test] + fn load_csv_dirs_missing_file() { + let dir = tempfile::tempdir().unwrap(); + assert!(load_csv_dirs(&dir.path().join("absent.csv")) + .unwrap() + .is_empty()); + } + + #[test] + fn load_csv_dirs_directory_not_first_column() { + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("x.csv"); + fs::write(&p, "User,Directory,Size\nsparky,/scratch/sparky/a,12G\n").unwrap(); + assert_eq!(load_csv_dirs(&p).unwrap(), ["/scratch/sparky/a"]); + } + + #[test] + fn load_csv_dirs_bom_header_yields_no_directories() { + // A BOM is part of the first header cell's name, so the column + // lookup misses and the file contributes 
nothing. + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("bom.csv"); + fs::write(&p, b"\xEF\xBB\xBFDirectory,Size\n/scratch/sparky/a,1G\n").unwrap(); + assert!(load_csv_dirs(&p).unwrap().is_empty()); + // With the Directory column not first, the BOM lands on another + // header and the column still resolves. + let p2 = dir.path().join("bom2.csv"); + fs::write(&p2, b"\xEF\xBB\xBFSize,Directory\n1G,/scratch/sparky/a\n").unwrap(); + assert_eq!(load_csv_dirs(&p2).unwrap(), ["/scratch/sparky/a"]); + } + + #[test] + fn load_csv_dirs_invalid_utf8_record_is_error() { + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("bad.csv"); + fs::write(&p, b"Directory,Size\n/scratch/sparky/\xFF\xFE,1G\n").unwrap(); + let err = load_csv_dirs(&p).unwrap_err(); + assert!(err.contains("unable to read")); + assert!(err.contains("bad.csv")); + } + + #[test] + fn load_csv_dirs_unreadable_file_is_error() { + use std::os::unix::fs::PermissionsExt; + let dir = tempfile::tempdir().unwrap(); + let p = dir.path().join("locked.csv"); + write_csv(&p, &["/scratch/sparky/a"]); + fs::set_permissions(&p, fs::Permissions::from_mode(0o000)).unwrap(); + let err = load_csv_dirs(&p).unwrap_err(); + fs::set_permissions(&p, fs::Permissions::from_mode(0o644)).unwrap(); + assert!(err.contains("unable to read")); + assert!(err.contains("locked.csv")); + } + + #[test] + fn build_plan_filters_by_keep() { + let dir = tempfile::tempdir().unwrap(); + write_csv( + &dir.path().join("scratch-dirs-pending-removal.csv"), + &["/scratch/sparky/proj-a", "/scratch/sparky/proj-z"], + ); + write_csv( + &dir.path().join("scratch-dirs-over-90days.csv"), + &["/scratch/sparky/proj-b"], + ); + let rules = keep(&["/scratch/sparky/proj-a", "/scratch/sparky/proj-b"], &[]); + let plan = build_plan(dir.path(), &stages_all(), &rules).unwrap(); + let kept: Vec<&str> = plan.kept.iter().map(|(_, d)| d.as_str()).collect(); + assert_eq!(kept, ["/scratch/sparky/proj-a", "/scratch/sparky/proj-b"]); + let 
skipped: Vec<&str> = plan.skipped.iter().map(|(_, d)| d.as_str()).collect(); + assert_eq!(skipped, ["/scratch/sparky/proj-z"]); + } + + #[test] + fn build_plan_dedupes_across_stages() { + let dir = tempfile::tempdir().unwrap(); + write_csv( + &dir.path().join("scratch-dirs-pending-removal.csv"), + &["/scratch/sparky/a"], + ); + write_csv( + &dir.path().join("scratch-dirs-over-90days.csv"), + &["/scratch/sparky/a"], + ); + let rules = keep(&["/scratch/sparky/a"], &[]); + let plan = build_plan(dir.path(), &stages_all(), &rules).unwrap(); + assert_eq!(plan.kept.len(), 1); + assert_eq!(plan.kept[0].0, "pending"); // first stage wins + } + + #[test] + fn build_plan_exclude_carve_out() { + let dir = tempfile::tempdir().unwrap(); + write_csv( + &dir.path().join("scratch-dirs-pending-removal.csv"), + &[ + "/scratch/sparky/proj/run-1", + "/scratch/sparky/proj/__pycache__", + ], + ); + let rules = keep(&["/scratch/sparky/proj/**"], &["**/__pycache__"]); + let plan = build_plan(dir.path(), &["pending".to_string()], &rules).unwrap(); + let kept: Vec<&str> = plan.kept.iter().map(|(_, d)| d.as_str()).collect(); + assert_eq!(kept, ["/scratch/sparky/proj/run-1"]); + let skipped: Vec<&str> = plan.skipped.iter().map(|(_, d)| d.as_str()).collect(); + assert_eq!(skipped, ["/scratch/sparky/proj/__pycache__"]); + } + + #[test] + fn build_plan_negation_last_match_wins() { + // `!` carve-outs within the include list (gitignore last-match-wins). 
+ let dir = tempfile::tempdir().unwrap(); + let rules = keep(&["/scratch/sparky/proj", "!**/__pycache__"], &[]); + write_csv( + &dir.path().join("scratch-dirs-pending-removal.csv"), + &[ + "/scratch/sparky/proj/run", + "/scratch/sparky/proj/__pycache__", + "/scratch/sparky/x", + ], + ); + let plan = build_plan(dir.path(), &["pending".to_string()], &rules).unwrap(); + let kept: Vec<&str> = plan.kept.iter().map(|(_, d)| d.as_str()).collect(); + assert_eq!(kept, ["/scratch/sparky/proj/run"]); + } + + // ---- shard / enumerate / touch (the renewal mechanism) ---------------------- + + #[test] + fn shard_even_batches() { + let files: Vec = (0..10).map(|i| PathBuf::from(format!("f{i}"))).collect(); + let batches = shard(files.clone(), 3); + let sizes: Vec = batches.iter().map(|b| b.len()).collect(); + assert_eq!(sizes, [3, 3, 3, 1]); + let flat: Vec = batches.into_iter().flatten().collect(); + assert_eq!(flat, files); + } + + #[test] + fn shard_empty() { + assert!(shard(Vec::new(), BATCH).is_empty()); + } + + #[test] + fn enumerate_dir_lists_all_including_hidden_and_ignored() { + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("a.txt"), "x").unwrap(); + fs::write(dir.path().join(".hidden"), "x").unwrap(); + fs::create_dir(dir.path().join("sub")).unwrap(); + fs::write(dir.path().join("sub/b.txt"), "x").unwrap(); + // A .gitignore plus an ignored file: both must still be listed. + fs::write(dir.path().join(".gitignore"), "ignored.txt\n").unwrap(); + fs::write(dir.path().join("ignored.txt"), "x").unwrap(); + + let (_, files, msg) = enumerate_dir(dir.path().to_str().unwrap()); + assert_eq!(msg, "ok"); + assert!(files.iter().all(|p| p.is_file())); + // 5 regular files: a.txt, .hidden, sub/b.txt, .gitignore, ignored.txt + assert_eq!(files.len(), 5); + } + + #[test] + fn enumerate_dir_skips_symlinked_files() { + // `find -type f` does not count symlinks; neither does the walker. 
+ let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("real.txt"), "x").unwrap(); + std::os::unix::fs::symlink(dir.path().join("real.txt"), dir.path().join("link.txt")) + .unwrap(); + let (_, files, msg) = enumerate_dir(dir.path().to_str().unwrap()); + assert_eq!(msg, "ok"); + assert_eq!(files.len(), 1); + } + + #[test] + fn enumerate_dir_not_a_directory() { + let dir = tempfile::tempdir().unwrap(); + let missing = dir.path().join("nope"); + let (_, files, msg) = enumerate_dir(missing.to_str().unwrap()); + assert!(files.is_empty()); + assert!(msg.starts_with("skipped")); + } + + #[test] + fn touch_files_refreshes_times() { + let dir = tempfile::tempdir().unwrap(); + let f = dir.path().join("stale.txt"); + fs::write(&f, "x").unwrap(); + let old = FileTime::from_unix_time(FileTime::now().unix_seconds() - 8_640_000, 0); + filetime::set_file_times(&f, old, old).unwrap(); + + let (attempted, errors, _) = touch_files(std::slice::from_ref(&f)); + assert_eq!((attempted, errors), (1, 0)); + let mtime = FileTime::from_last_modification_time(&f.metadata().unwrap()); + assert!(mtime.unix_seconds() > FileTime::now().unix_seconds() - 10); + } + + #[test] + fn touch_files_missing_path_is_silent_skip() { + let dir = tempfile::tempdir().unwrap(); + let ghost = dir.path().join("gone.txt"); + let (attempted, errors, msg) = touch_files(std::slice::from_ref(&ghost)); + assert_eq!((attempted, errors), (1, 0)); + assert_eq!(msg, "ok"); + assert!(!ghost.exists()); // never created + } + + #[test] + fn touch_files_empty_batch() { + assert_eq!(touch_files(&[]), (0, 0, "ok".to_string())); + } + + #[test] + fn execute_serial_counts_and_skips() { + let dir = tempfile::tempdir().unwrap(); + let real = dir.path().join("proj"); + fs::create_dir(&real).unwrap(); + fs::write(real.join("a"), "x").unwrap(); + fs::write(real.join("b"), "x").unwrap(); + let plan = Plan { + kept: vec![ + ("pending".to_string(), real.display().to_string()), + ("pending".to_string(), 
"/does/not/exist".to_string()), + ], + skipped: vec![], + }; + let out = Out { + json_mode: true, + interactive: false, + }; + let (files, failures) = execute(&plan, 1, &out); + assert_eq!((files, failures), (2, 0)); + } + + #[test] + fn execute_parallel_matches_serial_counts() { + let dir = tempfile::tempdir().unwrap(); + let mut kept = Vec::new(); + for d in 0..5 { + let sub = dir.path().join(format!("d{d}")); + fs::create_dir(&sub).unwrap(); + for f in 0..7 { + fs::write(sub.join(format!("f{f}")), "x").unwrap(); + } + kept.push(("pending".to_string(), sub.display().to_string())); + } + let plan = Plan { + kept, + skipped: vec![], + }; + let out = Out { + json_mode: true, + interactive: false, + }; + let (files, failures) = execute(&plan, 4, &out); + assert_eq!((files, failures), (35, 0)); + } + + #[test] + fn default_jobs_within_bounds() { + let n = default_jobs(); + assert!((1..=8).contains(&n)); + } +} diff --git a/solx/src/main.rs b/solx/src/main.rs new file mode 100644 index 0000000..201b86d --- /dev/null +++ b/solx/src/main.rs @@ -0,0 +1,601 @@ +//! `solx` entry point and command dispatch. +//! +//! Surface (see docs/solx.md): +//! +//! ```text +//! solx init +//! solx job list (alias `ls`; group also reachable as `jobs`) +//! solx job start [TEMPLATE] +//! solx job stop [JOBID] +//! solx job jump [JOBID] (also `solx jump`) +//! solx job time [JOBID] +//! solx keep [--stage S] [--csv-dir D] [-j N] [-y] [-n] [-v] +//! solx config show [--json] +//! solx config edit +//! solx completions +//! solx cheatsheet +//! solx version (alias of --version) +//! solx help (alias of --help) +//! ``` +//! +//! Global output flag: `--json` forces JSON; by default output auto-detects +//! (tables on a terminal, JSON when stdout is not a TTY). `--json` is +//! accepted both before the subcommand and trailing on every leaf except +//! `job start`, where a non-leading `--json` is salloc passthrough. 
+ +mod cheatsheet; +mod completions; +mod config; +mod gitwild; +mod init; +mod jobs; +mod keep; +mod output; +mod side; +mod slurm; + +use std::path::PathBuf; + +use clap::{CommandFactory, Parser, Subcommand}; + +use crate::output::{py_repr, Out}; +use crate::side::require_sol; + +const VERSION: &str = env!("CARGO_PKG_VERSION"); + +#[derive(Parser)] +#[command( + name = "solx", + about = "CLI for ASU's Sol supercomputer.", + disable_version_flag = true, + disable_help_subcommand = true +)] +struct Cli { + /// Show version and exit. + #[arg(long)] + version: bool, + + /// Force JSON output (machine-readable). + #[arg(long, global = true)] + json: bool, + + #[command(subcommand)] + command: Option, +} + +#[derive(Subcommand)] +enum Cmd { + /// Write a starter config.toml. + Init { + /// Overwrite without prompting (-y/--yes accepted too). + #[arg(short = 'f', long = "force", alias = "yes", short_alias = 'y')] + force: bool, + }, + + /// Renew CSV-flagged scratch files filtered by the keep block in config. + Keep(KeepArgs), + + /// Drop into a shell on the job's compute node (= solx job jump). + Jump { + /// Job ID. Defaults to current job (compute) or sole/most-recent + /// running job (login). + jobid: Option, + /// Suppress the nesting / most-recent heads-up. + #[arg(short = 'q', long)] + quiet: bool, + }, + + /// Emit a shell completion script (bash, zsh, or fish). + Completions { + /// Target shell: bash, zsh, or fish. + shell: String, + }, + + /// Print the Sol cheat sheet (SLURM + solx quick reference) as text. + Cheatsheet, + + /// Show version and exit (alias of --version). + Version, + + /// Manage interactive Slurm jobs on Sol (alias: jobs). + #[command(alias = "jobs")] + Job { + #[command(subcommand)] + command: Option, + }, + + /// Inspect and edit the solx config. + Config { + #[command(subcommand)] + command: Option, + }, + + /// Show help and exit (alias of --help). 
+ Help, +} + +#[derive(clap::Args)] +struct KeepArgs { + /// Which warning CSVs to read. + #[arg(long, default_value = "all")] + stage: String, + /// Directory holding Sol's warning CSVs. + #[arg(long = "csv-dir")] + csv_dir: Option, + /// Parallel touch workers. + #[arg( + short = 'j', + long = "jobs", + default_value_t = keep::default_jobs(), + value_parser = clap::value_parser!(u64).range(1..) + )] + jobs: u64, + /// Skip confirmation prompt (also -f/--force). + #[arg(short = 'y', long = "yes", alias = "force", short_alias = 'f')] + yes: bool, + /// Print plan without executing. + #[arg(short = 'n', long = "dry-run")] + dry_run: bool, + /// Verbose plan + progress. + #[arg(short = 'v', long)] + verbose: bool, +} + +#[derive(Subcommand)] +enum JobCmd { + /// Print my Sol jobs. + #[command(alias = "ls")] + List, + + /// Start an interactive allocation from a config template. + /// + /// Unrecognized options and everything after `--` pass through to + /// salloc. + #[command(disable_help_flag = true)] + Start { + /// Template name (defaults to default_template) plus salloc + /// passthrough. + #[arg(num_args = 0.., allow_hyphen_values = true, trailing_var_arg = true)] + rest: Vec, + }, + + /// Cancel a job (prompts unless -y). + Stop { + /// Job ID. Defaults per resolution rules. + jobid: Option, + /// Skip confirmation prompt (also -f/--force). + #[arg(short = 'y', long = "yes", alias = "force", short_alias = 'f')] + yes: bool, + /// Print scancel argv without executing. + #[arg(short = 'n', long = "dry-run")] + dry_run: bool, + }, + + /// Drop into a shell on the job's compute node. + Jump { + /// Job ID. Defaults per resolution rules. + jobid: Option, + /// Suppress the nesting / most-recent heads-up. + #[arg(short = 'q', long)] + quiet: bool, + }, + + /// Print remaining time (D-HH:MM:SS). + Time { + /// Job ID. Defaults per resolution rules. + jobid: Option, + }, +} + +#[derive(Subcommand)] +enum ConfigCmd { + /// Print the resolved config. 
+ Show, + /// Open the config in $EDITOR. + Edit, +} + +fn main() { + std::process::exit(run()); +} + +fn run() -> i32 { + // Runtime-completion invocations (the `_SOLX_COMPLETE` env protocol + // that installed completion scripts use to call back into solx) never + // execute a command: exit 0 silently. + if std::env::var_os("_SOLX_COMPLETE").is_some() { + return 0; + } + + let argv: Vec = std::env::args().skip(1).collect(); + + // A leading `--json` resolves before anything else so a `job start` + // invocation can hand its raw tail to the Click-style parser (clap + // would otherwise eat the `--` separator and the passthrough options). + // `--version` is left to clap: only an invocation it fully validates + // prints the version (junk alongside the flag is a usage error). + let mut i = 0; + let mut json = false; + while i < argv.len() && argv[i] == "--json" { + json = true; + i += 1; + } + let rest = &argv[i..]; + + // No-args invocations print the group help on stdout and exit 2. + if rest.is_empty() { + return print_group_help(&[]); + } + if rest.len() == 1 && matches!(rest[0].as_str(), "job" | "jobs" | "config") { + let group = if rest[0] == "jobs" { "job" } else { &rest[0] }; + return print_group_help(&[group]); + } + // `job start` parses its own tail (template / passthrough split). + if matches!(rest[0].as_str(), "job" | "jobs") + && rest.get(1).map(String::as_str) == Some("start") + { + return run_job_start(json, &rest[2..]); + } + + let cli = match Cli::try_parse() { + Ok(cli) => cli, + Err(err) => { + // clap renders help to stdout (exit 0) and usage errors to + // stderr (exit 2). + err.exit(); + } + }; + if cli.version { + println!("{VERSION}"); + return 0; + } + let json = cli.json || json; + + match cli.command { + None => { + eprintln!("error: missing subcommand. 
Try 'solx --help'."); + 2 + } + Some(Cmd::Version) => { + println!("{VERSION}"); + 0 + } + Some(Cmd::Help) => { + print!("{}", root_help()); + 0 + } + Some(Cmd::Completions { shell }) => completions::cmd_completions(&shell), + Some(Cmd::Cheatsheet) => cheatsheet::cmd_cheatsheet(), + Some(Cmd::Init { force }) => { + require_sol(); + let out = Out::auto(json); + init::cmd_init(force, &out) + } + Some(Cmd::Keep(args)) => { + require_sol(); + run_keep(&args, json) + } + Some(Cmd::Jump { jobid, quiet }) => { + require_sol(); + run_jump(jobid.as_deref(), quiet, json) + } + Some(Cmd::Job { command }) => match command { + None => print_group_help(&["job"]), + Some(JobCmd::List) => { + require_sol(); + let out = Out::auto(json); + jobs::cmd_list(&slurm::real_runner, &out) + } + // Unreachable in practice: `job start` is intercepted on the raw + // argv above. Kept for completeness. + Some(JobCmd::Start { rest }) => run_job_start(json, &rest), + Some(JobCmd::Stop { + jobid, + yes, + dry_run, + }) => { + require_sol(); + let out = Out::auto(json); + jobs::cmd_stop(jobid.as_deref(), yes, dry_run, &slurm::real_runner, &out) + } + Some(JobCmd::Jump { jobid, quiet }) => { + require_sol(); + run_jump(jobid.as_deref(), quiet, json) + } + Some(JobCmd::Time { jobid }) => { + require_sol(); + let out = Out::auto(json); + jobs::cmd_time(jobid.as_deref(), &slurm::real_runner, &out) + } + }, + Some(Cmd::Config { command }) => match command { + None => print_group_help(&["config"]), + Some(ConfigCmd::Show) => { + require_sol(); + run_config_show(json) + } + Some(ConfigCmd::Edit) => { + require_sol(); + run_config_edit() + } + }, + } +} + +/// The root help text, with the binary name in the usage line. +fn root_help() -> String { + Cli::command().bin_name("solx").render_help().to_string() +} + +/// Print the help for a (sub)command path on stdout; exit code 2 +/// (a no-args invocation is a usage error that still shows the way out). 
+fn print_group_help(path: &[&str]) -> i32 { + match path { + [] => print!("{}", root_help()), + [group] => { + // Render with the full `solx ` usage prefix. + let mut cmd = Cli::command(); + let mut sub = cmd + .find_subcommand_mut(group) + .expect("known subcommand group") + .clone() + .bin_name(format!("solx {group}")); + print!("{}", sub.render_help()); + } + _ => unreachable!("only root and one-level groups print help here"), + } + 2 +} + +fn load_or_exit(out: &Out) -> Result { + match config::load(&config::config_path()) { + Ok(c) => Ok(c), + Err(e) => { + out.error(&format!("error: {e}")); + Err(2) + } + } +} + +fn run_jump(jobid: Option<&str>, quiet: bool, json: bool) -> i32 { + let out = Out::auto(json); + let config = match load_or_exit(&out) { + Ok(c) => c, + Err(code) => return code, + }; + jobs::cmd_jump(&config, jobid, quiet, &slurm::real_runner, &out) +} + +/// `job start` help. The command's tail is parsed by +/// [`jobs::parse_start_tail`], not clap, so its help is rendered here: the +/// full `solx job start` usage plus the contract options (`-n/--dry-run`, +/// `--timeout`), the TEMPLATE argument, and the salloc passthrough. +const JOB_START_HELP: &str = "\ +Start an interactive allocation from a config template. + +Unrecognized options and everything after `--` pass through to salloc. + +Usage: solx job start [OPTIONS] [TEMPLATE] [SALLOC_ARGS]... + +Arguments: + [TEMPLATE] Template name; defaults to default_template + [SALLOC_ARGS]... Extra arguments forwarded to salloc + +Options: + -n, --dry-run Print salloc argv without submitting + --timeout Override start_timeout (e.g. 
\"5m\", \"1h\") + -h, --help Print help +"; + +fn run_job_start(json: bool, tail: &[String]) -> i32 { + require_sol(); + let parsed = match jobs::parse_start_tail(tail) { + Ok(p) => p, + Err(e) => { + eprintln!("error: {e}"); + return 2; + } + }; + if parsed.help { + print!("{JOB_START_HELP}"); + return 0; + } + let out = Out::auto(json); + let config = match load_or_exit(&out) { + Ok(c) => c, + Err(code) => return code, + }; + let mut timeout_seconds: Option = None; + if let Some(t) = parsed.timeout.as_deref().filter(|t| !t.is_empty()) { + match config::parse_duration(t) { + Ok(secs) => timeout_seconds = Some(secs), + Err(e) => { + out.error(&format!("error: {e}")); + return 2; + } + } + } + jobs::cmd_start( + &config, + parsed.template.as_deref(), + parsed.dry_run, + timeout_seconds, + &parsed.passthrough, + None, + &out, + ) +} + +fn run_keep(args: &KeepArgs, json: bool) -> i32 { + let out = Out::auto(json); + let valid = ["all", "inactive", "over90", "pending"]; // sorted + if !valid.contains(&args.stage.as_str()) { + out.error(&format!( + "invalid --stage {}. choose from: {}", + py_repr(&args.stage), + valid.join(", ") + )); + return 2; + } + // The keep-list lives in the config `[keep]` block. A missing config.toml + // is fine (config stays None → `keep` reports no [keep] block); a config + // that exists but is malformed still errors. 
+ let config = if config::config_path().exists() { + match load_or_exit(&out) { + Ok(c) => Some(c), + Err(code) => return code, + } + } else { + None + }; + let opts = keep::KeepOptions { + csv_dir: args.csv_dir.clone(), + stage: args.stage.clone(), + jobs_n: args.jobs, + yes: args.yes, + dry_run: args.dry_run, + verbose: args.verbose, + config_keep: config.as_ref().and_then(|c| c.keep.as_ref()), + }; + keep::cmd_keep(&opts, &out) +} + +fn run_config_show(json: bool) -> i32 { + use serde_json::{json, Map, Value}; + + let out = Out::auto(json); + let config = match load_or_exit(&out) { + Ok(c) => c, + Err(code) => return code, + }; + + if out.json_mode { + let mut templates = Map::new(); + for (name, t) in &config.templates { + let mut body = Map::new(); + body.insert("name".to_string(), json!(t.name)); + body.insert("partition".to_string(), json!(t.partition)); + body.insert("time".to_string(), json!(t.time)); + if let Some(qos) = &t.qos { + body.insert("qos".to_string(), json!(qos)); + } + if let Some(gres) = &t.gres { + body.insert("gres".to_string(), json!(gres)); + } + if !t.extra_args.is_empty() { + body.insert("extra_args".to_string(), json!(t.extra_args)); + } + templates.insert(name.clone(), Value::Object(body)); + } + let keep_value = match &config.keep { + Some(k) => json!({"include": k.raw_include, "exclude": k.raw_exclude}), + None => Value::Null, + }; + out.json(&json!({ + "default_shell": config.default_shell, + "default_template": config.default_template, + "start_timeout_seconds": config.start_timeout_seconds, + "templates": templates, + "keep": keep_value, + })); + return 0; + } + + out.human(&format!("default_shell {}", config.default_shell)); + out.human(&format!("default_template {}", config.default_template)); + out.human(&format!( + "start_timeout {}s", + config.start_timeout_seconds + )); + for (name, t) in &config.templates { + out.human(&format!("\n[jobs.{name}]")); + out.human(&format!(" partition {}", t.partition)); + out.human(&format!(" 
time {}", t.time)); + if let Some(qos) = &t.qos { + out.human(&format!(" qos {qos}")); + } + if let Some(gres) = &t.gres { + out.human(&format!(" gres {gres}")); + } + if !t.extra_args.is_empty() { + out.human(&format!(" extra_args {}", t.extra_args.join(" "))); + } + } + match &config.keep { + Some(k) => { + out.human("\n[keep]"); + for (i, pat) in k.raw_include.iter().enumerate() { + let label = if i == 0 { "include " } else { " " }; + out.human(&format!(" {label} {pat}")); + } + for (i, pat) in k.raw_exclude.iter().enumerate() { + let label = if i == 0 { "exclude " } else { " " }; + out.human(&format!(" {label} {pat}")); + } + } + None => out.human("\n[keep] not configured (solx keep will exit 2)"), + } + 0 +} + +fn run_config_edit() -> i32 { + let p = config::config_path(); + if !p.exists() { + eprintln!("no config at {}. run `solx init` first.", p.display()); + return 2; + } + // $EDITOR is often a command with flags (e.g. "code --wait", + // "vim -u NORC"), so split it into argv rather than treating the whole + // string as one binary. + let editor = std::env::var("EDITOR") + .ok() + .filter(|s| !s.is_empty()) + .or_else(|| which("vi")) + .unwrap_or_else(|| "nano".to_string()); + // An unparseable $EDITOR (e.g. an unbalanced quote) is a hard runtime + // failure, not a usage error: one clean line, exit 1. + let argv = match shlex::split(&editor) { + Some(argv) if !argv.is_empty() => argv, + _ => { + eprintln!("error: unparseable $EDITOR value {}", py_repr(&editor)); + return 1; + } + }; + match std::process::Command::new(&argv[0]) + .args(&argv[1..]) + .arg(&p) + .status() + { + Ok(status) => status.code().unwrap_or(1), + Err(e) => { + eprintln!("error: failed to run {}: {e}", argv[0]); + 1 + } + } +} + +/// Locate `name` on PATH (a plain executable-file check). 
+fn which(name: &str) -> Option { + use std::os::unix::fs::PermissionsExt; + + let path = std::env::var("PATH").ok()?; + for dir in path.split(':').filter(|d| !d.is_empty()) { + let candidate = std::path::Path::new(dir).join(name); + if let Ok(meta) = candidate.metadata() { + if meta.is_file() && meta.permissions().mode() & 0o111 != 0 { + return Some(candidate.display().to_string()); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn cli_tree_is_consistent() { + Cli::command().debug_assert(); + } +} diff --git a/solx/src/output.rs b/solx/src/output.rs new file mode 100644 index 0000000..0f1ab98 --- /dev/null +++ b/solx/src/output.rs @@ -0,0 +1,285 @@ +//! Output layer: human rendering vs machine-readable JSON. +//! +//! A CLI driven by an agent should not have to know a flag exists to get +//! parseable output. So: +//! +//! * When stdout is **not a TTY**, data commands emit JSON automatically; on +//! a TTY they render plain aligned tables. The global `--json` flag forces +//! JSON anywhere (a human on a terminal gets tables with no flag; an agent +//! passes `--json`). +//! * All diagnostics, progress, and errors go to **stderr**, so stdout stays +//! a clean data channel an agent can parse without stripping noise. +//! * Interactivity (whether we may *prompt*) is decided by **stdin**, +//! separately from the stdout-format decision. A non-interactive session +//! never blocks on a confirmation prompt. +//! +//! JSON documents are rendered byte-for-byte like Python's +//! `json.dumps(obj, indent=2)` plus a trailing newline: two-space indent, +//! `", "` / `": "` separators, and `\uXXXX` escapes for every non-ASCII +//! character. Payloads are built as [`serde_json::Value`] with the +//! `preserve_order` feature, so object keys serialize in insertion order. + +use std::io::{IsTerminal, Write}; + +use serde_json::Value; + +/// A resolved output target: format choice + interactivity. 
+#[derive(Clone, Copy, Debug)] +pub struct Out { + /// Emit JSON on the data channel (stdout) instead of a human rendering. + pub json_mode: bool, + /// stdin is a TTY, so prompting a human is allowed. + pub interactive: bool, +} + +impl Out { + /// Build an `Out`, auto-detecting format from the stdout TTY. + /// + /// `force_json` (the global `--json` flag) overrides the auto-detect. + /// `interactive` reflects whether **stdin** is a TTY. + pub fn auto(force_json: bool) -> Self { + Out { + json_mode: force_json || !std::io::stdout().is_terminal(), + interactive: std::io::stdin().is_terminal(), + } + } + + /// A progress / context line. Goes to stderr in every mode. + pub fn status(&self, msg: &str) { + eprintln!("{msg}"); + } + + /// An error line. Goes to stderr in every mode. + pub fn error(&self, msg: &str) { + eprintln!("{msg}"); + } + + /// Write one clean JSON document to stdout (no color, no wrapping). + pub fn json(&self, obj: &Value) { + let mut stdout = std::io::stdout().lock(); + let _ = stdout.write_all(to_python_json(obj).as_bytes()); + let _ = stdout.write_all(b"\n"); + let _ = stdout.flush(); + } + + /// Render something to stdout in human mode. + pub fn human(&self, text: &str) { + println!("{text}"); + } + + /// Emit a result: JSON `data` in json mode, else the `human` render. + /// + /// `human` is a thunk so the (possibly expensive) rendering is only + /// built when it will actually be shown. A `None` render prints nothing. + pub fn emit(&self, data: &Value, human: impl FnOnce() -> Option) { + if self.json_mode { + self.json(data); + } else if let Some(rendered) = human() { + self.human(&rendered); + } + } +} + +/// Ask a yes/no question on stderr and read the answer from stdin. +/// +/// Callers gate on [`Out::interactive`] first — a non-interactive session +/// must never reach a prompt. Empty input takes `default`; `y`/`yes` +/// (case-insensitive) is true, anything else false. 
+pub fn confirm(prompt: &str, default: bool) -> bool { + let hint = if default { "[Y/n]" } else { "[y/N]" }; + eprint!("{prompt} {hint} "); + let _ = std::io::stderr().flush(); + let mut line = String::new(); + if std::io::stdin().read_line(&mut line).is_err() { + return default; + } + let answer = line.trim().to_ascii_lowercase(); + if answer.is_empty() { + return default; + } + matches!(answer.as_str(), "y" | "yes") +} + +/// Render `v` exactly like Python's `json.dumps(v, indent=2)` (no trailing +/// newline; callers append one per document). +pub fn to_python_json(v: &Value) -> String { + let mut buf = String::new(); + write_value(v, 0, &mut buf); + buf +} + +fn write_value(v: &Value, indent: usize, buf: &mut String) { + match v { + Value::Null => buf.push_str("null"), + Value::Bool(b) => buf.push_str(if *b { "true" } else { "false" }), + Value::Number(n) => buf.push_str(&n.to_string()), + Value::String(s) => write_string(s, buf), + Value::Array(items) => { + if items.is_empty() { + buf.push_str("[]"); + return; + } + buf.push_str("[\n"); + for (i, item) in items.iter().enumerate() { + push_spaces(buf, indent + 2); + write_value(item, indent + 2, buf); + if i + 1 < items.len() { + buf.push(','); + } + buf.push('\n'); + } + push_spaces(buf, indent); + buf.push(']'); + } + Value::Object(map) => { + if map.is_empty() { + buf.push_str("{}"); + return; + } + buf.push_str("{\n"); + for (i, (key, val)) in map.iter().enumerate() { + push_spaces(buf, indent + 2); + write_string(key, buf); + buf.push_str(": "); + write_value(val, indent + 2, buf); + if i + 1 < map.len() { + buf.push(','); + } + buf.push('\n'); + } + push_spaces(buf, indent); + buf.push('}'); + } + } +} + +fn push_spaces(buf: &mut String, n: usize) { + for _ in 0..n { + buf.push(' '); + } +} + +/// Escape a string like Python's `json.dumps` with `ensure_ascii=True`: +/// everything outside `0x20..=0x7E` becomes a `\uXXXX` escape (surrogate +/// pairs for astral-plane characters). 
+fn write_string(s: &str, buf: &mut String) { + buf.push('"'); + for c in s.chars() { + match c { + '"' => buf.push_str("\\\""), + '\\' => buf.push_str("\\\\"), + '\n' => buf.push_str("\\n"), + '\r' => buf.push_str("\\r"), + '\t' => buf.push_str("\\t"), + '\u{8}' => buf.push_str("\\b"), + '\u{c}' => buf.push_str("\\f"), + '\u{20}'..='\u{7e}' => buf.push(c), + _ => { + let cp = c as u32; + if cp <= 0xFFFF { + buf.push_str(&format!("\\u{cp:04x}")); + } else { + let v = cp - 0x10000; + let hi = 0xD800 + (v >> 10); + let lo = 0xDC00 + (v & 0x3FF); + buf.push_str(&format!("\\u{hi:04x}\\u{lo:04x}")); + } + } + } + } + buf.push('"'); +} + +/// Strip style-tag lookalikes from a diagnostic, the way the reference +/// plain (non-TTY) renderer does: a bracket group whose content is an +/// optional `/` followed only by characters in `[a-zA-Z0-9 #._]` is removed; +/// a bracket written as `\[` stays a literal `[`. Bracket groups carrying +/// any other character (e.g. `[jobs.]`, `[jobs.*]`) are left intact. +pub fn strip_markup(msg: &str) -> String { + let protected = msg.replace("\\[", "\u{0}"); + let tag = regex::Regex::new(r"\[/?[a-zA-Z0-9 #._]*\]").expect("markup tag pattern is valid"); + tag.replace_all(&protected, "").replace('\u{0}', "[") +} + +/// Render `s` like Python's `repr()` for the common case: single quotes, +/// switching to double quotes when the string contains a single quote (and +/// no double quote), with backslash escapes for the usual control characters. 
+pub fn py_repr(s: &str) -> String { + let quote = if s.contains('\'') && !s.contains('"') { + '"' + } else { + '\'' + }; + let mut out = String::with_capacity(s.len() + 2); + out.push(quote); + for c in s.chars() { + match c { + '\\' => out.push_str("\\\\"), + '\n' => out.push_str("\\n"), + '\r' => out.push_str("\\r"), + '\t' => out.push_str("\\t"), + c if c == quote => { + out.push('\\'); + out.push(c); + } + c if (c as u32) < 0x20 || c as u32 == 0x7f => { + out.push_str(&format!("\\x{:02x}", c as u32)); + } + c => out.push(c), + } + } + out.push(quote); + out +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn json_object_two_space_indent_ordered() { + let v = json!({"b": 1, "a": [1, 2], "c": {"x": true}}); + assert_eq!( + to_python_json(&v), + "{\n \"b\": 1,\n \"a\": [\n 1,\n 2\n ],\n \"c\": {\n \"x\": true\n }\n}" + ); + } + + #[test] + fn json_empty_containers_stay_inline() { + assert_eq!(to_python_json(&json!([])), "[]"); + assert_eq!(to_python_json(&json!({})), "{}"); + assert_eq!( + to_python_json(&json!({"a": [], "b": {}})), + "{\n \"a\": [],\n \"b\": {}\n}" + ); + } + + #[test] + fn json_strings_escape_non_ascii_like_python() { + // Python: json.dumps("café — ok\t\x7f") == '"caf\\u00e9 \\u2014 ok\\t\\u007f"' + let v = json!("café — ok\t\u{7f}"); + assert_eq!(to_python_json(&v), "\"caf\\u00e9 \\u2014 ok\\t\\u007f\""); + } + + #[test] + fn json_astral_plane_uses_surrogate_pairs() { + let v = json!("🎉"); + assert_eq!(to_python_json(&v), "\"\\ud83c\\udf89\""); + } + + #[test] + fn json_null_and_numbers() { + let v = json!({"keep": null, "n": 300}); + assert_eq!(to_python_json(&v), "{\n \"keep\": null,\n \"n\": 300\n}"); + } + + #[test] + fn py_repr_quoting() { + assert_eq!(py_repr("tcsh"), "'tcsh'"); + assert_eq!(py_repr("it's"), "\"it's\""); + assert_eq!(py_repr("a\"b'c"), "'a\"b\\'c'"); + assert_eq!(py_repr("a\nb"), "'a\\nb'"); + } +} diff --git a/solx/src/side.rs b/solx/src/side.rs new file mode 100644 index 
0000000..32f903c --- /dev/null +++ b/solx/src/side.rs @@ -0,0 +1,195 @@ +//! Detect whether the current host is part of the Sol cluster. +//! +//! `solx` is Sol-only. Each subcommand asks [`require_sol`] to enforce the +//! guard — wrong-side invocations exit 2 with a clear redirect rather than +//! attempting to talk to a Slurm controller that isn't there. + +use std::io::Read; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +pub const SOL_HOSTNAME_SUFFIX: &str = ".sol.rc.asu.edu"; + +const NOT_SOL_MESSAGE: &str = "solx is Sol-only — SSH to a Sol login node first, then re-run.\n\ + See: https://docs.rc.asu.edu/"; + +/// Return `true` if the current host is on the Sol cluster. +/// +/// Looks for any token ending in `.sol.rc.asu.edu` in `hostname -a` output +/// and the DNS-resolved FQDN of the kernel hostname. +pub fn is_sol() -> bool { + matches_sol(&hostname_a()) +} + +/// Exit 2 with a redirect message if not on Sol. Used by every subcommand. +pub fn require_sol() { + if !is_sol() { + eprintln!("{NOT_SOL_MESSAGE}"); + std::process::exit(2); + } +} + +pub fn matches_sol(text: &str) -> bool { + text.split_whitespace() + .any(|tok| tok.ends_with(SOL_HOSTNAME_SUFFIX)) +} + +/// The kernel hostname (FQDN when the node is configured with one). +fn kernel_hostname() -> String { + std::fs::read_to_string("/proc/sys/kernel/hostname") + .map(|s| s.trim().to_string()) + .unwrap_or_default() +} + +/// The DNS-resolved fully qualified name for this host (Python +/// `socket.getfqdn()` semantics): resolve the kernel hostname to an +/// address, reverse-resolve that address, and take the first of the +/// returned primary name + aliases that contains a dot (else the primary +/// name); the kernel hostname is returned unchanged when resolution fails. +/// On Sol compute nodes the kernel hostname is the short name (e.g. +/// `scc041`) and the resolver supplies the `.sol.rc.asu.edu` form. 
+fn fqdn() -> String { + let name = kernel_hostname(); + if name.is_empty() { + return name; + } + match reverse_names(&name) { + Some((primary, aliases)) => std::iter::once(primary.clone()) + .chain(aliases) + .find(|n| n.contains('.')) + .unwrap_or(primary), + None => name, + } +} + +extern "C" { + // Not re-exported by the libc crate; the glibc prototype. + fn gethostbyaddr( + addr: *const libc::c_void, + len: libc::socklen_t, + family: libc::c_int, + ) -> *mut libc::hostent; +} + +/// Resolve `name` forward to its first address, then reverse-resolve the +/// address. Returns the primary host name and its aliases, or `None` when +/// either resolution step fails. +fn reverse_names(name: &str) -> Option<(String, Vec)> { + use std::ffi::{CStr, CString}; + + let c_name = CString::new(name).ok()?; + let mut hints: libc::addrinfo = unsafe { std::mem::zeroed() }; + hints.ai_family = libc::AF_UNSPEC; + let mut res: *mut libc::addrinfo = std::ptr::null_mut(); + let rc = unsafe { libc::getaddrinfo(c_name.as_ptr(), std::ptr::null(), &hints, &mut res) }; + if rc != 0 || res.is_null() { + return None; + } + + // Extract (address bytes, family) from the first result. + let addr: Option<(Vec, libc::c_int)> = unsafe { + let family = (*res).ai_family; + let sockaddr = (*res).ai_addr; + match family { + libc::AF_INET => { + let sin = sockaddr as *const libc::sockaddr_in; + let bytes = (*sin).sin_addr.s_addr.to_ne_bytes().to_vec(); + Some((bytes, family)) + } + libc::AF_INET6 => { + let sin6 = sockaddr as *const libc::sockaddr_in6; + Some(((*sin6).sin6_addr.s6_addr.to_vec(), family)) + } + _ => None, + } + }; + unsafe { libc::freeaddrinfo(res) }; + let (bytes, family) = addr?; + + // glibc gethostbyaddr returns the primary name plus aliases (a + // getnameinfo lookup yields only one name, which on Sol is the short + // one — the FQDN arrives as an alias). 
+ let hostent = unsafe { + gethostbyaddr( + bytes.as_ptr() as *const libc::c_void, + bytes.len() as libc::socklen_t, + family, + ) + }; + if hostent.is_null() { + return None; + } + unsafe { + let h_name = (*hostent).h_name; + if h_name.is_null() { + return None; + } + let primary = CStr::from_ptr(h_name).to_string_lossy().into_owned(); + let mut aliases = Vec::new(); + let mut p = (*hostent).h_aliases; + if !p.is_null() { + while !(*p).is_null() { + aliases.push(CStr::from_ptr(*p).to_string_lossy().into_owned()); + p = p.add(1); + } + } + Some((primary, aliases)) + } +} + +/// Run `hostname -a` (2s timeout) and return its output combined with the +/// resolved FQDN; fall back to the FQDN alone on failure. +fn hostname_a() -> String { + let fqdn = fqdn(); + let child = Command::new("hostname") + .arg("-a") + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) + .spawn(); + let mut child = match child { + Ok(c) => c, + Err(_) => return fqdn, + }; + let deadline = Instant::now() + Duration::from_secs(2); + loop { + match child.try_wait() { + Ok(Some(_)) => break, + Ok(None) => { + if Instant::now() >= deadline { + let _ = child.kill(); + let _ = child.wait(); + return fqdn; + } + std::thread::sleep(Duration::from_millis(10)); + } + Err(_) => return fqdn, + } + } + let mut stdout = String::new(); + if let Some(mut pipe) = child.stdout.take() { + let _ = pipe.read_to_string(&mut stdout); + } + format!("{stdout} {fqdn}") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sol_token_anywhere_matches() { + assert!(matches_sol("scc041.sol.rc.asu.edu")); + assert!(matches_sol("alias1 login01.sol.rc.asu.edu alias2")); + assert!(matches_sol(" sc042.sol.rc.asu.edu")); + } + + #[test] + fn non_sol_hosts_do_not_match() { + assert!(!matches_sol("laptop.local")); + assert!(!matches_sol("phx01.phx.rc.asu.edu")); + assert!(!matches_sol("")); + // Suffix must terminate the token. 
+ assert!(!matches_sol("x.sol.rc.asu.edu.evil.com")); + } +} diff --git a/solx/src/slurm.rs b/solx/src/slurm.rs new file mode 100644 index 0000000..c544e1f --- /dev/null +++ b/solx/src/slurm.rs @@ -0,0 +1,881 @@ +//! Thin wrappers around `squeue`, `scancel`, `salloc`, and `srun`. +//! +//! Not a Slurm client library — every function shells out and parses the +//! result. Tests inject a [`Runner`] so they can mock subprocess output +//! without spawning anything. + +use std::collections::HashMap; +use std::fmt; +use std::io::Read; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +use crate::config::JobTemplate; +use crate::output::py_repr; + +/// A runner takes argv and returns (returncode, stdout, stderr). +pub type Runner<'a> = &'a dyn Fn(&[String]) -> (i32, String, String); + +/// One row of `squeue -u $USER`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Job { + pub job_id: String, + pub name: String, + pub state: String, + pub time_used: String, + pub time_left: String, + pub partition: String, + pub node_list: String, +} + +impl Job { + /// Parse one `squeue` pipe-delimited row (field order set by + /// [`squeue_user_jobs`]'s format string). + pub fn from_squeue_row(line: &str) -> Result { + let parts: Vec<&str> = line.split('|').collect(); + if parts.len() < 7 { + return Err(SlurmError(format!( + "unexpected squeue row: {}", + py_repr(line) + ))); + } + Ok(Job { + job_id: parts[0].to_string(), + name: parts[1].to_string(), + state: parts[2].to_string(), + time_used: parts[3].to_string(), + time_left: parts[4].to_string(), + partition: parts[5].to_string(), + node_list: parts[6].to_string(), + }) + } +} + +/// Any Slurm-side failure surfaced to the user. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SlurmError(pub String); + +impl fmt::Display for SlurmError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl std::error::Error for SlurmError {} + +/// Default runner: a real subprocess with captured output. +pub fn real_runner(argv: &[String]) -> (i32, String, String) { + let result = Command::new(&argv[0]) + .args(&argv[1..]) + .stdin(Stdio::null()) + .output(); + match result { + Ok(out) => ( + out.status.code().unwrap_or(1), + String::from_utf8_lossy(&out.stdout).into_owned(), + String::from_utf8_lossy(&out.stderr).into_owned(), + ), + Err(e) => (1, String::new(), format!("{}: {e}", argv[0])), + } +} + +// --- squeue --------------------------------------------------------------- + +const SQUEUE_FORMAT: &str = "%i|%j|%T|%M|%L|%P|%R"; + +/// Return the user's current jobs (running, pending, etc.). +pub fn squeue_user_jobs(user: Option<&str>, runner: Runner) -> Result, SlurmError> { + let user = match user { + Some(u) => u.to_string(), + None => std::env::var("USER").unwrap_or_default(), + }; + let argv: Vec = ["squeue", "-u", &user, "-h", "-o", SQUEUE_FORMAT] + .iter() + .map(|s| s.to_string()) + .collect(); + let (code, out, err) = runner(&argv); + if code != 0 { + let detail = if err.trim().is_empty() { + out.trim().to_string() + } else { + err.trim().to_string() + }; + return Err(SlurmError(format!("squeue failed: {detail}"))); + } + out.lines() + .filter(|line| !line.trim().is_empty()) + .map(Job::from_squeue_row) + .collect() +} + +// --- jobid resolution ----------------------------------------------------- +// +// Resolution is VERB-AWARE. 
The conventions are inspired by tmux (a no-arg +// command acts on the obvious target; "most recent" when several exist; warn +// when you act on the session you're sitting in) but adapted to Slurm, where +// a cancelled job is unrecoverable and attaching spends real allocation time: +// +// * `time`/`jump` (read / attach): when several jobs match, auto-pick the +// MOST RECENT one (like `tmux attach`). Deterministic, so it's agent-safe. +// * `stop` (cancel): NEVER auto-picks among several — that's how you cancel +// the wrong job. It returns the candidates so the caller can print them +// and exit 2. +// * `jump`'s auto-pick considers RUNNING jobs only (you can't attach to a +// pending one). An EXPLICIT arg or $SLURM_JOB_ID is passed through as-is +// (no state pre-check) — `srun` surfaces a wrong-state job far more +// clearly than we could, and it saves a squeue round-trip. +// +// "Inside an allocation" ($SLURM_JOB_ID set) is treated as "the current +// session": it's the default target, and acting on it carries a nesting / +// self-cancel warning the caller surfaces. + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Verb { + Jump, + Stop, + Time, +} + +/// Outcome of resolving a jobid for one verb. +/// +/// Exactly one of these holds: +/// * `job_id` is set → resolved; act on it. +/// * `ambiguous` is true → several candidates, caller must disambiguate. +/// * `error` is set → nothing to act on (no jobs / none running). +#[derive(Debug, Clone, Default)] +pub struct Resolution { + pub job_id: Option, + /// "arg" | "inside" | "single" | "most-recent" + pub source: &'static str, + /// $SLURM_JOB_ID is set (acting from within an allocation). + pub inside: bool, + pub inside_job_id: Option, + /// Set considered (for ambiguity / context). + pub candidates: Vec, + pub ambiguous: bool, + pub error: Option, +} + +impl Resolution { + /// True when the resolved job is the one we're sitting inside. 
+ pub fn acting_on_current(&self) -> bool { + self.inside && self.job_id.is_some() && self.job_id == self.inside_job_id + } +} + +/// Sort key making "most recent" == "highest job id". +/// +/// Slurm assigns monotonically increasing ids, so the highest id is the +/// newest submission — which for `solx job start` is the one you just made. +/// Array ids like `123_4` sort by (base, index); a non-numeric id sorts +/// first so a real number always wins. +fn jobid_key(job_id: &str) -> (i64, i64) { + let (base, idx) = match job_id.split_once('_') { + Some((b, i)) => (b, i), + None => (job_id, ""), + }; + match base.parse::() { + Ok(b) => { + let i = if !idx.is_empty() && idx.bytes().all(|c| c.is_ascii_digit()) { + idx.parse().unwrap_or(0) + } else { + 0 + }; + (b, i) + } + Err(_) => (-1, 0), + } +} + +/// Return the most recently submitted job (highest job id). +/// Ties keep the first occurrence. Panics on an empty slice (callers +/// guarantee at least one candidate). +pub fn most_recent(jobs: &[Job]) -> &Job { + let mut best = &jobs[0]; + let mut best_key = jobid_key(&best.job_id); + for j in &jobs[1..] { + let k = jobid_key(&j.job_id); + if k > best_key { + best = j; + best_key = k; + } + } + best +} + +/// Resolve the jobid for `stop` / `jump` / `time`, verb-aware (see above). +/// +/// Order: explicit arg > inside-allocation ($SLURM_JOB_ID) > squeue. From +/// squeue, a single candidate is used; several are auto-resolved to the most +/// recent for read/attach verbs, or returned as `ambiguous` for stop. +/// +/// Errors if the squeue query fails (the explicit-arg and inside-allocation +/// paths short-circuit before any squeue call, so they never do). `env` +/// substitutes for the process environment in tests; `None` reads the real +/// one. 
+pub fn resolve_jobid( + arg: Option<&str>, + verb: Verb, + user: Option<&str>, + env: Option<&HashMap>, + runner: Runner, +) -> Result { + let inside_id: Option = match env { + Some(map) => map.get("SLURM_JOB_ID").cloned(), + None => std::env::var("SLURM_JOB_ID").ok(), + } + .filter(|v| !v.is_empty()); + let inside = inside_id.is_some(); + + if let Some(a) = arg.filter(|a| !a.is_empty()) { + return Ok(Resolution { + job_id: Some(a.to_string()), + source: "arg", + inside, + inside_job_id: inside_id, + ..Default::default() + }); + } + if let Some(id) = inside_id.clone() { + return Ok(Resolution { + job_id: Some(id), + source: "inside", + inside: true, + inside_job_id: inside_id, + ..Default::default() + }); + } + + let jobs = squeue_user_jobs(user, runner)?; + let candidates: Vec = if verb == Verb::Jump { + jobs.iter() + .filter(|j| j.state == "RUNNING") + .cloned() + .collect() + } else { + jobs.clone() + }; + + if candidates.is_empty() { + // For jump, distinguish "you have jobs but none running" from "no jobs". + let err = if verb == Verb::Jump && !jobs.is_empty() { + "no running job to attach to (jobs exist but none are RUNNING)" + } else { + "no jobs found for the current user" + }; + return Ok(Resolution { + error: Some(err.to_string()), + candidates: jobs, + inside, + ..Default::default() + }); + } + + if candidates.len() == 1 { + return Ok(Resolution { + job_id: Some(candidates[0].job_id.clone()), + source: "single", + candidates, + inside, + inside_job_id: inside_id, + ..Default::default() + }); + } + + if verb == Verb::Stop { + // Never auto-pick which job to cancel. 
+ return Ok(Resolution { + ambiguous: true, + candidates, + inside, + inside_job_id: inside_id, + ..Default::default() + }); + } + + let chosen = most_recent(&candidates).job_id.clone(); + Ok(Resolution { + job_id: Some(chosen), + source: "most-recent", + candidates, + inside, + inside_job_id: inside_id, + ..Default::default() + }) +} + +// --- salloc / scancel / srun argv builders --------------------------------- + +/// Build the argv for `salloc --no-shell` from a template + CLI passthrough. +pub fn salloc_argv(template: &JobTemplate, passthrough: &[String]) -> Vec { + let mut argv = vec![ + "salloc".to_string(), + "--no-shell".to_string(), + "-J".to_string(), + format!("solx-{}", template.name), + "-p".to_string(), + template.partition.clone(), + "-t".to_string(), + template.time.clone(), + ]; + if let Some(qos) = &template.qos { + argv.push("-q".to_string()); + argv.push(qos.clone()); + } + if let Some(gres) = &template.gres { + argv.push(format!("--gres={gres}")); + } + argv.extend(template.extra_args.iter().cloned()); + argv.extend(passthrough.iter().cloned()); + argv +} + +pub fn scancel_argv(job_id: &str) -> Vec { + vec!["scancel".to_string(), job_id.to_string()] +} + +/// Argv for attaching a pty shell to a running allocation. +/// +/// `--overlap` lets the step share the allocation's resources with steps +/// already running in it. Without it, srun demands exclusive use of the node +/// and stalls with "step creation temporarily disabled (Requested nodes are +/// busy)" whenever the job already has a step occupying its resources. 
+pub fn srun_pty_argv(job_id: &str, shell: &str) -> Vec { + vec![ + "srun".to_string(), + format!("--jobid={job_id}"), + "--overlap".to_string(), + "--pty".to_string(), + shell.to_string(), + ] +} + +pub fn squeue_time_left_argv(job_id: &str) -> Vec { + ["squeue", "-h", "-j", job_id, "-O", "TimeLeft"] + .iter() + .map(|s| s.to_string()) + .collect() +} + +// --- salloc execution ------------------------------------------------------- + +/// Extract the jobid from `salloc`'s stderr `Granted job allocation N` line. +pub fn parse_granted_jobid(stderr_text: &str) -> Result { + const NEEDLE: &str = "Granted job allocation "; + let mut search = stderr_text; + while let Some(pos) = search.find(NEEDLE) { + let after = &search[pos + NEEDLE.len()..]; + let digits: String = after.chars().take_while(|c| c.is_ascii_digit()).collect(); + if !digits.is_empty() { + return Ok(digits); + } + search = after; + } + Err(SlurmError(format!( + "could not parse jobid from salloc output:\n{stderr_text}" + ))) +} + +/// Join argv for display, quoting like Python's `shlex.join`: a token is +/// quoted only when it contains a character outside `[A-Za-z0-9_@%+=:,./-]` +/// (so `=`-style flags like `--gres=gpu:a100:1` stay bare), using single +/// quotes with embedded `'` rendered as `'"'"'`. +pub fn shell_join(argv: &[String]) -> String { + argv.iter() + .map(|s| shlex_quote(s)) + .collect::>() + .join(" ") +} + +fn shlex_quote(s: &str) -> String { + let safe = |c: char| c.is_ascii_alphanumeric() || "_@%+=:,./-".contains(c); + if !s.is_empty() && s.chars().all(safe) { + s.to_string() + } else { + format!("'{}'", s.replace('\'', "'\"'\"'")) + } +} + +/// Invoke salloc and return the granted jobid. +/// +/// `salloc --no-shell` blocks until the allocation lands, then exits. If the +/// queue stalls beyond `timeout_seconds`, the process is killed and a +/// [`SlurmError`] surfaces a clear timeout instead of a hang. A `runner` +/// (tests) bypasses the subprocess and timeout entirely. 
+pub fn run_salloc( + argv: &[String], + timeout_seconds: i64, + runner: Option, +) -> Result { + if let Some(run) = runner { + let (code, _, err) = run(argv); + if code != 0 { + return Err(SlurmError(format!("salloc failed: {}", err.trim()))); + } + return parse_granted_jobid(&err); + } + + let timeout_err = || { + SlurmError(format!( + "salloc timed out after {timeout_seconds}s waiting for the queue. \ + Cancel the request manually if needed; the request may still be \ + queued. Argv: {}", + shell_join(argv) + )) + }; + + let mut child = Command::new(&argv[0]) + .args(&argv[1..]) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| SlurmError(format!("salloc failed: {e}")))?; + + // Drain the pipes on threads so a chatty salloc can't dead-lock against + // a full pipe buffer while we poll for exit. + let mut stdout_pipe = child.stdout.take().expect("stdout piped"); + let mut stderr_pipe = child.stderr.take().expect("stderr piped"); + let out_thread = std::thread::spawn(move || { + let mut buf = Vec::new(); + let _ = stdout_pipe.read_to_end(&mut buf); + buf + }); + let err_thread = std::thread::spawn(move || { + let mut buf = Vec::new(); + let _ = stderr_pipe.read_to_end(&mut buf); + buf + }); + + let deadline = Instant::now() + Duration::from_secs(timeout_seconds.max(0) as u64); + let status = loop { + match child.try_wait() { + Ok(Some(status)) => break status, + Ok(None) => { + if Instant::now() >= deadline { + let _ = child.kill(); + let _ = child.wait(); + return Err(timeout_err()); + } + std::thread::sleep(Duration::from_millis(25)); + } + Err(e) => return Err(SlurmError(format!("salloc failed: {e}"))), + } + }; + let _stdout = out_thread.join().unwrap_or_default(); + let stderr = String::from_utf8_lossy(&err_thread.join().unwrap_or_default()).into_owned(); + + if !status.success() { + return Err(SlurmError(format!( + "salloc failed (exit {}):\n{}", + status.code().unwrap_or(1), + stderr.trim() + ))); + } + 
parse_granted_jobid(&stderr) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::RefCell; + + fn runner_of( + code: i32, + stdout: &str, + stderr: &str, + ) -> impl Fn(&[String]) -> (i32, String, String) { + let stdout = stdout.to_string(); + let stderr = stderr.to_string(); + move |_argv: &[String]| (code, stdout.clone(), stderr.clone()) + } + + fn env(pairs: &[(&str, &str)]) -> HashMap { + pairs + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + fn job(id: &str) -> Job { + Job { + job_id: id.to_string(), + name: "a".to_string(), + state: "RUNNING".to_string(), + time_used: String::new(), + time_left: String::new(), + partition: "p".to_string(), + node_list: String::new(), + } + } + + // ---- squeue ------------------------------------------------------------ + + #[test] + fn squeue_user_jobs_parses_rows() { + let out = "12345|solx-default|RUNNING|00:05:23|00:54:37|lightwork|sg045\n\ + 12346|notebook|PENDING|00:00:00|01:00:00|htc|(Resources)\n"; + let captured: RefCell>> = RefCell::new(Vec::new()); + let runner = |argv: &[String]| { + captured.borrow_mut().push(argv.to_vec()); + (0, out.to_string(), String::new()) + }; + let jobs = squeue_user_jobs(Some("sparky"), &runner).unwrap(); + assert_eq!(jobs.len(), 2); + assert_eq!( + jobs[0], + Job { + job_id: "12345".to_string(), + name: "solx-default".to_string(), + state: "RUNNING".to_string(), + time_used: "00:05:23".to_string(), + time_left: "00:54:37".to_string(), + partition: "lightwork".to_string(), + node_list: "sg045".to_string(), + } + ); + let argv = &captured.borrow()[0]; + assert!(argv.contains(&"-u".to_string()) && argv.contains(&"sparky".to_string())); + } + + #[test] + fn squeue_user_jobs_empty() { + let runner = runner_of(0, "", ""); + assert!(squeue_user_jobs(Some("sparky"), &runner) + .unwrap() + .is_empty()); + } + + #[test] + fn squeue_user_jobs_failure() { + let runner = runner_of(1, "", "slurmctld is down"); + let err = squeue_user_jobs(Some("sparky"), 
&runner).unwrap_err(); + assert_eq!(err.0, "squeue failed: slurmctld is down"); + } + + #[test] + fn squeue_row_too_short_is_error() { + let err = Job::from_squeue_row("only|three|fields").unwrap_err(); + assert!(err.0.starts_with("unexpected squeue row: ")); + } + + // ---- resolve_jobid ----------------------------------------------------- + + const TWO_RUNNING: &str = "12345|solx-default|RUNNING|00:01:00|00:59:00|lightwork|sg045\n\ + 67890|notebook|RUNNING|00:01:00|00:59:00|htc|sg010\n"; + + #[test] + fn resolve_arg_wins() { + let called = RefCell::new(false); + let runner = |_argv: &[String]| { + *called.borrow_mut() = true; + (0, String::new(), String::new()) + }; + let res = resolve_jobid( + Some("99999"), + Verb::Stop, + None, + Some(&env(&[("SLURM_JOB_ID", "11111")])), + &runner, + ) + .unwrap(); + assert_eq!(res.job_id.as_deref(), Some("99999")); + assert_eq!(res.source, "arg"); + assert!(res.inside); + assert_eq!(res.inside_job_id.as_deref(), Some("11111")); + assert!(!*called.borrow()); // never queried squeue + } + + #[test] + fn resolve_uses_env_on_compute_node() { + let called = RefCell::new(false); + let runner = |_argv: &[String]| { + *called.borrow_mut() = true; + (0, String::new(), String::new()) + }; + let res = resolve_jobid( + None, + Verb::Time, + None, + Some(&env(&[("SLURM_JOB_ID", "55555")])), + &runner, + ) + .unwrap(); + assert_eq!(res.job_id.as_deref(), Some("55555")); + assert_eq!(res.source, "inside"); + assert!(res.acting_on_current()); + assert!(!*called.borrow()); + } + + #[test] + fn resolve_single_running_job() { + let runner = runner_of( + 0, + "12345|solx-default|RUNNING|00:01:00|00:59:00|lightwork|sg045\n", + "", + ); + let res = + resolve_jobid(None, Verb::Stop, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + assert_eq!(res.job_id.as_deref(), Some("12345")); + assert_eq!(res.source, "single"); + assert!(!res.ambiguous); + } + + #[test] + fn resolve_zero_jobs() { + let runner = runner_of(0, "", ""); + let res = + 
resolve_jobid(None, Verb::Time, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + assert!(res.job_id.is_none()); + assert!(res.error.as_deref().unwrap().contains("no jobs found")); + } + + #[test] + fn resolve_stop_ambiguous_no_autopick() { + let runner = runner_of(0, TWO_RUNNING, ""); + let res = + resolve_jobid(None, Verb::Stop, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + assert!(res.job_id.is_none()); + assert!(res.ambiguous); + let ids: Vec<&str> = res.candidates.iter().map(|j| j.job_id.as_str()).collect(); + assert_eq!(ids, ["12345", "67890"]); + } + + #[test] + fn resolve_time_picks_most_recent() { + let runner = runner_of(0, TWO_RUNNING, ""); + let res = + resolve_jobid(None, Verb::Time, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + assert_eq!(res.job_id.as_deref(), Some("67890")); // highest jobid == most recent + assert_eq!(res.source, "most-recent"); + assert!(!res.ambiguous); + } + + #[test] + fn resolve_jump_filters_running_only() { + let out = "12345|a|RUNNING|00:01|00:59|p|sg045\n\ + 67890|b|PENDING|00:00|01:00|p|(Resources)\n"; + let runner = runner_of(0, out, ""); + let res = + resolve_jobid(None, Verb::Jump, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + // Only the RUNNING job is an attach candidate -> unambiguous. 
+ assert_eq!(res.job_id.as_deref(), Some("12345")); + assert_eq!(res.source, "single"); + } + + #[test] + fn resolve_jump_no_running() { + let runner = runner_of(0, "67890|b|PENDING|00:00|01:00|p|(Resources)\n", ""); + let res = + resolve_jobid(None, Verb::Jump, Some("sparky"), Some(&env(&[])), &runner).unwrap(); + assert!(res.job_id.is_none()); + assert!(res.error.as_deref().unwrap().contains("no running job")); + } + + #[test] + fn resolve_squeue_failure_propagates() { + let runner = runner_of(1, "", "boom"); + let err = + resolve_jobid(None, Verb::Time, Some("sparky"), Some(&env(&[])), &runner).unwrap_err(); + assert_eq!(err.0, "squeue failed: boom"); + } + + #[test] + fn resolve_empty_slurm_job_id_is_not_inside() { + let runner = runner_of(0, "12345|a|RUNNING|0:01|0:59|p|sg045\n", ""); + let res = resolve_jobid( + None, + Verb::Time, + Some("sparky"), + Some(&env(&[("SLURM_JOB_ID", "")])), + &runner, + ) + .unwrap(); + assert_eq!(res.source, "single"); + assert!(!res.inside); + } + + #[test] + fn most_recent_highest_jobid() { + let jobs = vec![job("100"), job("9999"), job("250")]; + assert_eq!(most_recent(&jobs).job_id, "9999"); + } + + #[test] + fn most_recent_array_ids() { + let jobs = vec![job("100_1"), job("100_7")]; + assert_eq!(most_recent(&jobs).job_id, "100_7"); + } + + #[test] + fn most_recent_non_numeric_sorts_first() { + let jobs = vec![job("abc"), job("5")]; + assert_eq!(most_recent(&jobs).job_id, "5"); + } + + // ---- argv builders ----------------------------------------------------- + + #[test] + fn salloc_argv_minimal() { + let t = JobTemplate { + name: "default".to_string(), + partition: "lightwork".to_string(), + time: "1-0".to_string(), + qos: None, + gres: None, + extra_args: vec![], + }; + assert_eq!( + salloc_argv(&t, &[]), + [ + "salloc", + "--no-shell", + "-J", + "solx-default", + "-p", + "lightwork", + "-t", + "1-0" + ] + ); + } + + #[test] + fn salloc_argv_full() { + let t = JobTemplate { + name: "gpu".to_string(), + partition: 
"public".to_string(), + time: "0-4".to_string(), + qos: Some("public".to_string()), + gres: Some("gpu:a100:1".to_string()), + extra_args: vec!["--mem=64G".to_string(), "--cpus-per-task=8".to_string()], + }; + assert_eq!( + salloc_argv(&t, &["--mail-type=END".to_string()]), + [ + "salloc", + "--no-shell", + "-J", + "solx-gpu", + "-p", + "public", + "-t", + "0-4", + "-q", + "public", + "--gres=gpu:a100:1", + "--mem=64G", + "--cpus-per-task=8", + "--mail-type=END", + ] + ); + } + + #[test] + fn scancel_argv_shape() { + assert_eq!(scancel_argv("12345"), ["scancel", "12345"]); + } + + #[test] + fn srun_pty_argv_shape() { + // --overlap lets the step share the allocation's busy resources. + assert_eq!( + srun_pty_argv("12345", "zsh"), + ["srun", "--jobid=12345", "--overlap", "--pty", "zsh"] + ); + } + + #[test] + fn squeue_time_left_argv_shape() { + assert_eq!( + squeue_time_left_argv("12345"), + ["squeue", "-h", "-j", "12345", "-O", "TimeLeft"] + ); + } + + // ---- salloc parse + run ------------------------------------------------ + + #[test] + fn parse_granted_jobid_ok() { + let text = "salloc: Pending job allocation 51642835\n\ + salloc: job 51642835 queued and waiting for resources\n\ + salloc: job 51642835 has been allocated resources\n\ + salloc: Granted job allocation 51642835\n"; + assert_eq!(parse_granted_jobid(text).unwrap(), "51642835"); + } + + #[test] + fn parse_granted_jobid_missing() { + let err = parse_granted_jobid("salloc: error: queue down\n").unwrap_err(); + assert!(err.0.starts_with("could not parse")); + } + + #[test] + fn run_salloc_success_via_runner() { + let captured: RefCell>> = RefCell::new(Vec::new()); + let runner = |argv: &[String]| { + captured.borrow_mut().push(argv.to_vec()); + ( + 0, + String::new(), + "salloc: Granted job allocation 99999\n".to_string(), + ) + }; + let argv: Vec = vec!["salloc".to_string(), "--no-shell".to_string()]; + let jid = run_salloc(&argv, 60, Some(&runner)).unwrap(); + assert_eq!(jid, "99999"); + 
assert_eq!(captured.borrow()[0], argv); + } + + #[test] + fn run_salloc_failure_via_runner() { + let runner = runner_of(1, "", "salloc: error: invalid partition\n"); + let err = run_salloc(&["salloc".to_string()], 60, Some(&runner)).unwrap_err(); + assert!(err.0.contains("invalid partition")); + } + + #[test] + fn shell_join_plain_tokens() { + let argv: Vec = ["salloc", "--no-shell", "-J", "solx-default"] + .iter() + .map(|s| s.to_string()) + .collect(); + assert_eq!(shell_join(&argv), "salloc --no-shell -J solx-default"); + } + + #[test] + fn shell_join_keeps_equals_tokens_bare() { + // The gpu-template argv: every `=`/`:`-bearing token stays unquoted, + // matching Python's shlex.join. + let argv: Vec = [ + "salloc", + "--no-shell", + "-J", + "solx-gpu", + "-p", + "public", + "-t", + "0-4", + "--gres=gpu:a100:1", + "--mem=64G", + "--cpus-per-task=8", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + assert_eq!( + shell_join(&argv), + "salloc --no-shell -J solx-gpu -p public -t 0-4 \ + --gres=gpu:a100:1 --mem=64G --cpus-per-task=8" + ); + } + + #[test] + fn shell_join_quotes_unsafe_tokens() { + let argv: Vec = ["echo", "a b", "", "it's", "a*b"] + .iter() + .map(|s| s.to_string()) + .collect(); + assert_eq!(shell_join(&argv), r#"echo 'a b' '' 'it'"'"'s' 'a*b'"#); + } +} diff --git a/solx/src/solx/__init__.py b/solx/src/solx/__init__.py deleted file mode 100644 index 1082c66..0000000 --- a/solx/src/solx/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""solx — CLI for ASU's Sol supercomputer.""" - -__version__ = "0.5.1" diff --git a/solx/src/solx/__main__.py b/solx/src/solx/__main__.py deleted file mode 100644 index 43b2876..0000000 --- a/solx/src/solx/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Allow `python -m solx` invocation.""" -from solx.main import main - - -if __name__ == "__main__": - main() diff --git a/solx/src/solx/_completions.py b/solx/src/solx/_completions.py deleted file mode 100644 index 6fff6e3..0000000 --- a/solx/src/solx/_completions.py +++ 
/dev/null @@ -1,502 +0,0 @@ -"""Static shell completion scripts for solx (bash, zsh, fish). - -One data structure (`COMMANDS`) mirrors the CLI surface: its commands, -subcommands, and flags correspond one-to-one to `main.py`'s parser tree -(a pinning test in `tests/test_completions.py` walks both and fails on any -drift), while the descriptions are one-line summaries of the parser's help -strings. Each `*_script()` function renders the table into a fully static -script: nothing shells back into solx at completion time, so the first Tab -of a session costs no interpreter start. - -The zsh script works in both install modes: eval/source (`compdef` registers -the completer) and fpath autoload (`solx completions zsh > ~/.zfunc/_solx`, -where compinit loads the file body *as* the completer, so it must call -itself). A footer keyed on `$zsh_eval_context` picks the right branch. -""" -from __future__ import annotations - -# Flag value kinds: None (boolean), "file"/"dir" (filesystem paths), "value" -# (free-form argument), or a tuple of literal choices. -Flag = tuple[tuple[str, ...], "str | tuple[str, ...] 
| None", str] - -_JSON: Flag = (("--json",), None, "Force JSON output (machine-readable).") -HELP_FLAG: Flag = (("-h", "--help"), None, "Show this help message and exit.") - -STAGE_CHOICES = ("all", "pending", "over90", "inactive") -SHELL_CHOICES = ("bash", "zsh", "fish") - -GLOBAL_FLAGS: list[Flag] = [ - HELP_FLAG, - (("--version",), None, "Show version and exit."), - _JSON, -] - -# command -> {"help": str, "flags": [Flag], "positional": (label, choices|None), -# "sub": {subcommand -> same shape}} -COMMANDS: dict[str, dict] = { - "init": { - "help": "Write a starter config.toml.", - "flags": [ - (("-f", "--force", "-y", "--yes"), None, "Overwrite without prompting."), - _JSON, - ], - }, - "keep": { - "help": "Renew CSV-flagged scratch files filtered by the keep block in config.", - "flags": [ - (("--stage",), STAGE_CHOICES, "Which warning CSVs to read."), - (("--csv-dir",), "dir", "Directory holding Sol's warning CSVs."), - (("--solkeep",), "file", "Path to a gitignore-style keep-list."), - (("-j", "--jobs"), "value", "Parallel touch workers."), - (("-y", "--yes", "-f", "--force"), None, "Skip confirmation prompt."), - (("-n", "--dry-run"), None, "Print plan without executing."), - (("-v", "--verbose"), None, "Verbose plan + progress."), - _JSON, - ], - }, - "jump": { - "help": "Drop into a shell on the job's compute node (= solx job jump).", - "positional": ("jobid", None), - "flags": [ - (("-q", "--quiet"), None, "Suppress the nesting / most-recent heads-up."), - _JSON, - ], - }, - "job": { - "help": "Manage interactive Slurm jobs on Sol (alias: jobs).", - "sub": { - "list": { - "help": "Print my Sol jobs.", - "flags": [_JSON], - }, - "start": { - "help": "Start an interactive allocation from a config template.", - "positional": ("template", None), - "flags": [ - (("-n", "--dry-run"), None, "Print salloc argv without submitting."), - (("--timeout",), "value", 'Override start_timeout (e.g. 
"5m", "1h").'), - ], - }, - "stop": { - "help": "Cancel a job (prompts unless -y).", - "positional": ("jobid", None), - "flags": [ - (("-y", "--yes", "-f", "--force"), None, "Skip confirmation prompt."), - (("-n", "--dry-run"), None, "Print scancel argv without executing."), - _JSON, - ], - }, - "jump": { - "help": "Drop into a shell on the job's compute node.", - "positional": ("jobid", None), - "flags": [ - (("-q", "--quiet"), None, "Suppress the nesting / most-recent heads-up."), - _JSON, - ], - }, - "time": { - "help": "Print remaining time (D-HH:MM:SS).", - "positional": ("jobid", None), - "flags": [_JSON], - }, - }, - }, - "config": { - "help": "Inspect and edit the solx config.", - "sub": { - "show": { - "help": "Print the resolved config.", - "flags": [(("--json",), None, "Emit JSON.")], - }, - "edit": { - "help": "Open the config in $EDITOR.", - "flags": [], - }, - "import-solkeep": { - "help": "Migrate a legacy ~/.solkeep keep-list into the config's keep block.", - "flags": [ - (("--solkeep",), "file", "Keep-list to import (default: ~/.solkeep)."), - (("-f", "--force"), None, "Accept a lossy import."), - _JSON, - ], - }, - }, - }, - "completions": { - "help": "Emit a shell completion script (bash, zsh, or fish).", - "positional": ("shell", SHELL_CHOICES), - "flags": [], - }, - "version": { - "help": "Show version and exit (alias of --version).", - "flags": [], - }, - "help": { - "help": "Show help and exit (alias of --help).", - "flags": [], - }, -} - - -def _flag_words(flags: list[Flag]) -> list[str]: - return [form for forms, _value, _help in flags for form in forms] - - -# --- bash -------------------------------------------------------------------- - - -def bash_script() -> str: - top = " ".join(COMMANDS) - group_arms: list[str] = [] - leaf_arms: list[str] = [] - for name, spec in COMMANDS.items(): - if "sub" in spec: - subs = spec["sub"] - sub_arms = "\n".join( - f' {sname}) flags="{" ".join([*_flag_words(sspec.get("flags", [])), "-h", "--help"])}" 
;;' - for sname, sspec in subs.items() - ) - pattern = f"{name}|jobs" if name == "job" else name - group_arms.append( - f""" {pattern}) - if [[ -z "$sub" ]]; then - if [[ "$cur" != -* ]]; then - mapfile -t COMPREPLY < <(compgen -W "{" ".join(subs)}" -- "$cur") - return - fi - flags="-h --help" - fi - case "$sub" in -{sub_arms} - esac - ;;""" - ) - else: - words = _flag_words(spec.get("flags", [])) - pos = spec.get("positional") - choices = "" - if pos and isinstance(pos[1], tuple): - choices = " ".join(pos[1]) - leaf_arms.append( - f""" {name}) - flags="{" ".join([*words, "-h", "--help"])}" - words="{choices}" - ;;""" - ) - arms = "\n".join(leaf_arms + group_arms) - return f"""\ -# bash completion for solx -_solx() {{ - local cur prev - COMPREPLY=() - cur="${{COMP_WORDS[COMP_CWORD]}}" - prev="${{COMP_WORDS[COMP_CWORD-1]}}" - - # On a mid-word Tab, COMP_WORDS carries the whole word; complete against - # only the part left of the cursor. - if [[ -n "${{COMP_LINE-}}" ]]; then - local left="${{COMP_LINE:0:COMP_POINT}}" - while [[ -n "$cur" && "${{left%"$cur"}}" == "$left" ]]; do - cur="${{cur%?}}" - done - fi - - # First two non-flag words decide the (sub)command context. - local i word cmd="" sub="" - for ((i = 1; i < COMP_CWORD; i++)); do - word="${{COMP_WORDS[i]}}" - [[ "$word" == -* ]] && continue - if [[ -z "$cmd" ]]; then - cmd="$word" - elif [[ -z "$sub" ]]; then - sub="$word" - fi - done - - # Option values. Path candidates go through mapfile (no word splitting, - # no glob expansion — spaces and metacharacters survive) and `compopt -o - # filenames` (where available) so readline escapes what it inserts. 
- case "$prev" in - --csv-dir) - type compopt &> /dev/null && compopt -o filenames 2> /dev/null - mapfile -t COMPREPLY < <(compgen -d -- "$cur") - return - ;; - --solkeep) - type compopt &> /dev/null && compopt -o filenames 2> /dev/null - mapfile -t COMPREPLY < <(compgen -f -- "$cur") - return - ;; - --stage) - mapfile -t COMPREPLY < <(compgen -W "{" ".join(STAGE_CHOICES)}" -- "$cur") - return - ;; - -j|--jobs|--timeout) - return - ;; - esac - - if [[ -z "$cmd" ]]; then - if [[ "$cur" == -* ]]; then - mapfile -t COMPREPLY < <(compgen -W "{" ".join(_flag_words(GLOBAL_FLAGS))}" -- "$cur") - else - mapfile -t COMPREPLY < <(compgen -W "{top}" -- "$cur") - fi - return - fi - - local flags="" words="" - case "$cmd" in -{arms} - esac - if [[ "$cur" == -* ]]; then - mapfile -t COMPREPLY < <(compgen -W "$flags" -- "$cur") - elif [[ -n "$words" && -z "$sub" ]]; then - # $words holds positional choices; offer them only until the - # positional is filled. - mapfile -t COMPREPLY < <(compgen -W "$words" -- "$cur") - fi -}} - -complete -F _solx solx""" - - -# --- zsh --------------------------------------------------------------------- - - -def _zsh_q(text: str) -> str: - """Quote `text` for inclusion inside a zsh single-quoted string.""" - return text.replace("'", "'\\''") - - -def _zsh_desc(text: str) -> str: - """Sanitize a description for an `_arguments` `[...]` field.""" - return _zsh_q(text.replace("[", "").replace("]", "")) - - -def _zsh_item(name: str, desc: str) -> str: - """Render one `name:description` element for `_describe`.""" - escaped = _zsh_q(desc.replace(":", "\\:")) - return f"'{name}:{escaped}'" - - -def _zsh_flag_specs(flags: list[Flag]) -> list[str]: - specs: list[str] = [] - for forms, value, help_text in flags: - action = "" - if value == "file": - action = ":file:_files" - elif value == "dir": - action = ":directory:_files -/" - elif value == "value": - action = ":value:" - elif isinstance(value, tuple): - action = f":value:({' '.join(value)})" - desc = 
f"[{_zsh_desc(help_text)}]" - if len(forms) == 1: - specs.append(f"'{forms[0]}{desc}{action}'") - else: - exclusion = " ".join(forms) - brace = ",".join(forms) - specs.append(f"'({exclusion})'{{{brace}}}'{desc}{action}'") - return specs - - -def _zsh_leaf_arguments(spec: dict, indent: str) -> str: - """Render the `_arguments` call for a leaf (sub)command.""" - parts = _zsh_flag_specs(spec.get("flags", [])) - parts.append("'(-h --help)'{-h,--help}'[Show this help message and exit.]'") - pos = spec.get("positional") - if pos is not None: - label, choices = pos - action = f"({' '.join(choices)})" if isinstance(choices, tuple) else "" - parts.append(f"'1:{label}:{action}'") - joined = f" \\\n{indent} ".join(parts) - return f"{indent}_arguments \\\n{indent} {joined}" - - -def _zsh_group_fn(name: str, spec: dict) -> str: - subs = spec["sub"] - items = "\n ".join( - _zsh_item(sname, sspec["help"]) for sname, sspec in subs.items() - ) - arms = [] - for sname, sspec in subs.items(): - arms.append( - f" ({sname})\n" - + _zsh_leaf_arguments(sspec, " ") - + "\n ;;" - ) - arms_text = "\n".join(arms) - return f"""\ -_solx_{name}() {{ - local curcontext="$curcontext" state line - typeset -A opt_args - - _arguments -C \\ - '(-h --help)'{{-h,--help}}'[Show this help message and exit.]' \\ - '1: :->subcommand' \\ - '*:: :->subargs' - - case $state in - (subcommand) - local -a subcommands - subcommands=( - {items} - ) - _describe -t commands 'solx {name} command' subcommands - ;; - (subargs) - case $words[1] in -{arms_text} - esac - ;; - esac -}}""" - - -def zsh_script() -> str: - group_fns = [ - _zsh_group_fn(name, spec) for name, spec in COMMANDS.items() if "sub" in spec - ] - items = "\n ".join( - _zsh_item(name, spec["help"]) for name, spec in COMMANDS.items() - ) - arms = [] - for name, spec in COMMANDS.items(): - if "sub" in spec: - pattern = f"({name}|jobs)" if name == "job" else f"({name})" - arms.append(f" {pattern}\n _solx_{name}\n ;;") - else: - arms.append( - f" 
({name})\n" - + _zsh_leaf_arguments(spec, " ") - + "\n ;;" - ) - arms_text = "\n".join(arms) - group_fns_text = "\n\n".join(group_fns) - body = f"""\ -#compdef solx - -{group_fns_text} - -_solx() {{ - local curcontext="$curcontext" state line - typeset -A opt_args - - _arguments -C \\ - '(-h --help)'{{-h,--help}}'[Show this help message and exit.]' \\ - '--version[Show version and exit.]' \\ - '--json[Force JSON output (machine-readable).]' \\ - '1: :->command' \\ - '*:: :->args' - - case $state in - (command) - local -a commands - commands=( - {items} - ) - _describe -t commands 'solx command' commands - ;; - (args) - case $words[1] in -{arms_text} - esac - ;; - esac -}} - -if [[ $zsh_eval_context[-1] == loadautofunc ]]; then - # autoload from fpath, call function directly - _solx "$@" -else - # eval/source/. command, register function for later - compdef _solx solx -fi""" - return body - - -# --- fish -------------------------------------------------------------------- - - -def _fish_q(text: str) -> str: - """Quote `text` for a fish single-quoted string.""" - return text.replace("\\", "\\\\").replace("'", "\\'") - - -def _fish_flag_lines(flags: list[Flag], condition: str) -> list[str]: - lines: list[str] = [] - for forms, value, help_text in flags: - opts = " ".join( - f"-s {form.lstrip('-')}" if not form.startswith("--") else f"-l {form[2:]}" - for form in forms - ) - extra = "" - if value in ("file", "dir"): - extra = " -r -F" - elif value == "value": - extra = " -x" - elif isinstance(value, tuple): - extra = f" -x -a '{' '.join(value)}'" - lines.append( - f"complete -c solx -n '{condition}' {opts}{extra} -d '{_fish_q(help_text)}'" - ) - return lines - - -def fish_script() -> str: - lines = [ - "# fish completion for solx", - "complete -c solx -f", - ] - for forms, _value, help_text in GLOBAL_FLAGS: - opts = " ".join( - f"-s {form.lstrip('-')}" if not form.startswith("--") else f"-l {form[2:]}" - for form in forms - ) - lines.append( - f"complete -c solx -n 
__fish_use_subcommand {opts} -d '{_fish_q(help_text)}'" - ) - for name, spec in COMMANDS.items(): - lines.append( - f"complete -c solx -n __fish_use_subcommand -a {name} -d '{_fish_q(spec['help'])}'" - ) - if "sub" in spec: - seen = f"__fish_seen_subcommand_from {name}" - if name == "job": - seen = "__fish_seen_subcommand_from job jobs" - subnames = " ".join(spec["sub"]) - # Group level (no subcommand picked yet): only -h/--help. - lines.extend( - _fish_flag_lines( - [HELP_FLAG], - f"{seen}; and not __fish_seen_subcommand_from {subnames}", - ) - ) - for sname, sspec in spec["sub"].items(): - lines.append( - f"complete -c solx -n '{seen}; and not __fish_seen_subcommand_from {subnames}' " - f"-a {sname} -d '{_fish_q(sspec['help'])}'" - ) - lines.extend( - _fish_flag_lines( - [*sspec.get("flags", []), HELP_FLAG], - f"{seen}; and __fish_seen_subcommand_from {sname}", - ) - ) - else: - condition = f"__fish_seen_subcommand_from {name}" - lines.extend( - _fish_flag_lines([*spec.get("flags", []), HELP_FLAG], condition) - ) - pos = spec.get("positional") - if pos is not None and isinstance(pos[1], tuple): - choices = " ".join(pos[1]) - # Offer the positional's choices only until one is given. - lines.append( - f"complete -c solx " - f"-n '{condition}; and not __fish_seen_subcommand_from {choices}' " - f"-a '{choices}'" - ) - return "\n".join(lines) diff --git a/solx/src/solx/config.py b/solx/src/solx/config.py deleted file mode 100644 index 7ae8d59..0000000 --- a/solx/src/solx/config.py +++ /dev/null @@ -1,426 +0,0 @@ -"""Single-file config under $XDG_CONFIG_HOME/solx/config.toml. - -The user runs `solx init` to write a starter file; everything else just -reads it. No `[shared]` merge — each `[jobs.]` table is -self-contained, which keeps the schema obvious at the cost of repeating -a flag across templates if someone really wants that. 
-""" -from __future__ import annotations - -import os -import re -from dataclasses import dataclass, field - -try: - import tomllib # Python 3.11+ -except ModuleNotFoundError: # Python 3.10 — backport - import tomli as tomllib -from pathlib import Path -from typing import TYPE_CHECKING - -# pathspec is imported where the [keep] specs are compiled (not here) so that -# importing this module stays cheap on NFS; most commands load config without -# ever touching keep rules. -if TYPE_CHECKING: - import pathspec - - -CONFIG_FILENAME = "config.toml" -DEFAULT_START_TIMEOUT = "10m" - - -class ConfigError(Exception): - """Raised for any user-facing config problem (missing file, bad schema).""" - - -@dataclass(frozen=True) -class JobTemplate: - """One `[jobs.]` table.""" - - name: str - partition: str - time: str - qos: str | None = None - gres: str | None = None - extra_args: tuple[str, ...] = () - - -@dataclass(frozen=True) -class KeepRules: - """Resolved `[keep]` include/exclude as compiled pathspecs.""" - - include: pathspec.PathSpec - exclude: pathspec.PathSpec - raw_include: tuple[str, ...] = () - raw_exclude: tuple[str, ...] = () - - def matches(self, path: str) -> bool: - """Return True if `path` is included and not excluded.""" - if not self.include.match_file(path): - return False - return not self.exclude.match_file(path) - - -@dataclass(frozen=True) -class Config: - default_shell: str - default_template: str - start_timeout_seconds: int - templates: dict[str, JobTemplate] = field(default_factory=dict) - keep: KeepRules | None = None - - def template(self, name: str) -> JobTemplate: - """Look up a template by name; raise ConfigError if missing.""" - if name not in self.templates: - available = ", ".join(sorted(self.templates)) or "(none)" - raise ConfigError( - f"unknown job template {name!r}. 
defined: {available}" - ) - return self.templates[name] - - -def config_path() -> Path: - """Resolve the config path honoring XDG_CONFIG_HOME with the usual fallback.""" - base = os.environ.get("XDG_CONFIG_HOME") or str(Path.home() / ".config") - return Path(base) / "solx" / CONFIG_FILENAME - - -def load(path: Path | None = None) -> Config: - """Load and validate the config from `path` (defaults to `config_path()`).""" - p = path or config_path() - if not p.exists(): - raise ConfigError( - f"no config at {p}. run `solx init` to write a starter file." - ) - try: - with p.open("rb") as f: - raw = tomllib.load(f) - except tomllib.TOMLDecodeError as e: - raise ConfigError(f"invalid TOML in {p}: {e}") from e - except OSError as e: - # Unreadable file (permissions, a directory in its place, I/O error): - # surface a clean config error instead of a traceback. - raise ConfigError(f"unable to read config at {p}: {e}") from e - return _parse(raw, source=str(p)) - - -def _parse(raw: dict, *, source: str) -> Config: - default_shell = _require_str(raw, "default_shell", source) - default_template = _require_str(raw, "default_template", source) - timeout_str = raw.get("start_timeout", DEFAULT_START_TIMEOUT) - if not isinstance(timeout_str, str): - raise ConfigError( - f"{source}: `start_timeout` must be a string like \"10m\"" - ) - start_timeout_seconds = parse_duration(timeout_str) - - jobs_raw = raw.get("jobs", {}) - if not isinstance(jobs_raw, dict) or not jobs_raw: - raise ConfigError( - f"{source}: at least one [jobs.] 
table is required" - ) - templates = { - name: _parse_template(name, body, source) - for name, body in jobs_raw.items() - } - if default_template not in templates: - raise ConfigError( - f"{source}: default_template={default_template!r} is not defined " - f"under [jobs.*]" - ) - - keep = _parse_keep(raw.get("keep"), source) - - return Config( - default_shell=default_shell, - default_template=default_template, - start_timeout_seconds=start_timeout_seconds, - templates=templates, - keep=keep, - ) - - -def _parse_template(name: str, body: object, source: str) -> JobTemplate: - if not isinstance(body, dict): - raise ConfigError(f"{source}: [jobs.{name}] must be a table") - partition = _require_str(body, "partition", f"{source}:[jobs.{name}]") - time = _require_str(body, "time", f"{source}:[jobs.{name}]") - qos = _optional_str(body, "qos", f"{source}:[jobs.{name}]") - gres = _optional_str(body, "gres", f"{source}:[jobs.{name}]") - extra_args = _optional_str_list(body, "extra_args", f"{source}:[jobs.{name}]") - return JobTemplate( - name=name, - partition=partition, - time=time, - qos=qos, - gres=gres, - extra_args=tuple(extra_args), - ) - - -def _parse_keep(body: object, source: str) -> KeepRules | None: - if body is None: - return None - if not isinstance(body, dict): - raise ConfigError(f"{source}: [keep] must be a table") - import pathspec - - include = _optional_str_list(body, "include", f"{source}:[keep]") - exclude = _optional_str_list(body, "exclude", f"{source}:[keep]") - if not include: - raise ConfigError( - f"{source}: [keep].include must be a non-empty array" - ) - return KeepRules( - include=pathspec.GitIgnoreSpec.from_lines(include), - exclude=pathspec.GitIgnoreSpec.from_lines(exclude), - raw_include=tuple(include), - raw_exclude=tuple(exclude), - ) - - -def load_solkeep(path: Path) -> KeepRules | None: - """Load a gitignore-style `~/.solkeep` keep-list into `KeepRules`. 
- - The legacy `~/.solkeep` format: each line is a keep pattern, `!` negates - (carves a subtree out), `#`/blank lines are ignored, a bare path matches - that directory *and everything under it*, and the last matching rule wins. - `pathspec`'s `GitIgnoreSpec` implements those semantics, so the whole file - becomes a single keep matcher (with an empty exclude). Returns None if the - file is missing or has no effective rules — so `solx keep` can fall through - to its "nothing to match" handling. `~/.solkeep` is a deprecated fallback - (see `keep.SOLKEEP_REMOVED_IN`); the supported home is the config `[keep]`. - """ - if not path.exists(): - return None - try: - lines = path.read_text().splitlines() - except OSError: - return None - import pathspec - - effective = [ln for ln in lines if ln.strip() and not ln.strip().startswith("#")] - if not effective: - return None - return KeepRules( - include=pathspec.GitIgnoreSpec.from_lines(lines), - exclude=pathspec.GitIgnoreSpec.from_lines([]), - raw_include=tuple(effective), - raw_exclude=(), - ) - - -def _require_str(body: dict, key: str, ctx: str) -> str: - if key not in body: - raise ConfigError(f"{ctx}: required key `{key}` is missing") - val = body[key] - if not isinstance(val, str) or not val: - raise ConfigError(f"{ctx}: `{key}` must be a non-empty string") - return val - - -def _optional_str(body: dict, key: str, ctx: str) -> str | None: - if key not in body: - return None - val = body[key] - if not isinstance(val, str) or not val: - raise ConfigError(f"{ctx}: `{key}` must be a non-empty string") - return val - - -def _optional_str_list(body: dict, key: str, ctx: str) -> list[str]: - if key not in body: - return [] - val = body[key] - if not isinstance(val, list) or any(not isinstance(x, str) for x in val): - raise ConfigError(f"{ctx}: `{key}` must be an array of strings") - return list(val) - - -_DURATION_RE = re.compile(r"^\s*(\d+)\s*([smh])\s*$", re.IGNORECASE) -_DURATION_UNITS = {"s": 1, "m": 60, "h": 3600} - - 
-def parse_duration(text: str) -> int: - """Parse a string like "10m" / "30s" / "1h" into seconds.""" - m = _DURATION_RE.match(text) - if not m: - raise ConfigError( - f"invalid duration {text!r}; use forms like \"30s\", \"10m\", \"1h\"" - ) - n = int(m.group(1)) - unit = m.group(2).lower() - return n * _DURATION_UNITS[unit] - - -def import_solkeep(path: Path) -> tuple[list[str], list[str]] | None: - """Split a `~/.solkeep` file into `([keep].include, [keep].exclude)`. - - `.solkeep` is one gitignore-style list; `solx init` imports it into the new - config's `[keep]` block so an existing keep-list carries over without - rewriting. Plain lines become `include`, `!`-prefixed lines become - `exclude` (the `!` dropped); `#`/blank lines are skipped. Returns None if - the file is missing or has no `include` patterns. This is a best-effort - import of the common "broad includes + `!` carve-outs" shape — review the - result with `solx config show`. - """ - if not path.exists(): - return None - try: - lines = path.read_text().splitlines() - except OSError: - return None - include: list[str] = [] - exclude: list[str] = [] - for raw in lines: - s = raw.strip() - if not s or s.startswith("#"): - continue - if s.startswith("!"): - carve = s[1:].strip() - if carve: # a bare `!` carves nothing — drop it rather than emit "" - exclude.append(carve) - else: - include.append(s) - if not include: # a keep-list with no keep patterns is nothing to import - return None - return include, exclude - - -def solkeep_is_order_sensitive(path: Path) -> bool: - """True if `path`'s rules can't be split into include/exclude faithfully. - - `~/.solkeep` is gitignore *last-match-wins*; the config `[keep]` block is - `include AND NOT exclude` (see `KeepRules.matches`). The two agree only when - every `!` carve-out comes *after* the positive rules it carves. 
A positive - rule appearing *after* a `!` line is an order-dependent re-include that the - split into separate include/exclude lists silently drops — so - `solx config import-solkeep` warns when it detects one rather than quietly - keeping fewer directories. - """ - try: - lines = path.read_text().splitlines() - except OSError: - return False - seen_carve = False - for raw in lines: - s = raw.strip() - if not s or s.startswith("#"): - continue - if s.startswith("!"): - seen_carve = True - elif seen_carve: - return True - return False - - -def starter_config_text( - keep: tuple[list[str], list[str]] | None = None, - default_shell: str = "bash", -) -> str: - """The text that `solx init` writes to a fresh config.toml. - - With no `keep`, the `[keep]` block is a commented placeholder using the - `sparky` placeholder (no maintainer name baked in). When `keep` is given - (imported from `~/.solkeep` via `import_solkeep`), an active `[keep]` block - is written instead. `default_shell` sets the `default_shell` value (the - `solx init` walkthrough can pick it). - """ - base = _STARTER_CONFIG_BASE.replace( - 'default_shell = "bash"', f'default_shell = {_toml_str(default_shell)}' - ) - block = _render_keep_block(*keep) if keep else _KEEP_PLACEHOLDER - return base + block - - -def _toml_str(s: str) -> str: - """Render `s` as a TOML basic string, escaping every char TOML forbids. - - Besides backslash and double-quote, control characters (other than tab) are - illegal in a TOML basic string and must be `\\uXXXX`-escaped — otherwise a - keep pattern carrying a stray control byte would render an unparseable - config. tab is emitted as `\\t`. 
- """ - out = ['"'] - for ch in s: - if ch == "\\": - out.append("\\\\") - elif ch == '"': - out.append('\\"') - elif ch == "\t": - out.append("\\t") - elif ch < " " or ch == "\x7f": - out.append(f"\\u{ord(ch):04x}") - else: - out.append(ch) - out.append('"') - return "".join(out) - - -def render_keep_block( - include: list[str], exclude: list[str], *, source: str = "~/.solkeep" -) -> str: - """Public: render a `[keep]` TOML block from include/exclude pattern lists. - - Used by `solx config import-solkeep` to append a migrated keep-list to an - existing config.toml. `source` names where the patterns came from, for the - provenance comment (the command passes the actual keep-list path). - """ - return _render_keep_block(include, exclude, source=source) - - -def _render_keep_block( - include: list[str], exclude: list[str], *, source: str = "~/.solkeep" -) -> str: - lines = [ - f"# [keep] imported from {source} — directories `solx keep` renews", - "# when Sol flags them. Patterns are gitignore-style (** for recursion).", - "[keep]", - "include = [", - *(f" {_toml_str(p)}," for p in include), - "]", - ] - if exclude: - lines += ["exclude = [", *(f" {_toml_str(p)}," for p in exclude), "]"] - return "\n".join(lines) + "\n" - - -_STARTER_CONFIG_BASE = """\ -# solx config — see https://github.com/Shu-Wan/solx/blob/main/solx/README.md -# -# Used by `solx job jump` when dropping into a shell on a compute node. -default_shell = "bash" - -# Default template for `solx job start` when invoked without an argument. -default_template = "default" - -# Cap on how long `solx job start` waits for the queue. CLI flag --timeout -# overrides per-run. -start_timeout = "10m" - - -# Job templates. Run `solx job start ` to allocate one. -# Each table is self-contained; repeat flags across templates if needed. 
- -[jobs.default] -partition = "lightwork" -time = "1-0" -qos = "public" - -[jobs.debug] -partition = "htc" -time = "0-1" - - -""" - -_KEEP_PLACEHOLDER = """\ -# Scratch paths to keep alive when Sol flags them in a warning CSV -# *and* `solx keep` runs. Replace `sparky` with your ASURITE. -# Patterns use gitignore-style globs (** for recursion). -# Uncomment + edit to enable: -# -# [keep] -# include = ["/scratch/sparky/your-project", "/scratch/sparky/experiments/**"] -# exclude = ["**/__pycache__", "**/.venv"] -""" diff --git a/solx/src/solx/init.py b/solx/src/solx/init.py deleted file mode 100644 index 321cade..0000000 --- a/solx/src/solx/init.py +++ /dev/null @@ -1,220 +0,0 @@ -"""`solx init` — write a starter `config.toml`.""" -from __future__ import annotations - -import os -import stat -from pathlib import Path -from typing import Callable - -try: - import tomllib # Python 3.11+ -except ModuleNotFoundError: # Python 3.10 — backport - import tomli as tomllib - -from solx import config as cfg -from solx.output import Out - - -SHELLS = ("bash", "zsh", "fish") - - -def _default_walkthrough(out: Out, solkeep: Path | None) -> dict | None: - """Interactive first-run walkthrough. Returns answers, or None if declined. - - Steps (more can be added later): optionally import an existing `~/.solkeep` - into `[keep]`, then pick the login shell `solx job jump` opens. Returns - ``{"shell": str, "keep": (include, exclude) | None}``. - """ - from rich.prompt import Confirm, Prompt # lazy: interactive walkthrough only - - if not Confirm.ask("Walk through a quick setup?", default=False): - return None - - # Step 1 — shell (a real choice, so the walkthrough doesn't open with two - # yes/no questions in a row). - out.status("\n[bold]Step 1 — shell[/]") - shell = Prompt.ask( - "Which shell should `solx job jump` open on the compute node?", - choices=list(SHELLS), - default="bash", - ) - - # Step 2 — scratch keep-list (only when there's a ~/.solkeep to offer). 
- keep = None - candidate = cfg.import_solkeep(solkeep) if solkeep is not None else None - if candidate is not None: - inc, exc = candidate - out.status( - f"\n[bold]Step 2 — scratch keep-list[/] " - f"({solkeep}: {len(inc)} include / {len(exc)} exclude)" - ) - if Confirm.ask("Import it into \\[keep]?", default=True): # \\[ escapes markup - keep = candidate - - return {"shell": shell, "keep": keep} - - -def cmd_init( - *, - path: Path | None = None, - force: bool = False, - solkeep: Path | None = None, - out: Out | None = None, - confirm_fn: Callable[..., bool] | None = None, - walkthrough_fn: Callable[[Out, Path | None], dict | None] | None = None, -) -> int: - out = out or Out.auto() - p = path or cfg.config_path() - - if p.exists() and not force: - # Never block on the overwrite prompt in a non-interactive session. - if not out.interactive: - out.error(f"[red]error:[/] {p} already exists. pass -f to overwrite.") - return 2 - ask = confirm_fn - if ask is None: - from rich.prompt import Confirm # lazy: only the prompt path needs rich - - ask = Confirm.ask - if not ask(f"{p} already exists. Overwrite?", default=False): - out.status("[dim]aborted[/]") - return 1 - - # Optional interactive walkthrough — skipped entirely in a non-interactive - # session (an agent/cron just gets the defaults, never a hung prompt). The - # `~/.solkeep` import is one of its prompted steps; importing is convenience - # only — `solx keep` reads `~/.solkeep` at runtime regardless. - imported = None - default_shell = "bash" - if out.interactive: - result = (walkthrough_fn or _default_walkthrough)(out, solkeep) - if result: - default_shell = result.get("shell") or "bash" - imported = result.get("keep") - - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(cfg.starter_config_text(keep=imported, default_shell=default_shell)) - # Mode 0600 — config may eventually contain user-specific paths or - # mail-user etc.; keep it readable only by the owner. 
- os.chmod(p, stat.S_IRUSR | stat.S_IWUSR) - - if imported is not None: - inc, exc = imported - out.status( - f"[green]imported[/] {len(inc)} include / {len(exc)} exclude " - "pattern(s) into \\[keep]" - ) - out.status("[dim]edit it with `solx config edit`, then `solx job start`.[/]") - out.emit(data={"wrote": str(p)}, human=lambda: f"[green]wrote[/] {p}") - return 0 - - -def cmd_import_solkeep( - *, - path: Path | None = None, - solkeep: Path | None = None, - force: bool = False, - out: Out | None = None, -) -> int: - """Migrate a legacy `~/.solkeep` keep-list into the config's `[keep]` block. - - The implicit `~/.solkeep` fallback (and the `.solkeep` format) is - deprecated and loses support in a future release (see - `keep.SOLKEEP_REMOVED_IN`); this is the one-shot migration. Reads `solkeep` - (default `~/.solkeep`), splits it into include/exclude via `import_solkeep`, - and appends a rendered `[keep]` block to an existing `config.toml`. The - merged document is validated before anything is written, so a pattern that - can't round-trip through TOML never leaves a corrupt config on disk. - Refuses if the config already has an active `[keep]` table — a second one - is invalid TOML, so the user must merge by hand there. - - `.solkeep` is gitignore last-match-wins while `[keep]` is - include-minus-exclude, so an order-dependent re-include (a positive rule - under an earlier `!` carve-out) can't be preserved — the split would renew - *fewer* directories, and since `[keep]` then takes precedence over - `~/.solkeep` (see `keep.cmd_keep`), keeping the old file does not preserve - the prior behavior. Such a **lossy** import is **refused** unless `force` - is set, so the semantic change is never silent. - """ - out = out or Out.auto() - p = path or cfg.config_path() - src = solkeep or (Path.home() / ".solkeep") - - if not p.exists(): - out.error( - f"[red]error:[/] no config at {p}. run `solx init` first, then re-run this." 
- ) - return 2 - - imported = cfg.import_solkeep(src) - if imported is None: - out.error( - f"[red]error:[/] nothing to import from {src} (missing or no patterns)." - ) - return 2 - include, exclude = imported - - try: - existing = cfg.load(p) - except cfg.ConfigError as e: - out.error(f"[red]error:[/] {e}") - return 2 - if existing.keep is not None: - out.error( - r"[red]error:[/] config already has a \[keep] block. merge the " - "patterns by hand with `solx config edit` (a second \\[keep] table " - "would be invalid TOML)." - ) - return 2 - - # A lossy migration (order-dependent re-include) changes which directories - # get renewed and can't be undone by keeping ~/.solkeep, since [keep] wins. - # Refuse it unless the user explicitly accepts with -f, so nothing is - # silently written. - lossy = cfg.solkeep_is_order_sensitive(src) - if lossy and not force: - out.error( - rf"[red]error:[/] {src} re-includes a path under an earlier `!` " - r"carve-out. A \[keep] block (include minus exclude) can't preserve " - "that ordering, so the migration would renew FEWER directories — and " - r"\[keep] then takes precedence over ~/.solkeep, so keeping the old " - "file won't preserve current behavior. Compare `solx keep --dry-run` " - "before and after, then re-run with -f to accept the change (or edit " - "the config by hand)." - ) - return 2 - - block = cfg.render_keep_block(include, exclude, source=str(src)) - # Validate the merged document before touching the file: a pattern that - # can't round-trip through TOML must never leave a corrupt config on disk. - new_text = p.read_text(encoding="utf-8").rstrip("\n") + "\n\n" + block - try: - tomllib.loads(new_text) - except tomllib.TOMLDecodeError as e: - out.error( - f"[red]error:[/] importing these patterns would produce invalid TOML " - f"({e}); config left unchanged. Fix {src} or run `solx config edit`." 
- ) - return 1 - p.write_text(new_text, encoding="utf-8") - - out.status( - f"[green]imported[/] {len(include)} include / {len(exclude)} exclude " - r"pattern(s) into \[keep]" - ) - if lossy: # only reachable with -f - out.status( - r"[yellow]warning:[/] ordering not preserved (re-include under a `!` " - "carve-out) — verify with `solx keep --dry-run` against the old " - f"{src} and adjust the \\[keep] block if it renews too little." - ) - else: - out.status( - "[dim]review with `solx config show`, then verify with " - "`solx keep --dry-run` before removing the old keep-list.[/]" - ) - out.emit( - data={"config": str(p), "include": include, "exclude": exclude}, - human=lambda: f"[green]wrote[/] \\[keep] → {p}", - ) - return 0 diff --git a/solx/src/solx/jobs.py b/solx/src/solx/jobs.py deleted file mode 100644 index 54b625e..0000000 --- a/solx/src/solx/jobs.py +++ /dev/null @@ -1,305 +0,0 @@ -"""`solx job` subcommands: list, start, stop, jump, time. - -Output obeys `solx.output.Out`: JSON on a non-TTY stdout, Rich tables on a -TTY, all diagnostics on stderr. Jobid resolution is verb-aware (see -`solx.slurm.resolve_jobid`): read/attach verbs auto-pick the most recent job, -the destructive `stop` never does, and acting from inside an allocation -carries a nesting / self-cancel guard. 
-""" -from __future__ import annotations - -import os -import shlex -from dataclasses import asdict -from typing import Iterable - -from solx import slurm -from solx.config import Config, ConfigError -from solx.output import Out -from solx.slurm import Job, SlurmError - - -# --- shared rendering ----------------------------------------------------- - - -def _jobs_table(jobs: Iterable[Job]): - from rich.table import Table # lazy: only the human-render path needs rich - - t = Table(title=None, show_lines=False, header_style="bold") - for col in ("JOBID", "NAME", "STATE", "TIME", "LEFT", "PARTITION", "NODE / REASON"): - t.add_column(col) - for j in jobs: - t.add_row( - j.job_id, j.name, j.state, j.time_used, j.time_left, - j.partition, j.node_list, - ) - return t - - -def _jobs_payload(jobs: Iterable[Job]) -> list[dict]: - return [asdict(j) for j in jobs] - - -def _print_candidates(out: Out, jobs: Iterable[Job], reason: str) -> None: - """Surface a candidate set for a verb that won't auto-pick (stop).""" - jobs = list(jobs) - if out.json_mode: - out.json({"error": reason, "jobs": _jobs_payload(jobs)}) - else: - out.error(f"[yellow]{reason} — specify a JOBID:[/]") - out.stderr.print(_jobs_table(jobs)) - - -# --- list ----------------------------------------------------------------- - - -def cmd_list(*, runner: slurm.Runner = slurm.real_runner, out: Out | None = None) -> int: - out = out or Out.auto() - try: - jobs = slurm.squeue_user_jobs(runner=runner) - except SlurmError as e: - out.error(f"[red]error:[/] {e}") - return 1 - out.emit( - data=_jobs_payload(jobs), - human=lambda: _jobs_table(jobs) if jobs else "[dim]no jobs in queue[/]", - ) - return 0 - - -# --- start ---------------------------------------------------------------- - - -def cmd_start( - *, - config: Config, - template_name: str | None, - dry_run: bool, - timeout_override: int | None, - passthrough: list[str], - salloc_runner: slurm.Runner | None = None, - out: Out | None = None, -) -> int: - out = out 
or Out.auto() - name = template_name or config.default_template - try: - template = config.template(name) - except ConfigError as e: - out.error(f"[red]error:[/] {e}") - return 1 - - argv = slurm.salloc_argv(template, passthrough=passthrough) - - if dry_run: - out.status("[bold]dry-run — would run:[/]") - out.emit( - data={"dry_run": True, "template": name, "argv": argv}, - human=lambda: f" {shlex.join(argv)}", - ) - return 0 - - timeout = timeout_override or config.start_timeout_seconds - out.status(f"[dim]submitting:[/] {shlex.join(argv)}") - out.status( - f"[dim]waiting up to {timeout}s for the queue to grant the allocation…[/]" - ) - try: - jobid = slurm.run_salloc(argv, timeout_seconds=timeout, runner=salloc_runner) - except SlurmError as e: - out.error(f"[red]error:[/] {e}") - return 1 - - out.status(f"[green]allocated job[/] [bold]{jobid}[/]") - out.status( - f"[dim]attach:[/] solx job jump {jobid} " - f"[dim](or: srun --jobid={jobid} --overlap --pty {config.default_shell})[/]" - ) - if out.json_mode: - out.json({"jobid": jobid, "template": name}) - return 0 - - -# --- stop ----------------------------------------------------------------- - - -def cmd_stop( - *, - jobid_arg: str | None, - yes: bool, - dry_run: bool, - runner: slurm.Runner = slurm.real_runner, - out: Out | None = None, - confirm_fn=None, -) -> int: - out = out or Out.auto() - if yes and dry_run: - out.error("[red]error:[/] --yes and --dry-run are mutually exclusive") - return 2 - - try: - res = slurm.resolve_jobid(jobid_arg, verb=slurm.VERB_STOP, runner=runner) - except SlurmError as e: - out.error(f"[red]error:[/] {e}") - return 1 - if res.error: - out.error(f"[red]error:[/] {res.error}") - return 1 - if res.ambiguous: - _print_candidates(out, res.candidates, "multiple jobs running") - return 2 - - jid = res.job_id - argv = slurm.scancel_argv(jid) - - # Acting on the job you're sitting inside ends this session — surface it - # in every path, including a dry-run preview, so the resolver's 
decision is - # never a surprise. - self_cancel = res.acting_on_current - if self_cancel: - out.status( - f"[yellow]warning:[/] job {jid} is the allocation you're inside " - "($SLURM_JOB_ID); cancelling it will end this session." - ) - - if dry_run: - out.status("[bold]dry-run — would run:[/]") - out.emit( - data={ - "dry_run": True, - "jobid": jid, - "argv": argv, - "inside_allocation": self_cancel, - }, - human=lambda: f" {shlex.join(argv)}", - ) - return 0 - - if not yes: - if not out.interactive: - out.error( - "[red]error:[/] non-interactive session — pass -y to cancel " - f"job {jid}, or -n to preview." - ) - return 2 - ask = confirm_fn - if ask is None: - from rich.prompt import Confirm # lazy: only the prompt path needs rich - - ask = Confirm.ask - prompt = ( - f"Cancel job {jid} (the one you're inside)?" - if self_cancel - else f"Cancel job {jid}?" - ) - if not ask(prompt, default=False): - out.status("[dim]aborted[/]") - return 1 - - code, _, err = runner(argv) - if code != 0: - out.error(f"[red]scancel failed:[/] {err.strip()}") - return 1 - out.status(f"[green]cancelled[/] job {jid}") - if out.json_mode: - out.json({"cancelled": jid}) - return 0 - - -# --- jump ----------------------------------------------------------------- - - -def cmd_jump( - *, - config: Config, - jobid_arg: str | None, - quiet: bool = False, - runner: slurm.Runner = slurm.real_runner, - exec_fn=None, - out: Out | None = None, -) -> int: - """Drop the user into a shell on the job's compute node. - - Exec-replaces the current process with `srun --pty` so the user's shell - history and signal handling are clean. Tests inject `exec_fn` to capture - argv without exec'ing. - - Nesting heads-up: attaching from *inside* an allocation ($SLURM_JOB_ID set) - spawns a nested step. Unlike `stop`, attach is non-destructive and - Ctrl-D-recoverable, so we WARN-AND-PROCEED (not refuse) — `-q/--quiet` - silences the heads-up. 
- """ - out = out or Out.auto() - try: - res = slurm.resolve_jobid(jobid_arg, verb=slurm.VERB_JUMP, runner=runner) - except SlurmError as e: - out.error(f"[red]error:[/] {e}") - return 1 - if res.error: - out.error(f"[red]error:[/] {res.error}") - return 1 - - if not quiet: - if res.acting_on_current: - out.status( - f"[yellow]already inside job {res.inside_job_id}[/] — opening a " - "nested srun step here burns extra resources. `exit` to leave, " - "or pass another JOBID. Attaching anyway." - ) - elif res.inside: - out.status( - f"[yellow]nesting:[/] you're inside job {res.inside_job_id}; " - f"attaching to job {res.job_id} opens a step on another " - "allocation. Proceeding." - ) - if res.source == "most-recent": - out.status( - f"[dim]multiple running jobs; attaching to most recent " - f"{res.job_id} (pass JOBID to choose another)[/]" - ) - - jid = res.job_id - argv = slurm.srun_pty_argv(jid, config.default_shell) - if exec_fn is not None: - exec_fn(argv) - return 0 - - os.execvp(argv[0], argv) - return 0 # unreachable - - -# --- time ----------------------------------------------------------------- - - -def cmd_time( - *, - jobid_arg: str | None, - runner: slurm.Runner = slurm.real_runner, - out: Out | None = None, -) -> int: - out = out or Out.auto() - try: - res = slurm.resolve_jobid(jobid_arg, verb=slurm.VERB_TIME, runner=runner) - except SlurmError as e: - out.error(f"[red]error:[/] {e}") - return 1 - if res.error: - out.error(f"[red]error:[/] {res.error}") - return 1 - if res.source == "most-recent": - out.status( - f"[dim]multiple jobs; showing most recent {res.job_id} " - "(pass JOBID to choose another)[/]" - ) - - jid = res.job_id - argv = slurm.squeue_time_left_argv(jid) - code, out_text, err = runner(argv) - if code != 0 or not out_text.strip(): - out.error( - f"[red]squeue failed for jobid {jid}:[/] " - f"{err.strip() or '(empty output)'}" - ) - return 1 - time_left = out_text.strip() - out.emit(data={"jobid": jid, "time_left": time_left}, 
human=lambda: time_left) - return 0 diff --git a/solx/src/solx/keep.py b/solx/src/solx/keep.py deleted file mode 100644 index eb43cb7..0000000 --- a/solx/src/solx/keep.py +++ /dev/null @@ -1,505 +0,0 @@ -"""`solx keep` — renew scratch files Sol has flagged, filtered by `[keep]`. - -Read Sol's warning CSVs from `--csv-dir`, intersect the flagged directories -with the `[keep]` include/exclude globs from config (via `pathspec`), and -`touch -a -m -c` only the intersection. Preserves the original tool's "only -renew what Sol has explicitly flagged" ethical posture — we never walk -`/scratch` wholesale. - -Execution is file-level-sharded (PR #18): a bounded streaming pipeline over -one worker pool — enumerate a kept directory, split its files into evenly-sized -batches, and `touch` the batches across the pool. A single huge directory -fans out into many batches, so `-j` scales the parallelism of the whole run -including its largest directory, not just the count of directories. -Enumeration uses `fd` (or `rg`) when on `PATH` — both walk a tree -multithreaded — and `find` otherwise. - -This is metadata-heavy NFS I/O. On Sol run it on a compute node or the DTN -(`ssh soldtn`), not a throttled login node. -""" -from __future__ import annotations - -import csv -import json -import os -import shutil -import subprocess -import tempfile -from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait -from dataclasses import dataclass, field -from pathlib import Path -from typing import Callable - -from solx.config import Config, KeepRules, load_solkeep -from solx.output import Out - - -STAGE_FILES = { - "pending": "scratch-dirs-pending-removal.csv", - "over90": "scratch-dirs-over-90days.csv", - "inactive": "scratch-dirs-inactive.csv", -} -STAGE_ORDER = ("pending", "over90", "inactive") -STAGES_ALL = "all" - -# ~/.solkeep is the legacy keep-list the standalone sol_renew.py used. 
solx keep -# still reads it as a last-resort fallback, but the config [keep] block is the -# supported home; the implicit fallback and the .solkeep format stay supported -# through the 0.x line and are removed in the version below. -SOLKEEP_REMOVED_IN = "1.0.0" - -# Files per touch shard. Big enough that per-batch subprocess overhead is -# negligible, small enough that one huge directory fans out into many batches -# and keeps every worker busy. xargs re-splits each batch into `touch` calls -# of 500 internally. -BATCH = 2000 - -# Cap on how many dirs we inline into a JSON payload. Sol's warning CSVs can -# list thousands of flagged dirs; emitting them all makes a multi-megabyte -# document that blows an agent's context. We cap the inlined sample and always -# report the true totals + a `*_truncated` flag (agent-native principle #5: -# bounded responses). Counts are always exact; the lists are a sample. -JSON_LIST_CAP = 100 - - -@dataclass(frozen=True) -class Plan: - """The directories `solx keep` would touch (`kept`) vs filter out (`skipped`).""" - - kept: list[tuple[str, str]] = field(default_factory=list) - skipped: list[tuple[str, str]] = field(default_factory=list) - - @property - def empty(self) -> bool: - return not self.kept and not self.skipped - - -# --- planning ------------------------------------------------------------- - - -def load_csv_dirs(csv_path: Path) -> list[str]: - """Return the `Directory` column from one of Sol's warning CSVs. - - A missing file is fine — Sol only drops the CSV when there's something to - flag. An empty result means nothing to do for that stage. 
- """ - if not csv_path.exists(): - return [] - dirs: list[str] = [] - with csv_path.open(newline="") as fh: - reader = csv.DictReader(fh) - for row in reader: - d = (row.get("Directory") or "").strip() - if d: - dirs.append(d) - return dirs - - -def build_plan(csv_dir: Path, stages: list[str], keep: KeepRules) -> Plan: - """Walk the chosen stages' CSVs and split flagged dirs into kept/skipped.""" - kept: list[tuple[str, str]] = [] - skipped: list[tuple[str, str]] = [] - seen: set[str] = set() - for stage in stages: - for d in load_csv_dirs(csv_dir / STAGE_FILES[stage]): - if d in seen: - continue - seen.add(d) - (kept if keep.matches(d) else skipped).append((stage, d)) - return Plan(kept=kept, skipped=skipped) - - -# --- enumeration + touching ---------------------------------------------- -# -# Two task kinds run on one worker pool: -# enumerate_dir -- walk a kept directory, return its files -# touch_files -- `touch -a -m -c` a batch of those files -# touch is the expensive half (one metadata write per file), so it is sharded -# into file batches and spread across the pool. Paths are kept as bytes -# end-to-end so a non-UTF-8 filename can't crash the run. - - -def _pick_lister() -> tuple[str, str]: - """Choose the fastest available file lister: (kind, binary path). - - `fd` and `rg` walk a directory tree multithreaded, faster than `find` on a - large directory; `find` is the always-present fallback. - - The `--hidden --no-ignore` flags are LOAD-BEARING, not cosmetic: both fd - and rg skip dotfiles and honor .gitignore/.fdignore/global-ignore by - default, so without them a renewal would silently skip hidden and - git-ignored files and under-protect them. With both flags, each matches - `find -type f`. Detection is via `shutil.which`, so a shell alias/function - named `rg` (e.g. Claude Code's bundled ripgrep shim) is ignored — only a - real PATH binary is used. 
- """ - for name in ("fd", "fdfind"): # fdfind = the binary name on Debian/Ubuntu - binary = shutil.which(name) - if binary: - return ("fd", binary) - binary = shutil.which("rg") - if binary: - return ("rg", binary) - return ("find", "find") - - -# Resolved once at import; ProcessPoolExecutor workers inherit it (fork) or -# recompute it cheaply (spawn). -LISTER_KIND, LISTER_BIN = _pick_lister() - - -def enumerate_dir(directory: str) -> tuple[str, list[bytes], str]: - """List every regular file under `directory` in one walk. - - Returns (directory, file_paths, message). A path that isn't a directory - (e.g. flagged then removed) is reported as a benign skip, not an error. - """ - if not os.path.isdir(directory): - return (directory, [], "skipped: not a directory") - - if LISTER_KIND == "fd": - argv = [LISTER_BIN, "--hidden", "--no-ignore", "--type", "f", - "--print0", "--search-path", directory] - elif LISTER_KIND == "rg": - argv = [LISTER_BIN, "--files", "--hidden", "--no-ignore", "--null", - directory] - else: - argv = ["find", directory, "-type", "f", "-print0"] - - try: - proc = subprocess.run(argv, capture_output=True, check=False) - except Exception as e: # noqa: BLE001 - return (directory, [], f"exec failed: {e}") - - # rg exits 1 when it lists no files -- that's an empty (but valid) - # directory, not an error. fd/find return 0 in that case; for all three a - # genuinely bad walk (permission, I/O) is rg>=2 / fd!=0 / find!=0. - empty_ok = LISTER_KIND == "rg" and proc.returncode == 1 and not proc.stdout - if proc.returncode != 0 and not empty_ok: - err = proc.stderr.decode("utf-8", "replace").strip().splitlines() - return (directory, [], err[-1] if err else f"{LISTER_KIND}: nonzero exit") - files = [p for p in proc.stdout.split(b"\0") if p] - return (directory, files, "ok") - - -def touch_files(paths: list[bytes]) -> tuple[int, int, str]: - """`touch -a -m -c` a batch of files in one xargs pass. - - Returns (files_attempted, errors, message). 
`touch -c` never creates a - file and exits 0 on a path that no longer exists, so a file deleted - between enumeration and touch is silently skipped, not an error. A nonzero - exit means a real failure (permission, I/O), which we surface. - """ - if not paths: - return (0, 0, "ok") - - data = b"\0".join(paths) + b"\0" - try: - proc = subprocess.run( - ["xargs", "-0", "-r", "-n", "500", "touch", "-a", "-m", "-c", "--"], - input=data, - capture_output=True, - check=False, - ) - except Exception as e: # noqa: BLE001 - return (len(paths), 1, f"exec failed: {e}") - - if proc.returncode != 0: - err = proc.stderr.decode("utf-8", "replace").strip().splitlines() - return (len(paths), 1, err[-1] if err else "touch: nonzero exit") - return (len(paths), 0, "ok") - - -def shard(files: list[bytes], batch_size: int = BATCH) -> list[list[bytes]]: - """Split a flat file list into evenly-sized batches for the touch pool.""" - return [files[i : i + batch_size] for i in range(0, len(files), batch_size)] - - -# --- command -------------------------------------------------------------- - - -def cmd_keep( - *, - config: Config | None, - csv_dir: Path | None, - stage: str, - jobs_n: int, - yes: bool, - dry_run: bool, - verbose: bool, - solkeep: Path | None = None, - out: Out | None = None, - confirm_fn: Callable[..., bool] | None = None, - execute_fn: Callable[..., tuple[int, int]] | None = None, -) -> int: - out = out or Out.auto() - - if yes and dry_run: - out.error("[red]error:[/] --yes and --dry-run are mutually exclusive") - return 2 - - # Keep-list source, in precedence order: explicit --solkeep > config - # [keep] > the skill's ~/.solkeep (so an existing .solkeep just works). 
- if solkeep is not None: - keep_rules = load_solkeep(solkeep) - if keep_rules is None: - out.error(f"[red]error:[/] no keep rules found in {solkeep}") - return 2 - elif config is not None and config.keep is not None: - keep_rules = config.keep - else: - keep_rules = load_solkeep(Path.home() / ".solkeep") - if keep_rules is None: - out.error( - r"[red]error:[/] no \[keep] block in config and no ~/.solkeep. " - r"run `solx config edit` to add a \[keep] block." - ) - return 2 - # The .solkeep fallback is deprecated — nudge migration into [keep]. - out.status( - f"[yellow]deprecated:[/] reading the keep-list from ~/.solkeep is " - f"deprecated and loses support in solx {SOLKEEP_REMOVED_IN}. " - r"migrate it into your config's \[keep] block: solx config import-solkeep" - ) - - csv_dir = csv_dir or Path.home() - if not csv_dir.is_dir(): - out.error( - f"[red]error:[/] --csv-dir {csv_dir} is not a directory " - "(Sol drops the warning CSVs in $HOME)." - ) - return 2 - stages = list(STAGE_ORDER) if stage == STAGES_ALL else [stage] - - plan = build_plan(csv_dir, stages, keep_rules) - _report_plan(out, plan, csv_dir, stages, verbose) - - if not plan.kept: - if out.json_mode: - # Still emit a document so an agent gets structured output, not - # empty stdout, when nothing is flagged. - out.json(_plan_json(plan, csv_dir, stages, dry_run=dry_run)) - else: - out.status( - "[dim]no flagged directories matched [keep] — nothing to do.[/]" - ) - return 0 - - if dry_run: - if out.json_mode: - out.json(_plan_json(plan, csv_dir, stages, dry_run=True)) - return 0 - - if not yes: - # Destructive: never block on a prompt in a non-interactive session. - if not out.interactive: - out.error( - "[red]error:[/] non-interactive session — pass -y to renew " - f"{len(plan.kept)} directories, or -n to preview." 
- ) - return 2 - ask = confirm_fn - if ask is None: - from rich.prompt import Confirm # lazy: only the prompt path needs rich - - ask = Confirm.ask - if not ask( - f"Touch mtimes on {len(plan.kept)} directories?", default=False - ): - out.status("[dim]aborted[/]") - return 1 - - run = execute_fn or _execute - total_files, failures = run(plan, jobs_n, out) - - if out.json_mode: - summary = { - "renewed": True, - "dirs": len(plan.kept), - "files_touched": total_files, - "failures": failures, - "kept_truncated": len(plan.kept) > JSON_LIST_CAP, - "kept": [d for _, d in plan.kept[:JSON_LIST_CAP]], - } - if summary["kept_truncated"]: - summary["full_plan_path"] = _dump_full_plan(plan, csv_dir, stages) - out.json(summary) - else: - out.status( - f"[green]done[/] {len(plan.kept)} dirs · " - f"{total_files} files touched" - + (f" · [red]{failures} failed[/]" if failures else "") - ) - return 1 if failures else 0 - - -def _report_plan( - out: Out, - plan: Plan, - csv_dir: Path, - stages: list[str], - verbose: bool, -) -> None: - """Print the plan summary to stderr (human) — stdout stays the data channel.""" - if out.json_mode: - return - out.status( - f"[dim]csv-dir:[/] {csv_dir} [dim]stages:[/] {', '.join(stages)}" - ) - out.status( - f"[bold]plan:[/] {len(plan.kept)} kept, {len(plan.skipped)} skipped" - ) - if len(plan.kept) > JSON_LIST_CAP or len(plan.skipped) > JSON_LIST_CAP: - path = _dump_full_plan(plan, csv_dir, stages) - out.status(f"[dim]full plan ({len(plan.kept) + len(plan.skipped)} dirs):[/] {path}") - if verbose: - if plan.kept: - out.status("[green]kept:[/]") - for stage_name, d in plan.kept[:20]: - out.status(f" [dim]{stage_name:>9}[/] {d}") - if len(plan.kept) > 20: - out.status(f" [dim]… and {len(plan.kept) - 20} more[/]") - if plan.skipped: - out.status( - r"[yellow]skipped[/] (flagged by Sol but not in \[keep]):" - ) - for stage_name, d in plan.skipped[:20]: - out.status(f" [dim]{stage_name:>9}[/] {d}") - - -def _plan_json(plan: Plan, csv_dir: Path, 
stages: list[str], *, dry_run: bool) -> dict: - """Bounded plan document: exact counts, a capped sample of each list. - - When either list is truncated, the COMPLETE plan is spilled to a temp file - and its path returned under ``full_plan_path`` — so the response stays small - enough for an agent's context while the full detail is one ``cat`` away. - """ - truncated = len(plan.kept) > JSON_LIST_CAP or len(plan.skipped) > JSON_LIST_CAP - doc = { - "dry_run": dry_run, - "csv_dir": str(csv_dir), - "stages": stages, - "kept_count": len(plan.kept), - "skipped_count": len(plan.skipped), - "kept_truncated": len(plan.kept) > JSON_LIST_CAP, - "skipped_truncated": len(plan.skipped) > JSON_LIST_CAP, - "kept": [{"stage": s, "dir": d} for s, d in plan.kept[:JSON_LIST_CAP]], - "skipped": [{"stage": s, "dir": d} for s, d in plan.skipped[:JSON_LIST_CAP]], - } - if truncated: - doc["full_plan_path"] = _dump_full_plan(plan, csv_dir, stages) - return doc - - -def _dump_full_plan(plan: Plan, csv_dir: Path, stages: list[str]) -> str: - """Write the complete (untruncated) plan to a temp file; return its path.""" - fd, path = tempfile.mkstemp(prefix="solx-keep-plan-", suffix=".json") - with os.fdopen(fd, "w") as fh: - json.dump( - { - "csv_dir": str(csv_dir), - "stages": stages, - "kept": [{"stage": s, "dir": d} for s, d in plan.kept], - "skipped": [{"stage": s, "dir": d} for s, d in plan.skipped], - }, - fh, - indent=2, - ) - return path - - -def _execute( - plan: Plan, - jobs_n: int, - out: Out, - *, - enumerate_fn: Callable[[str], tuple[str, list[bytes], str]] | None = None, - touch_fn: Callable[[list[bytes]], tuple[int, int, str]] | None = None, -) -> tuple[int, int]: - """Renew `plan.kept` as a bounded streaming pipeline. Returns (files, failures). - - With ``jobs_n <= 1`` runs serially (no process pool — fast and deterministic - for tests and small runs). 
Otherwise one worker pool runs both halves: - enumerate a directory, shard its files, submit the batches as `touch` tasks, - and top up enumeration only while the in-flight set has room. The bounded - window keeps peak memory a small multiple of `jobs_n` batches and lets a - single huge directory spread its batches over every worker. - """ - enumerate_fn = enumerate_fn or enumerate_dir - touch_fn = touch_fn or touch_files - dirs = [d for _, d in plan.kept] - total_files = 0 - enum_fail = touch_fail = 0 - - if jobs_n <= 1: - for d in dirs: - try: - _, files, msg = enumerate_fn(d) - except Exception as e: # noqa: BLE001 — never let one dir abort the run - enum_fail += 1 - out.error(f"[red]FAIL[/] enumerate {d} :: {e}") - continue - if msg != "ok" and not msg.startswith("skipped"): - enum_fail += 1 - out.error(f"[red]FAIL[/] enumerate {d} :: {msg}") - continue - for batch in shard(files): - try: - n, errs, tmsg = touch_fn(batch) - except Exception as e: # noqa: BLE001 - touch_fail += 1 - out.error(f"[red]FAIL[/] touch {d} :: {e}") - continue - total_files += n - if errs: - touch_fail += 1 - out.error(f"[red]FAIL[/] touch {d} :: {tmsg}") - if msg == "ok" and not out.json_mode: - out.status(f" [dim]ok[/] {len(files):>7d} files {d}") - return total_files, enum_fail + touch_fail - - # Parallel: bounded streaming window over one pool. - window = max(2 * jobs_n, jobs_n + 8) - pending: dict = {} - di = iter(dirs) - - with ProcessPoolExecutor(max_workers=jobs_n) as pool: - - def fill() -> None: - while len(pending) < window: - d = next(di, None) - if d is None: - return - pending[pool.submit(enumerate_fn, d)] = ("enum", d) - - fill() - while pending: - done, _ = wait(pending, return_when=FIRST_COMPLETED) - for fut in done: - kind, d = pending.pop(fut) - if kind == "enum": - try: - _, files, msg = fut.result() - except Exception as e: # noqa: BLE001 — e.g. 
BrokenProcessPool - enum_fail += 1 - out.error(f"[red]FAIL[/] enumerate {d} :: {e}") - continue - if msg == "ok": - for batch in shard(files): - pending[pool.submit(touch_fn, batch)] = ("touch", d) - elif not msg.startswith("skipped"): - enum_fail += 1 - out.error(f"[red]FAIL[/] enumerate {d} :: {msg}") - else: # touch batch - try: - n, errs, tmsg = fut.result() - except Exception as e: # noqa: BLE001 - touch_fail += 1 - out.error(f"[red]FAIL[/] touch {d} :: {e}") - continue - total_files += n - if errs: - touch_fail += 1 - out.error(f"[red]FAIL[/] touch {d} :: {tmsg}") - fill() - return total_files, enum_fail + touch_fail diff --git a/solx/src/solx/main.py b/solx/src/solx/main.py deleted file mode 100644 index 9ef886e..0000000 --- a/solx/src/solx/main.py +++ /dev/null @@ -1,773 +0,0 @@ -"""Command-line entry point for `solx`. - -Surface (see docs/solx.md): - - solx init - solx job list (alias `ls`; group also reachable as `jobs`) - solx job start [TEMPLATE] - solx job stop [JOBID] - solx job jump [JOBID] [-q] (also `solx jump`) - solx job time [JOBID] - solx keep [--stage S] [--csv-dir D] [-j N] [-y] [-n] [-v] - solx config show [--json] - solx config edit - solx config import-solkeep (migrate ~/.solkeep into [keep]) - solx completions - solx version (alias of --version) - solx help (alias of --help) - -Global output flag: `--json` forces JSON; by default output auto-detects -(Rich tables on a terminal, JSON when stdout is not a TTY). See `solx.output`. -Every output-producing leaf subcommand also accepts a trailing `--json`. -After `job start`, a `--json` belongs to the salloc passthrough; `config -edit`, `completions`, `version`, and `help` take no `--json` at all. -""" -from __future__ import annotations - -import os -import sys - -from solx import __version__ - -# Recognized by type checkers like typing.TYPE_CHECKING, without importing -# `typing` at runtime. 
-TYPE_CHECKING = False -if TYPE_CHECKING: - import argparse - - from solx.output import Out - -# solx's home lives on NFS, where every module import is a network round-trip, -# so every invocation pays for whatever this module pulls in. Importing this -# module loads nothing beyond what the interpreter already has: argparse and -# pathlib are imported when the parser tree is built (so the `--version` / -# `version` fast path in `main()` skips them entirely), and command -# implementations (with their rich/pathspec dependency trees) are imported -# inside the handlers below. - -_JSON_HELP = "Force JSON output (machine-readable)." - - -# --- helpers ---------------------------------------------------------------- - - -def _require_sol() -> None: - from solx.side import require_sol - - require_sol() - - -def _out(json_flag: bool) -> Out: - """Build the resolved output target for a command body.""" - from solx.output import Out - - return Out.auto(force="json" if json_flag else None) - - -def _json_flag(ns: argparse.Namespace) -> bool: - """Resolved --json: the root flag or the subcommand's trailing flag.""" - return bool(getattr(ns, "json_root", False) or getattr(ns, "json_leaf", False)) - - -def _load_or_exit(out: Out): - from solx import config as cfg - from solx.config import ConfigError - - try: - return cfg.load() - except ConfigError as e: - out.error(f"error: {e}") - raise SystemExit(2) - - -# --- command handlers ------------------------------------------------------- - - -def _cmd_init(ns: argparse.Namespace) -> None: - _require_sol() - from pathlib import Path - - from solx import init as init_mod - - # Auto-import an existing ~/.solkeep into the new config's [keep] block. 
- sys.exit( - init_mod.cmd_init( - force=ns.force, solkeep=Path.home() / ".solkeep", out=_out(_json_flag(ns)) - ) - ) - - -def _cmd_keep(ns: argparse.Namespace) -> None: - _require_sol() - from solx import config as cfg - from solx import keep as keep_mod - - out = _out(_json_flag(ns)) - valid_stages = {"all", *keep_mod.STAGE_ORDER} - if ns.stage not in valid_stages: - out.error( - f"invalid --stage {ns.stage!r}. choose from: {', '.join(sorted(valid_stages))}" - ) - sys.exit(2) - if ns.jobs_n < 1: - out.error(f"invalid --jobs {ns.jobs_n}. must be >= 1.") - sys.exit(2) - # `keep` can run off a `~/.solkeep` alone, so a missing config.toml is fine - # (config stays None). A config that exists but is malformed still errors. - config = _load_or_exit(out) if cfg.config_path().exists() else None - sys.exit( - keep_mod.cmd_keep( - config=config, - csv_dir=ns.csv_dir, - stage=ns.stage, - jobs_n=ns.jobs_n, - yes=ns.yes, - dry_run=ns.dry_run, - verbose=ns.verbose, - solkeep=ns.solkeep, - out=out, - ) - ) - - -def _cmd_jump(ns: argparse.Namespace) -> None: - _require_sol() - from solx import jobs as jobs_mod - - out = _out(_json_flag(ns)) - config = _load_or_exit(out) - sys.exit( - jobs_mod.cmd_jump(config=config, jobid_arg=ns.jobid, quiet=ns.quiet, out=out) - ) - - -def _cmd_job_list(ns: argparse.Namespace) -> None: - _require_sol() - from solx import jobs as jobs_mod - - sys.exit(jobs_mod.cmd_list(out=_out(_json_flag(ns)))) - - -def _cmd_job_stop(ns: argparse.Namespace) -> None: - _require_sol() - from solx import jobs as jobs_mod - - sys.exit( - jobs_mod.cmd_stop( - jobid_arg=ns.jobid, yes=ns.yes, dry_run=ns.dry_run, out=_out(_json_flag(ns)) - ) - ) - - -def _cmd_job_time(ns: argparse.Namespace) -> None: - _require_sol() - from solx import jobs as jobs_mod - - sys.exit(jobs_mod.cmd_time(jobid_arg=ns.jobid, out=_out(_json_flag(ns)))) - - -# Short flags `job start` recognizes ahead of `--`. A bundle of short flags -# (`-nn`) is consumed only when every letter is in this set. 
-_START_SHORTS = frozenset("n") - - -def _run_job_start( - json_flag: bool, - tail: list[str], - help_parser: argparse.ArgumentParser | None = None, -) -> None: - """Parse the `job start` tail and run the command. - - `job start` forwards unrecognized tokens to salloc, so its tail is parsed - here rather than by argparse: - - * Ahead of `--`: `-n`/`--dry-run` and `--timeout VALUE` (or - `--timeout=VALUE`) are consumed wherever they appear, even interleaved - with passthrough; a bundle of short flags (`-nn` == `-n -n`) is consumed - when every letter is a recognized short flag and forwarded whole to - salloc otherwise. - * The first `--` is consumed and shields everything after it: no later - token is ever parsed as a flag, and later `--` tokens are forwarded - literally. - * The first token not consumed by a known option names the TEMPLATE — on - either side of `--`. - * Every other token is passthrough to salloc, in its original order. - """ - _require_sol() - dry_run = False - timeout: str | None = None - template: str | None = None - passthrough: list[str] = [] - dd_seen = False - i = 0 - while i < len(tail): - tok = tail[i] - if dd_seen: - if template is None: - template = tok - else: - passthrough.append(tok) - elif tok == "--": - dd_seen = True - elif tok in ("-n", "--dry-run"): - dry_run = True - elif tok.startswith("--dry-run="): - print("error: option --dry-run does not take a value", file=sys.stderr) - sys.exit(2) - elif tok == "--timeout": - if i + 1 >= len(tail): - print("error: option --timeout requires an argument", file=sys.stderr) - sys.exit(2) - i += 1 - timeout = tail[i] - elif tok.startswith("--timeout="): - timeout = tok[len("--timeout=") :] - elif tok in ("-h", "--help") and help_parser is not None: - help_parser.print_help() - sys.exit(0) - elif len(tok) > 2 and tok[0] == "-" and all("a" <= c <= "z" for c in tok[1:]): - if all(c in _START_SHORTS for c in tok[1:]): - # Every letter is a known short flag — and `n` is the only - # one, so the 
bundle is some number of `-n` repeats. - dry_run = True - else: - passthrough.append(tok) - elif template is None: - template = tok - else: - passthrough.append(tok) - i += 1 - - from solx import config as cfg - from solx import jobs as jobs_mod - from solx.config import ConfigError - - out = _out(json_flag) - config = _load_or_exit(out) - timeout_seconds: int | None = None - if timeout: - try: - timeout_seconds = cfg.parse_duration(timeout) - except ConfigError as e: - out.error(f"error: {e}") - sys.exit(2) - sys.exit( - jobs_mod.cmd_start( - config=config, - template_name=template, - dry_run=dry_run, - timeout_override=timeout_seconds, - passthrough=passthrough, - out=out, - ) - ) - - -def _cmd_job_start_parsed(ns: argparse.Namespace) -> None: - # `main()` hands every `job start` invocation to `_run_job_start` before - # argparse dispatch; this reconstructs the tail for any stray path that - # still lands on the subparser, so both routes share one implementation. - tail: list[str] = [] - if ns.dry_run: - tail.append("-n") - if ns.timeout is not None: - tail.extend(["--timeout", ns.timeout]) - if ns.template is not None: - tail.append(ns.template) - tail.extend(ns.args) - _run_job_start(_json_flag(ns), tail) - - -def _cmd_config_show(ns: argparse.Namespace) -> None: - _require_sol() - out = _out(bool(getattr(ns, "json_root", False))) - config = _load_or_exit(out) - as_json = bool(ns.json_leaf) or out.json_mode - - if as_json: - from dataclasses import asdict - - # KeepRules holds compiled pathspec objects; serialize raw inputs only. 
- data = { - "default_shell": config.default_shell, - "default_template": config.default_template, - "start_timeout_seconds": config.start_timeout_seconds, - "templates": { - name: {k: v for k, v in asdict(t).items() if v not in (None, ())} - for name, t in config.templates.items() - }, - "keep": ( - { - "include": list(config.keep.raw_include), - "exclude": list(config.keep.raw_exclude), - } - if config.keep is not None - else None - ), - } - out.json(data) - sys.exit(0) - - from rich.table import Table - - c = out.stdout - c.print(f"[bold]default_shell[/] {config.default_shell}") - c.print(f"[bold]default_template[/] {config.default_template}") - c.print(f"[bold]start_timeout[/] {config.start_timeout_seconds}s") - - for name, t in config.templates.items(): - tbl = Table(title=rf"\[jobs.{name}]", show_header=False, title_justify="left") - tbl.add_row("partition", t.partition) - tbl.add_row("time", t.time) - if t.qos: - tbl.add_row("qos", t.qos) - if t.gres: - tbl.add_row("gres", t.gres) - if t.extra_args: - tbl.add_row("extra_args", " ".join(t.extra_args)) - c.print(tbl) - - if config.keep is not None: - tbl = Table(title=r"\[keep]", show_header=False, title_justify="left") - tbl.add_row("include", "\n".join(config.keep.raw_include)) - if config.keep.raw_exclude: - tbl.add_row("exclude", "\n".join(config.keep.raw_exclude)) - c.print(tbl) - else: - c.print(r"[dim]\[keep] not configured (solx keep will exit 2)[/]") - sys.exit(0) - - -def _cmd_config_edit(ns: argparse.Namespace) -> None: - _require_sol() - import shlex - import shutil - import subprocess - - from solx import config as cfg - - p = cfg.config_path() - if not p.exists(): - print(f"no config at {p}. run `solx init` first.", file=sys.stderr) - sys.exit(2) - # $EDITOR is often a command with flags (e.g. "code --wait", "vim -u NORC"), - # so split it into argv rather than treating the whole string as one binary. 
- editor = os.environ.get("EDITOR") or shutil.which("vi") or "nano" - editor_argv = shlex.split(editor) - sys.exit(subprocess.call([*editor_argv, str(p)])) - - -def _cmd_config_import_solkeep(ns: argparse.Namespace) -> None: - _require_sol() - from solx import init as init_mod - - sys.exit( - init_mod.cmd_import_solkeep( - solkeep=ns.solkeep, force=ns.force, out=_out(_json_flag(ns)) - ) - ) - - -def _cmd_completions(ns: argparse.Namespace) -> None: - shell = ns.shell.lower() - if shell not in {"bash", "zsh", "fish"}: - print(f"unknown shell {shell!r}; choose bash, zsh, or fish.", file=sys.stderr) - sys.exit(2) - from solx import _completions - - script = { - "bash": _completions.bash_script, - "zsh": _completions.zsh_script, - "fish": _completions.fish_script, - }[shell]() - print(script) - sys.exit(0) - - -def _cmd_version(ns: argparse.Namespace) -> None: - print(__version__) - sys.exit(0) - - -def _cmd_help(ns: argparse.Namespace) -> None: - # The root help, matching `solx --help`. - ns.help_parser.print_help() - sys.exit(0) - - -# --- parser tree ------------------------------------------------------------ - - -def _add_json(p: argparse.ArgumentParser, help: str = _JSON_HELP) -> None: - p.add_argument( - "--json", action="store_true", dest="json_leaf", default=False, help=help - ) - - -def _build_parser() -> tuple[argparse.ArgumentParser, argparse.ArgumentParser]: - """Build the argparse tree; returns (root parser, `job start` subparser). - - Every parser sets ``allow_abbrev=False``: option prefixes are never - expanded (`--time` must not match `--timeout`). 
- """ - import argparse - from pathlib import Path - - class _VersionAction(argparse.Action): - """Record `--version`; `main()` prints the version only after the - whole line parses, so invalid tokens elsewhere still error (exit 2).""" - - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, True) - - parser = argparse.ArgumentParser( - prog="solx", - description="CLI for ASU's Sol supercomputer.", - allow_abbrev=False, - ) - parser.add_argument( - "--version", - action=_VersionAction, - dest="show_version", - default=False, - nargs=0, - help="Show version and exit.", - ) - parser.add_argument( - "--json", action="store_true", dest="json_root", default=False, help=_JSON_HELP - ) - parser.set_defaults(func=None, help_parser=parser) - sub = parser.add_subparsers(dest="command", metavar="COMMAND", title="commands") - - # -- init - p = sub.add_parser( - "init", - help="Write a starter config.toml.", - description="Write a starter config.toml.", - allow_abbrev=False, - ) - p.add_argument( - "-f", "--force", "-y", "--yes", - dest="force", - action="store_true", - help="Overwrite without prompting (-y/--yes accepted too).", - ) - _add_json(p) - p.set_defaults(func=_cmd_init) - - # -- keep - p = sub.add_parser( - "keep", - help="Renew CSV-flagged scratch files filtered by the keep block in config.", - description="Renew CSV-flagged scratch files filtered by the keep block in config.", - allow_abbrev=False, - ) - p.add_argument("--stage", default="all", help="Which warning CSVs to read.") - p.add_argument( - "--csv-dir", - dest="csv_dir", - type=Path, - default=None, - metavar="DIR", - help="Directory holding Sol's warning CSVs.", - ) - p.add_argument( - "--solkeep", - type=Path, - default=None, - metavar="FILE", - help="Path to a gitignore-style keep-list (overrides the [keep] config block).", - ) - p.add_argument( - "-j", "--jobs", - dest="jobs_n", - type=int, - default=max(1, min(8, (os.cpu_count() or 2) // 4)), - 
metavar="N", - help="Parallel touch workers.", - ) - p.add_argument( - "-y", "--yes", "-f", "--force", - dest="yes", - action="store_true", - help="Skip confirmation prompt (also -f/--force).", - ) - p.add_argument( - "-n", "--dry-run", - dest="dry_run", - action="store_true", - help="Print plan without executing.", - ) - p.add_argument( - "-v", "--verbose", action="store_true", help="Verbose plan + progress." - ) - _add_json(p) - p.set_defaults(func=_cmd_keep) - - # -- jump (shortcut for `job jump`) - p = sub.add_parser( - "jump", - help="Drop into a shell on the job's compute node (= solx job jump).", - description="Drop into a shell on the job's compute node (= solx job jump).", - allow_abbrev=False, - ) - p.add_argument( - "jobid", - nargs="?", - default=None, - help="Job ID. Defaults to current job (compute) or sole/most-recent running job (login).", - ) - p.add_argument( - "-q", "--quiet", - action="store_true", - help="Suppress the nesting / most-recent heads-up.", - ) - _add_json(p) - p.set_defaults(func=_cmd_jump) - - # -- job group - p_job = sub.add_parser( - "job", - help="Manage interactive Slurm jobs on Sol (alias: jobs).", - description="Manage interactive Slurm jobs on Sol (alias: jobs).", - allow_abbrev=False, - ) - p_job.set_defaults(func=None, help_parser=p_job) - job_sub = p_job.add_subparsers(dest="job_command", metavar="COMMAND", title="commands") - - p = job_sub.add_parser( - "list", - help="Print my Sol jobs.", - description="Print my Sol jobs.", - allow_abbrev=False, - ) - _add_json(p) - p.set_defaults(func=_cmd_job_list) - - p_start = job_sub.add_parser( - "start", - help="Start an interactive allocation from a config template.", - description="Start an interactive allocation from a config template.", - allow_abbrev=False, - ) - p_start.add_argument( - "template", - nargs="?", - default=None, - help="Template name; defaults to default_template.", - ) - p_start.add_argument( - "-n", "--dry-run", - dest="dry_run", - action="store_true", - 
help="Print salloc argv without submitting.", - ) - p_start.add_argument( - "--timeout", - default=None, - metavar="DURATION", - help='Override start_timeout (e.g. "5m", "1h").', - ) - p_start.add_argument( - "args", - nargs=argparse.REMAINDER, - metavar="ARGS", - help="Extra arguments forwarded to salloc.", - ) - # No --json leaf flag here: after `job start`, --json belongs to the - # salloc passthrough. - p_start.set_defaults(func=_cmd_job_start_parsed) - - p = job_sub.add_parser( - "stop", - help="Cancel a job (prompts unless -y).", - description="Cancel a job (prompts unless -y).", - allow_abbrev=False, - ) - p.add_argument( - "jobid", nargs="?", default=None, help="Job ID. Defaults per resolution rules." - ) - p.add_argument( - "-y", "--yes", "-f", "--force", - dest="yes", - action="store_true", - help="Skip confirmation prompt (also -f/--force).", - ) - p.add_argument( - "-n", "--dry-run", - dest="dry_run", - action="store_true", - help="Print scancel argv without executing.", - ) - _add_json(p) - p.set_defaults(func=_cmd_job_stop) - - p = job_sub.add_parser( - "jump", - help="Drop into a shell on the job's compute node.", - description="Drop into a shell on the job's compute node.", - allow_abbrev=False, - ) - p.add_argument( - "jobid", nargs="?", default=None, help="Job ID. Defaults per resolution rules." - ) - p.add_argument( - "-q", "--quiet", - action="store_true", - help="Suppress the nesting / most-recent heads-up.", - ) - _add_json(p) - p.set_defaults(func=_cmd_jump) - - p = job_sub.add_parser( - "time", - help="Print remaining time (D-HH:MM:SS).", - description="Print remaining time (D-HH:MM:SS).", - allow_abbrev=False, - ) - p.add_argument( - "jobid", nargs="?", default=None, help="Job ID. Defaults per resolution rules." 
- ) - _add_json(p) - p.set_defaults(func=_cmd_job_time) - - # -- config group - p_config = sub.add_parser( - "config", - help="Inspect and edit the solx config.", - description="Inspect and edit the solx config.", - allow_abbrev=False, - ) - p_config.set_defaults(func=None, help_parser=p_config) - config_sub = p_config.add_subparsers( - dest="config_command", metavar="COMMAND", title="commands" - ) - - p = config_sub.add_parser( - "show", - help="Print the resolved config.", - description="Print the resolved config.", - allow_abbrev=False, - ) - _add_json(p, help="Emit JSON.") - p.set_defaults(func=_cmd_config_show) - - p = config_sub.add_parser( - "edit", - help="Open the config in $EDITOR.", - description="Open the config in $EDITOR.", - allow_abbrev=False, - ) - p.set_defaults(func=_cmd_config_edit) - - p = config_sub.add_parser( - "import-solkeep", - help="Migrate a legacy ~/.solkeep keep-list into the config's [keep] block.", - description="Migrate a legacy ~/.solkeep keep-list into the config's [keep] block.", - allow_abbrev=False, - ) - p.add_argument( - "--solkeep", - type=Path, - default=None, - metavar="FILE", - help="Keep-list to import (default: ~/.solkeep).", - ) - p.add_argument( - "-f", "--force", - action="store_true", - help="Accept a lossy import (an order-dependent re-include that " - "the [keep] block can't preserve).", - ) - _add_json(p) - p.set_defaults(func=_cmd_config_import_solkeep) - - # -- completions - p = sub.add_parser( - "completions", - help="Emit a shell completion script (bash, zsh, or fish).", - description="Emit a shell completion script (bash, zsh, or fish).", - allow_abbrev=False, - ) - p.add_argument("shell", help="Target shell: bash, zsh, or fish.") - p.set_defaults(func=_cmd_completions) - - # -- meta: version / help (no --json: their output is one fixed text) - p = sub.add_parser( - "version", - help="Show version and exit (alias of --version).", - description="Show version and exit (alias of --version).", - 
allow_abbrev=False, - ) - p.set_defaults(func=_cmd_version) - - p = sub.add_parser( - "help", - help="Show help and exit (alias of --help).", - description="Show help and exit (alias of --help).", - allow_abbrev=False, - ) - p.set_defaults(func=_cmd_help) - - return parser, p_start - - -# --- entry point ------------------------------------------------------------- - - -def main(argv: list[str] | None = None) -> None: - # Completion scripts generated by solx <= 0.4.0 call back into `solx` - # with _SOLX_COMPLETE set (the Typer runtime-completion protocol). Exit - # silently so a stale installed script offers zero candidates instead of - # parsing help text as completions. - if "_SOLX_COMPLETE" in os.environ: - raise SystemExit(0) - - args = list(sys.argv[1:] if argv is None else argv) - - # Exactly `solx --version` / `solx version` short-circuits everything - # else: no Sol check, no parser tree. Any longer argv goes through - # argparse, so junk around either version form still errors. - if args == ["--version"] or args == ["version"]: - print(__version__) - raise SystemExit(0) - - # Hidden aliases, rewritten before parsing so help stays clean: - # `solx jobs …` == `solx job …` and `solx job ls` == `solx job list`. - for i, tok in enumerate(args): - if tok == "--": - break - if tok.startswith("-"): - continue - if tok == "jobs": - args[i] = "job" - if args[i] == "job" and i + 1 < len(args) and args[i + 1] == "ls": - args[i + 1] = "list" - break - - parser, start_parser = _build_parser() - - # `job start` owns its tail (unrecognized tokens are salloc passthrough), - # so it is dispatched before argparse parses anything. Root options ahead - # of the subcommand are limited to --json on this path; anything else - # falls through to argparse for regular help/error handling. 
- head: list[str] = [] - k = 0 - while k < len(args) and args[k].startswith("-") and args[k] != "--": - head.append(args[k]) - k += 1 - if args[k : k + 2] == ["job", "start"] and all(t == "--json" for t in head): - _run_job_start("--json" in head, args[k + 2 :], start_parser) - - ns = parser.parse_args(args) - if ns.show_version: - # `--version` mixed into an otherwise-valid root line wins over any - # subcommand on it. - print(__version__) - raise SystemExit(0) - if ns.func is None: - # A group (or the root) given no subcommand: print its help, exit 2. - ns.help_parser.print_help() - raise SystemExit(2) - ns.func(ns) - raise SystemExit(0) diff --git a/solx/src/solx/output.py b/solx/src/solx/output.py deleted file mode 100644 index 543a363..0000000 --- a/solx/src/solx/output.py +++ /dev/null @@ -1,182 +0,0 @@ -"""Output layer: human Rich rendering vs machine-readable JSON. - -Principle (issue #16 — "CLI design for agents"): a CLI driven by an agent -should not have to know a flag exists to get parseable output. So: - -* When stdout is **not a TTY**, data commands emit JSON automatically; on a - TTY they render Rich tables. The global `--json` flag forces JSON anywhere - (a human on a terminal gets tables with no flag; the agent passes `--json`). -* All diagnostics, progress, and errors go to **stderr**, so stdout stays a - clean data channel an agent can parse without stripping noise. -* Interactivity (whether we may *prompt*) is decided by **stdin**, separately - from the stdout-format decision. A non-interactive session never blocks on - a confirmation prompt (see `solx.jobs` / `solx.keep`). - -`Out` bundles those three decisions plus the two streams so command bodies take -a single object and stay testable: a test builds an `Out` over ``StringIO`` -streams with an explicit mode instead of poking globals. 
- -**`rich` stays off the agent path.** On the JSON / non-interactive path -`Out.auto` builds a `_Plain` writer (plain text, markup stripped) instead of a -`rich.Console`, so an agent run (`--json`, or piped output) never imports -`rich` at all. `rich` is imported only when there's a human terminal to render -a table or coloured diagnostic for. Command modules import `rich.table` / -`rich.prompt` lazily for the same reason. -""" -from __future__ import annotations - -import json as _json -import re -import sys -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Callable - -if TYPE_CHECKING: - from rich.console import Console - - -# Output mode override. The CLI sets "json" via the global --json flag; None -# means auto-detect from the stdout TTY. "plain" (force human) is supported by -# Out.auto for embedders/tests but has no CLI flag — a human on a terminal -# already gets human output by default, so forcing it isn't worth a flag. -Force = str # "json" | "plain" | None - - -# Rich style tags ([red], [/], [bold dim], …). The char class deliberately -# excludes sentence punctuation (commas, quotes) so an interpolated exception -# string like "(at line 11, column 21)" isn't mistaken for markup. A literal -# bracket is written escaped as `\[` in our messages, so it is protected first. -_MARKUP = re.compile(r"\[/?[a-zA-Z0-9 #._]*\]") - - -def _plain(msg: str) -> str: - """Strip Rich markup from `msg` for the no-Rich (agent/JSON) path.""" - msg = msg.replace("\\[", "\x00") # protect escaped literal brackets - msg = _MARKUP.sub("", msg) - return msg.replace("\x00", "[") - - -class _Plain: - """Minimal stand-in for `rich.Console` on the no-Rich path. - - Exposes just the slice command bodies (and tests) touch — `.print`, which - strips markup and writes plain text, and `.file` — so nothing imports - `rich` when output is JSON / agent-facing. Only ever receives diagnostic - strings (the human table path constructs a real `rich.Console`). 
- """ - - is_terminal = False - - def __init__(self, file: Any) -> None: - self.file = file - - def print(self, obj: Any = "") -> None: - self.file.write(_plain(str(obj)) + "\n") - - -@dataclass -class Out: - """A resolved output target: format choice + the two streams. - - * ``json_mode`` — emit JSON on the data channel (stdout) instead of Rich. - * ``interactive`` — stdin is a TTY, so prompting a human is allowed. - * ``stdout`` / ``stderr`` — the data and diagnostic writers: a - ``rich.Console`` in human mode, a ``_Plain`` writer on the agent path. - Both expose ``.print`` and ``.file``. - """ - - json_mode: bool - interactive: bool - stdout: "Console | _Plain" - stderr: "Console | _Plain" - - @classmethod - def auto( - cls, - *, - force: Force | None = None, - stdout: Any | None = None, - stderr: Any | None = None, - interactive: bool | None = None, - ) -> "Out": - """Build an `Out`, auto-detecting format from the stdout TTY. - - ``force`` (`"json"`/`"plain"`/`None`) overrides the auto-detect; the CLI - passes `"json"` (global `--json`) or `None`. ``interactive`` defaults to - whether **stdin** is a TTY. On the JSON path no `rich.Console` is built - (and `rich` is never imported) — a `_Plain` writer is used instead. - """ - # TTY-ness for format detection — from a caller-supplied stream/console - # (tests, embedders) or sys.stdout (production), without importing rich. 
- probe = stdout if stdout is not None else sys.stdout - is_tty = getattr(probe, "is_terminal", None) - if is_tty is None: - try: - is_tty = probe.isatty() - except (AttributeError, ValueError, OSError): - is_tty = False - - if force == "json": - json_mode = True - elif force == "plain": - json_mode = False - else: - json_mode = not is_tty - - if interactive is None: - try: - interactive = sys.stdin.isatty() - except (ValueError, OSError): - interactive = False - - so, se = stdout, stderr - if so is None or se is None: - if json_mode: - if so is None: - so = _Plain(sys.stdout) - if se is None: - se = _Plain(sys.stderr) - else: - from rich.console import Console - - if so is None: - so = Console() - if se is None: - se = Console(stderr=True) - return cls(json_mode=json_mode, interactive=interactive, stdout=so, stderr=se) - - # --- diagnostics: always stderr, never on the JSON stdout stream -------- - - def status(self, msg: str) -> None: - """A progress / context line. Goes to stderr in every mode.""" - self.stderr.print(msg) - - def error(self, msg: str) -> None: - """An error line. Goes to stderr in every mode.""" - self.stderr.print(msg) - - # --- data channel: stdout ----------------------------------------------- - - def json(self, obj: Any) -> None: - """Write one clean JSON document to stdout (no ANSI, no wrapping).""" - # Write straight to the underlying file so Rich never injects color - # or soft-wraps the payload, even under a forced `--json` on a TTY. - self.stdout.file.write(_json.dumps(obj, indent=2, default=str) + "\n") - self.stdout.file.flush() - - def human(self, renderable: Any) -> None: - """Render something to stdout in human mode (Rich table, text, …).""" - self.stdout.print(renderable) - - def emit(self, *, data: Any, human: Callable[[], Any]) -> None: - """Emit a result: JSON ``data`` in json mode, else the ``human`` render. - - ``human`` is a thunk so the (possibly expensive) Rich renderable is - only built when it will actually be shown. 
- """ - if self.json_mode: - self.json(data) - else: - rendered = human() - if rendered is not None: - self.stdout.print(rendered) diff --git a/solx/src/solx/side.py b/solx/src/solx/side.py deleted file mode 100644 index 4a42bcb..0000000 --- a/solx/src/solx/side.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Detect whether the current host is part of the Sol cluster. - -`solx` is Sol-only. Each subcommand asks `require_sol()` to enforce the -guard — wrong-side invocations exit 2 with a clear redirect rather than -attempting to talk to a Slurm controller that isn't there. -""" -from __future__ import annotations - -import socket -import subprocess -import sys -from typing import Literal - -Side = Literal["sol", "not-sol"] - -SOL_HOSTNAME_SUFFIX = ".sol.rc.asu.edu" - -_NOT_SOL_MESSAGE = ( - "solx is Sol-only — SSH to a Sol login node first, then re-run.\n" - "See: https://docs.rc.asu.edu/" -) - - -def detect(*, _runner=None) -> Side: - """Return "sol" if the current host is on the Sol cluster, else "not-sol". - - Looks for any token ending in `.sol.rc.asu.edu` in `hostname -a` and - `socket.getfqdn()`. Tests inject `_runner` to fake the command output - without shelling out. - """ - runner = _runner or _hostname_a - return "sol" if _matches_sol(runner()) else "not-sol" - - -def current_node() -> str: - """Best-effort short hostname for human-facing messages.""" - try: - return socket.gethostname().split(".")[0] - except OSError: - return "unknown" - - -def require_sol() -> None: - """Exit 2 with a redirect message if not on Sol. 
Used by every subcommand.""" - if detect() != "sol": - print(_NOT_SOL_MESSAGE, file=sys.stderr) - raise SystemExit(2) - - -def _matches_sol(text: str) -> bool: - return any(tok.endswith(SOL_HOSTNAME_SUFFIX) for tok in text.split()) - - -def _hostname_a() -> str: - """Run `hostname -a` and return its output; fall back to FQDN on failure.""" - fqdn = "" - try: - fqdn = socket.getfqdn() - except OSError: - pass - try: - result = subprocess.run( - ["hostname", "-a"], - capture_output=True, - text=True, - check=False, - timeout=2, - ) - except (OSError, subprocess.SubprocessError): - return fqdn - return f"{result.stdout or ''} {fqdn}" diff --git a/solx/src/solx/slurm.py b/solx/src/solx/slurm.py deleted file mode 100644 index a90a891..0000000 --- a/solx/src/solx/slurm.py +++ /dev/null @@ -1,320 +0,0 @@ -"""Thin wrappers around `squeue`, `scancel`, `salloc`, and `srun`. - -We don't try to be a Slurm client library — every function shells out and -parses the result. Tests inject `runner` so they can mock subprocess -without monkey-patching globals. -""" -from __future__ import annotations - -import os -import re -import shlex -import subprocess -from dataclasses import dataclass -from typing import Callable, Iterable - -from solx.config import JobTemplate - - -# --- types ----------------------------------------------------------------- - -# A Runner takes argv and returns (returncode, stdout, stderr). -Runner = Callable[[list[str]], tuple[int, str, str]] - - -@dataclass(frozen=True) -class Job: - """One row of `squeue -u $USER`.""" - - job_id: str - name: str - state: str - time_used: str - time_left: str - partition: str - node_list: str = "" - - @classmethod - def from_squeue_row(cls, line: str) -> "Job": - # Format-string in squeue_user_jobs() decides field count + order. 
- parts = line.split("|") - if len(parts) < 7: - raise ValueError(f"unexpected squeue row: {line!r}") - return cls( - job_id=parts[0], - name=parts[1], - state=parts[2], - time_used=parts[3], - time_left=parts[4], - partition=parts[5], - node_list=parts[6], - ) - - -class SlurmError(Exception): - """Raised for any Slurm-side failure surfaced to the user.""" - - -# --- runner --------------------------------------------------------------- - - -def real_runner(argv: list[str]) -> tuple[int, str, str]: - """Default runner: actual subprocess.run.""" - res = subprocess.run( - argv, capture_output=True, text=True, check=False - ) - return res.returncode, res.stdout, res.stderr - - -# --- squeue --------------------------------------------------------------- - - -_SQUEUE_FORMAT = "%i|%j|%T|%M|%L|%P|%R" - - -def squeue_user_jobs( - user: str | None = None, - *, - runner: Runner = real_runner, -) -> list[Job]: - """Return the user's current jobs (running, pending, etc.).""" - user = user or os.environ.get("USER") or "" - argv = [ - "squeue", - "-u", - user, - "-h", - "-o", - _SQUEUE_FORMAT, - ] - code, out, err = runner(argv) - if code != 0: - raise SlurmError(f"squeue failed: {err.strip() or out.strip()}") - rows = [line for line in out.splitlines() if line.strip()] - return [Job.from_squeue_row(line) for line in rows] - - -# --- jobid resolution ----------------------------------------------------- -# -# Resolution is VERB-AWARE. The conventions are inspired by tmux (a no-arg -# command acts on the obvious target; "most recent" when several exist; warn -# when you act on the session you're sitting in) but adapted to Slurm, where a -# cancelled job is unrecoverable and attaching spends real allocation time: -# -# * `time`/`jump` (read / attach): when several jobs match, auto-pick the -# MOST RECENT one (like `tmux attach`). Deterministic, so it's agent-safe. -# * `stop` (cancel): NEVER auto-picks among several — that's how you cancel -# the wrong job. 
It returns the candidates so the caller can print them and -# exit 2. This is the deliberate divergence from tmux's "act on most recent". -# * `jump`'s auto-pick considers RUNNING jobs only (you can't attach to a -# pending one). An EXPLICIT arg or $SLURM_JOB_ID is passed through as-is -# (no state pre-check) — by design, `srun` surfaces a wrong-state job far -# more clearly than we could, and it saves a squeue round-trip. -# -# "Inside an allocation" ($SLURM_JOB_ID set) is treated as "the current -# session": it's the default target, and acting on it carries a nesting/ -# self-cancel warning the caller surfaces. - - -VERB_JUMP = "jump" -VERB_STOP = "stop" -VERB_TIME = "time" - - -@dataclass(frozen=True) -class Resolution: - """Outcome of resolving a jobid for one verb. - - Exactly one of these holds: - * ``job_id`` is set → resolved; act on it. - * ``ambiguous`` is True → several candidates, caller must disambiguate. - * ``error`` is set → nothing to act on (no jobs / none running). - """ - - job_id: str | None = None - source: str = "arg" # arg | inside | single | most-recent - inside: bool = False # $SLURM_JOB_ID is set (acting from within an allocation) - inside_job_id: str | None = None - candidates: tuple[Job, ...] = () # set considered (for ambiguity / context) - ambiguous: bool = False - error: str | None = None - - @property - def acting_on_current(self) -> bool: - """True when the resolved job is the one we're sitting inside.""" - return self.inside and self.job_id is not None and self.job_id == self.inside_job_id - - -def _jobid_key(job_id: str) -> tuple[int, int]: - """Sort key making 'most recent' == 'highest job id'. - - Slurm assigns monotonically increasing ids, so the highest id is the - newest submission — which for `solx job start` is the one you just made. - Array ids like ``123_4`` sort by (base, index); a non-numeric id sorts - first so a real number always wins. 
- """ - base, _, idx = job_id.partition("_") - try: - return (int(base), int(idx) if idx.isdigit() else 0) - except ValueError: - return (-1, 0) - - -def most_recent(jobs: Iterable[Job]) -> Job: - """Return the most recently submitted job (highest job id).""" - return max(jobs, key=lambda j: _jobid_key(j.job_id)) - - -def resolve_jobid( - arg: str | None, - *, - verb: str = VERB_TIME, - user: str | None = None, - env: dict[str, str] | None = None, - runner: Runner = real_runner, -) -> Resolution: - """Resolve the jobid for `stop` / `jump` / `time`, verb-aware (see above). - - Order: explicit arg > inside-allocation ($SLURM_JOB_ID) > squeue. From - squeue, a single candidate is used; several are auto-resolved to the most - recent for read/attach verbs, or returned as ``ambiguous`` for ``stop``. - - Raises ``SlurmError`` if the squeue query fails (the explicit-arg and - inside-allocation paths short-circuit before any squeue call, so they never - raise). Every caller in ``jobs.py`` wraps this in try/except. - """ - env = env if env is not None else dict(os.environ) - inside_id = env.get("SLURM_JOB_ID") or None - inside = inside_id is not None - - if arg: - return Resolution(job_id=arg, source="arg", inside=inside, inside_job_id=inside_id) - if inside_id: - return Resolution( - job_id=inside_id, source="inside", inside=True, inside_job_id=inside_id - ) - - jobs = squeue_user_jobs(user=user, runner=runner) - candidates = [j for j in jobs if j.state == "RUNNING"] if verb == VERB_JUMP else jobs - - if not candidates: - # For jump, distinguish "you have jobs but none running" from "no jobs". 
- if verb == VERB_JUMP and jobs: - err = "no running job to attach to (jobs exist but none are RUNNING)" - else: - err = "no jobs found for the current user" - return Resolution(error=err, candidates=tuple(jobs), inside=inside) - - if len(candidates) == 1: - return Resolution( - job_id=candidates[0].job_id, source="single", - candidates=tuple(candidates), inside=inside, inside_job_id=inside_id, - ) - - if verb == VERB_STOP: - # Never auto-pick which job to cancel. - return Resolution( - ambiguous=True, candidates=tuple(candidates), - inside=inside, inside_job_id=inside_id, - ) - - chosen = most_recent(candidates) - return Resolution( - job_id=chosen.job_id, source="most-recent", - candidates=tuple(candidates), inside=inside, inside_job_id=inside_id, - ) - - -# --- salloc / scancel / srun argv builders ------------------------------- - - -def salloc_argv(template: JobTemplate, passthrough: Iterable[str] = ()) -> list[str]: - """Build the argv for `salloc --no-shell` from a template + CLI passthrough.""" - argv: list[str] = ["salloc", "--no-shell", "-J", f"solx-{template.name}"] - argv += ["-p", template.partition, "-t", template.time] - if template.qos: - argv += ["-q", template.qos] - if template.gres: - argv += [f"--gres={template.gres}"] - argv += list(template.extra_args) - argv += list(passthrough) - return argv - - -def scancel_argv(job_id: str) -> list[str]: - return ["scancel", job_id] - - -def srun_pty_argv(job_id: str, shell: str) -> list[str]: - """Argv for attaching a pty shell to a running allocation. - - `--overlap` lets the step share the allocation's resources with steps - already running in it. Without it, srun demands exclusive use of the node - and stalls with "step creation temporarily disabled (Requested nodes are - busy)" whenever the job already has a step occupying its resources. 
- """ - return ["srun", f"--jobid={job_id}", "--overlap", "--pty", shell] - - -def squeue_time_left_argv(job_id: str) -> list[str]: - return ["squeue", "-h", "-j", job_id, "-O", "TimeLeft"] - - -# --- salloc execution ------------------------------------------------------ - - -_GRANTED_RE = re.compile(r"Granted job allocation (\d+)") - - -def parse_granted_jobid(stderr_text: str) -> str: - """Extract the jobid from `salloc`'s stderr `Granted job allocation N` line.""" - m = _GRANTED_RE.search(stderr_text) - if not m: - raise SlurmError( - f"could not parse jobid from salloc output:\n{stderr_text}" - ) - return m.group(1) - - -def run_salloc( - argv: list[str], - *, - timeout_seconds: int, - runner: Runner | None = None, -) -> str: - """Invoke salloc and return the granted jobid. - - salloc --no-shell blocks until the allocation lands, then exits. If the - queue stalls beyond `timeout_seconds`, we kill the process and surface - a SlurmError so the user sees a clear timeout instead of a hang. - """ - if runner is not None: - # Test path: the runner returns the result directly. Timeout is - # the caller's problem in that mode — tests inject deterministic - # output without spawning subprocesses. - code, _, err = runner(argv) - if code != 0: - raise SlurmError(f"salloc failed: {err.strip()}") - return parse_granted_jobid(err) - - # Real path: subprocess with a wall-clock timeout. - try: - res = subprocess.run( - argv, - capture_output=True, - text=True, - check=False, - timeout=timeout_seconds, - ) - except subprocess.TimeoutExpired as e: - raise SlurmError( - f"salloc timed out after {timeout_seconds}s waiting for the queue. " - f"Cancel the request manually if needed; the request may still be " - f"queued. 
Argv: {shlex.join(argv)}" - ) from e - - if res.returncode != 0: - raise SlurmError( - f"salloc failed (exit {res.returncode}):\n{res.stderr.strip()}" - ) - return parse_granted_jobid(res.stderr) diff --git a/solx/tests/__init__.py b/solx/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/solx/tests/cli.rs b/solx/tests/cli.rs new file mode 100644 index 0000000..ad67301 --- /dev/null +++ b/solx/tests/cli.rs @@ -0,0 +1,603 @@ +//! End-to-end tests over the real binary. +//! +//! Each test runs `solx` in an isolated fake HOME with deterministic SLURM +//! mocks (`tests/mocks/bin`) on PATH, mirroring the behavioral parity +//! matrix: stdout is the data channel (JSON when piped), diagnostics land on +//! stderr, and exit codes follow the documented contract. + +use std::fs; +use std::path::{Path, PathBuf}; + +use assert_cmd::Command; +use predicates::prelude::*; + +const SAMPLE_CONFIG: &str = r#"default_shell = "zsh" +default_template = "default" +start_timeout = "5m" + +[jobs.default] +partition = "lightwork" +time = "1-0" +qos = "public" + +[jobs.debug] +partition = "htc" +time = "0-1" + +[jobs.gpu] +partition = "public" +gres = "gpu:a100:1" +time = "0-4" +extra_args = ["--mem=64G", "--cpus-per-task=8"] + +[keep] +include = ["/scratch/sparky/proj-a", "/scratch/sparky/proj-b/**"] +exclude = ["**/__pycache__", "**/.venv"] +"#; + +fn mocks_bin() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("mocks") + .join("bin") +} + +struct Sandbox { + home: tempfile::TempDir, +} + +impl Sandbox { + fn new() -> Self { + let home = tempfile::tempdir().expect("tempdir"); + fs::create_dir_all(home.path().join(".config/solx")).expect("config dir"); + Sandbox { home } + } + + fn with_config(self) -> Self { + fs::write( + self.home.path().join(".config/solx/config.toml"), + SAMPLE_CONFIG, + ) + .expect("write config"); + self + } + + fn write_home(&self, name: &str, content: &str) { + fs::write(self.home.path().join(name), 
content).expect("write fixture"); + } + + fn cmd(&self) -> Command { + let mut cmd = Command::cargo_bin("solx").expect("solx binary"); + cmd.env_clear() + .env("PATH", format!("{}:/usr/bin:/bin", mocks_bin().display())) + .env("HOME", self.home.path()) + .env("XDG_CONFIG_HOME", self.home.path().join(".config")) + .env("USER", "sparky") + .env("LOGNAME", "sparky") + .env("TERM", "dumb") + .env("LC_ALL", "C"); + cmd + } +} + +#[test] +fn version_flag_prints_bare_semver() { + let sb = Sandbox::new(); + sb.cmd() + .arg("--version") + .assert() + .success() + .stdout(format!("{}\n", env!("CARGO_PKG_VERSION"))) + .stderr(""); +} + +#[test] +fn version_command_matches_flag() { + let sb = Sandbox::new(); + sb.cmd() + .arg("version") + .assert() + .success() + .stdout(format!("{}\n", env!("CARGO_PKG_VERSION"))); +} + +#[test] +fn no_args_prints_help_and_exits_2() { + let sb = Sandbox::new(); + sb.cmd() + .assert() + .code(2) + .stdout(predicate::str::contains("keep").and(predicate::str::contains("job"))); +} + +#[test] +fn job_list_emits_json_when_piped() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "list"]) + .assert() + .success() + .stdout(predicate::str::contains("\"job_id\": \"54800001\"")) + .stdout(predicate::str::starts_with("[\n")) + .stderr(""); +} + +#[test] +fn job_list_squeue_failure_is_exit_1_on_stderr() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["job", "list"]) + .env("MOCK_SQUEUE_FAIL", "1") + .assert() + .code(1) + .stdout("") + .stderr("error: squeue failed: boom\n"); +} + +#[test] +fn job_time_inside_allocation_uses_env_jobid() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "time"]) + .env("SLURM_JOB_ID", "54800001") + .assert() + .success() + .stdout("{\n \"jobid\": \"54800001\",\n \"time_left\": \"2-03:04:05\"\n}\n"); +} + +#[test] +fn job_stop_dry_run_previews_scancel() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "stop", "12345", 
"-n"]) + .assert() + .success() + .stdout(predicate::str::contains("\"dry_run\": true")) + .stdout(predicate::str::contains("\"inside_allocation\": false")) + .stderr("dry-run — would run:\n"); +} + +#[test] +fn job_stop_non_interactive_refuses_without_yes() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["job", "stop", "12345"]) + .assert() + .code(2) + .stderr( + "error: non-interactive session — pass -y to cancel job 12345, or -n to preview.\n", + ); +} + +#[test] +fn job_start_dry_run_uses_default_template() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "start", "-n"]) + .assert() + .success() + .stdout(predicate::str::contains("\"template\": \"default\"")) + .stdout(predicate::str::contains("\"-q\",\n \"public\"")) + .stderr("dry-run — would run:\n"); +} + +#[test] +fn job_start_passthrough_after_dashdash() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "start", "gpu", "-n", "--", "--mem=128G"]) + .assert() + .success() + .stdout(predicate::str::contains("\"--mem=128G\"")) + .stdout(predicate::str::contains("\"template\": \"gpu\"")); +} + +#[test] +fn job_start_first_token_after_dashdash_is_template() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "start", "-n", "--", "--mem=128G"]) + .assert() + .code(1) + .stderr(predicate::str::contains( + "unknown job template '--mem=128G'. 
defined: debug, default, gpu", + )); +} + +#[test] +fn job_start_real_parses_granted_allocation() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "start"]) + .assert() + .success() + .stdout("{\n \"jobid\": \"54809999\",\n \"template\": \"default\"\n}\n") + .stderr(predicate::str::contains("allocated job 54809999")); +} + +#[test] +fn jump_exec_replaces_with_srun() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["jump", "12345", "-q"]) + .assert() + .success() + .stdout("MOCK_SRUN --jobid=12345 --overlap --pty zsh\n"); +} + +#[test] +fn keep_dry_run_plan_filters_by_keep_block() { + let sb = Sandbox::new().with_config(); + sb.write_home( + "scratch-dirs-pending-removal.csv", + "User,Directory,Size\nsparky,/scratch/sparky/proj-a,12G\nsparky,/scratch/sparky/other,3G\n", + ); + sb.write_home( + "scratch-dirs-over-90days.csv", + "User,Directory,Size\nsparky,/scratch/sparky/proj-b/data,40G\n", + ); + sb.cmd() + .args(["--json", "keep", "-n"]) + .assert() + .success() + .stdout(predicate::str::contains("\"kept_count\": 2")) + .stdout(predicate::str::contains("\"skipped_count\": 1")) + .stdout(predicate::str::contains("/scratch/sparky/proj-b/data")); +} + +#[test] +fn keep_renews_real_files() { + let sb = Sandbox::new(); + // A [keep] block pointing inside the sandbox, plus a flagged dir with a + // stale file. 
+ let scratch = sb.home.path().join("scratch"); + fs::create_dir_all(scratch.join("proj/sub")).unwrap(); + let stale = scratch.join("proj/sub/stale.bin"); + fs::write(&stale, "x").unwrap(); + let old = filetime::FileTime::from_unix_time(1_000_000, 0); + filetime::set_file_times(&stale, old, old).unwrap(); + + fs::write( + sb.home.path().join(".config/solx/config.toml"), + format!( + "default_shell = \"bash\"\ndefault_template = \"default\"\n\n\ + [jobs.default]\npartition = \"x\"\ntime = \"1-0\"\n\n\ + [keep]\ninclude = [\"{}/**\"]\n", + scratch.display() + ), + ) + .unwrap(); + sb.write_home( + "scratch-dirs-pending-removal.csv", + &format!( + "User,Directory,Size\nsparky,{},1G\n", + scratch.join("proj").display() + ), + ); + + sb.cmd() + .args(["--json", "keep", "-y", "-j", "1"]) + .assert() + .success() + .stdout(predicate::str::contains("\"files_touched\": 1")) + .stdout(predicate::str::contains("\"failures\": 0")); + + let mtime = filetime::FileTime::from_last_modification_time(&stale.metadata().unwrap()); + assert!(mtime.unix_seconds() > 1_000_000, "stale file renewed"); +} + +#[test] +fn keep_invalid_stage_exits_2() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["keep", "--stage", "bogus"]) + .assert() + .code(2) + .stderr("invalid --stage 'bogus'. choose from: all, inactive, over90, pending\n"); +} + +#[test] +fn keep_without_rules_exits_2() { + let sb = Sandbox::new(); // no config + sb.cmd() + .args(["keep", "-n"]) + .assert() + .code(2) + .stderr("error: no [keep] block in config. add one with `solx config edit`.\n"); +} + +#[test] +fn keep_ignores_a_solkeep_file() { + // A ~/.solkeep on disk is never read: the keep-list comes only from the + // config `[keep]` block, so with no config `keep` errors rather than + // touching anything based on the legacy file. 
+ let sb = Sandbox::new(); // no config.toml + sb.write_home(".solkeep", "/scratch/sparky/proj-a\n"); + sb.write_home( + "scratch-dirs-pending-removal.csv", + "User,Directory,Size\nsparky,/scratch/sparky/proj-a,12G\n", + ); + sb.cmd() + .args(["--json", "keep", "-n"]) + .assert() + .code(2) + .stderr(predicate::str::contains("no [keep] block in config")) + .stderr(predicate::str::contains("solkeep").not()); +} + +#[test] +fn config_show_json_preserves_file_order() { + let sb = Sandbox::new().with_config(); + let assert = sb + .cmd() + .args(["config", "show", "--json"]) + .assert() + .success(); + let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap(); + let d = stdout.find("\"default\"").unwrap(); + let g = stdout.find("\"gpu\"").unwrap(); + let b = stdout.find("\"debug\"").unwrap(); + assert!(d < b && b < g, "templates serialize in file order"); + assert!(stdout.contains("\"start_timeout_seconds\": 300")); +} + +#[test] +fn config_edit_propagates_editor_argv_and_exit() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["config", "edit"]) + .env("EDITOR", "/bin/echo -n") + .assert() + .success() + .stdout(predicate::str::ends_with("config.toml")) + .stdout(predicate::str::ends_with("\n").not()); +} + +#[test] +fn init_fresh_writes_starter_config() { + let sb = Sandbox::new(); // empty XDG + sb.cmd() + .args(["--json", "init"]) + .assert() + .success() + .stdout(predicate::str::contains("\"wrote\"")) + .stderr("edit it with `solx config edit`, then `solx job start`.\n"); + let written = sb.home.path().join(".config/solx/config.toml"); + let text = fs::read_to_string(&written).unwrap(); + assert!(text.contains("sparky")); + use std::os::unix::fs::PermissionsExt; + let mode = written.metadata().unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o600); +} + +#[test] +fn init_existing_without_force_exits_2() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["init"]) + .assert() + .code(2) + 
.stderr(predicate::str::contains( + "already exists. pass -f to overwrite.", + )); +} + +#[test] +fn completions_unknown_shell_exits_2() { + let sb = Sandbox::new(); + sb.cmd() + .args(["completions", "tcsh"]) + .assert() + .code(2) + .stdout("") + .stderr("unknown shell 'tcsh'; choose bash, zsh, or fish.\n"); +} + +#[test] +fn completions_zsh_is_compdef_script() { + let sb = Sandbox::new(); + sb.cmd() + .args(["completions", "zsh"]) + .assert() + .success() + .stdout(predicate::str::starts_with("#compdef solx")); +} + +#[test] +fn trailing_json_is_accepted_on_leaves() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["job", "list", "--json"]) + .assert() + .success() + .stdout(predicate::str::contains("\"job_id\"")); +} + +#[test] +fn version_with_junk_arguments_exits_2() { + // Only the bare `--version` / `version` forms print the version. + let sb = Sandbox::new(); + sb.cmd().args(["version", "bogus"]).assert().code(2); + sb.cmd().args(["--bogus", "--version"]).assert().code(2); + sb.cmd().args(["--version", "--bogus"]).assert().code(2); +} + +#[test] +fn help_command_rejects_arguments() { + let sb = Sandbox::new(); + sb.cmd() + .arg("help") + .assert() + .success() + .stdout(predicate::str::contains("Usage: solx")); + sb.cmd().args(["help", "job"]).assert().code(2).stdout(""); +} + +#[test] +fn dash_h_prints_help_and_exits_0() { + let sb = Sandbox::new(); + sb.cmd() + .arg("-h") + .assert() + .success() + .stdout(predicate::str::contains("Usage: solx")); +} + +#[test] +fn group_help_usage_carries_binary_name() { + let sb = Sandbox::new(); + sb.cmd() + .arg("job") + .assert() + .code(2) + .stdout(predicate::str::contains("Usage: solx job")); + sb.cmd() + .arg("config") + .assert() + .code(2) + .stdout(predicate::str::contains("Usage: solx config")); +} + +#[test] +fn job_start_help_documents_contract_options() { + let sb = Sandbox::new().with_config(); + for flags in [["job", "start", "--help"], ["job", "start", "-h"]] { + let assert = 
sb.cmd().args(flags).assert().success(); + let stdout = String::from_utf8(assert.get_output().stdout.clone()).unwrap(); + assert!(stdout.contains("Usage: solx job start")); + assert!(stdout.contains("-n, --dry-run")); + assert!(stdout.contains("--timeout")); + assert!(stdout.contains("[TEMPLATE]")); + assert!(stdout.contains("salloc")); + } +} + +#[test] +fn job_start_dry_run_with_value_exits_2() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["job", "start", "--dry-run=true"]) + .assert() + .code(2) + .stderr("error: Option '--dry-run' does not take a value.\n"); +} + +#[test] +fn job_start_submit_line_keeps_equals_tokens_bare() { + // The `submitting:` argv render quotes nothing in the gpu template. + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["--json", "job", "start", "gpu"]) + .assert() + .success() + .stderr(predicate::str::contains( + "submitting: salloc --no-shell -J solx-gpu -p public -t 0-4 \ + --gres=gpu:a100:1 --mem=64G --cpus-per-task=8", + )); +} + +#[test] +fn keep_jobs_zero_or_negative_exits_2() { + let sb = Sandbox::new().with_config(); + sb.cmd().args(["keep", "-n", "-j", "0"]).assert().code(2); + sb.cmd().args(["keep", "-n", "-j", "-2"]).assert().code(2); +} + +#[test] +fn keep_unreadable_csv_exits_1_naming_the_file() { + use std::os::unix::fs::PermissionsExt; + let sb = Sandbox::new().with_config(); + sb.write_home( + "scratch-dirs-pending-removal.csv", + "Directory,Size\n/scratch/sparky/proj-a,1G\n", + ); + let csv = sb.home.path().join("scratch-dirs-pending-removal.csv"); + fs::set_permissions(&csv, fs::Permissions::from_mode(0o000)).unwrap(); + sb.cmd() + .args(["--json", "keep", "-n"]) + .assert() + .code(1) + .stdout("") + .stderr( + predicate::str::contains("error: unable to read") + .and(predicate::str::contains("scratch-dirs-pending-removal.csv")), + ); + fs::set_permissions(&csv, fs::Permissions::from_mode(0o644)).unwrap(); +} + +#[test] +fn keep_trailing_slash_flagged_dir_is_kept() { + let sb = 
Sandbox::new().with_config(); + sb.write_home( + "scratch-dirs-pending-removal.csv", + "Directory,Size\n/scratch/sparky/proj-a/,1G\n", + ); + sb.cmd() + .args(["--json", "keep", "-n"]) + .assert() + .success() + .stdout( + predicate::str::contains("\"kept_count\": 1") + .and(predicate::str::contains("/scratch/sparky/proj-a/")), + ); +} + +#[test] +fn config_validation_error_strips_table_context() { + let sb = Sandbox::new(); + fs::write( + sb.home.path().join(".config/solx/config.toml"), + "default_shell = \"bash\"\ndefault_template = \"default\"\n\ + [jobs.default]\npartition = \"x\"\n", + ) + .unwrap(); + sb.cmd() + .args(["config", "show"]) + .assert() + .code(2) + .stderr(predicate::str::contains( + "config.toml:: required key `time` is missing", + )); +} + +#[test] +fn invalid_toml_error_is_one_line() { + let sb = Sandbox::new(); + fs::write( + sb.home.path().join(".config/solx/config.toml"), + "default_shell = [unclosed\n", + ) + .unwrap(); + let assert = sb.cmd().args(["config", "show"]).assert().code(2); + let stderr = String::from_utf8(assert.get_output().stderr.clone()).unwrap(); + assert_eq!( + stderr.lines().count(), + 1, + "single-line TOML error: {stderr}" + ); + assert!(stderr.contains("error: invalid TOML in")); + assert!(stderr.contains("(at line 1, column")); +} + +#[test] +fn config_edit_unparseable_editor_exits_1() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["config", "edit"]) + .env("EDITOR", "vim '") + .assert() + .code(1) + .stderr("error: unparseable $EDITOR value \"vim '\"\n"); +} + +#[test] +fn solx_complete_env_exits_0_silently() { + let sb = Sandbox::new().with_config(); + sb.cmd() + .args(["job", "list"]) + .env("_SOLX_COMPLETE", "complete_zsh") + .assert() + .success() + .stdout("") + .stderr(""); +} diff --git a/solx/tests/conftest.py b/solx/tests/conftest.py deleted file mode 100644 index fc26610..0000000 --- a/solx/tests/conftest.py +++ /dev/null @@ -1,64 +0,0 @@ -from __future__ import annotations - -from pathlib 
import Path - -import pytest - - -@pytest.fixture(autouse=True) -def _isolate_slurm_env(monkeypatch): - """Clear SLURM_* env vars by default so tests are deterministic. - - The dev machine is Sol itself, and pytest may be invoked from inside an - allocation. Tests that *want* `$SLURM_JOB_ID` set (e.g. compute-node - behavior) must `monkeypatch.setenv` it themselves. - """ - for k in list(monkeypatch.__dict__): - pass # noop placeholder to satisfy linters - for var in [ - "SLURM_JOB_ID", - "SLURM_JOBID", - "SLURM_NODELIST", - "SLURM_STEP_ID", - ]: - monkeypatch.delenv(var, raising=False) - - -@pytest.fixture -def config_path(tmp_path: Path) -> Path: - return tmp_path / "config.toml" - - -@pytest.fixture -def write_config(config_path: Path): - def _write(contents: str) -> Path: - config_path.write_text(contents) - return config_path - - return _write - - -SAMPLE_CONFIG_TOML = """\ -default_shell = "zsh" -default_template = "default" -start_timeout = "5m" - -[jobs.default] -partition = "lightwork" -time = "1-0" -qos = "public" - -[jobs.debug] -partition = "htc" -time = "0-1" - -[jobs.gpu] -partition = "public" -gres = "gpu:a100:1" -time = "0-4" -extra_args = ["--mem=64G", "--cpus-per-task=8"] - -[keep] -include = ["/scratch/sparky/proj-a", "/scratch/sparky/proj-b/**"] -exclude = ["**/__pycache__", "**/.venv"] -""" diff --git a/evals/parity/bin/hostname b/solx/tests/mocks/bin/hostname similarity index 100% rename from evals/parity/bin/hostname rename to solx/tests/mocks/bin/hostname diff --git a/evals/parity/bin/salloc b/solx/tests/mocks/bin/salloc similarity index 100% rename from evals/parity/bin/salloc rename to solx/tests/mocks/bin/salloc diff --git a/evals/parity/bin/scancel b/solx/tests/mocks/bin/scancel similarity index 100% rename from evals/parity/bin/scancel rename to solx/tests/mocks/bin/scancel diff --git a/evals/parity/bin/squeue b/solx/tests/mocks/bin/squeue similarity index 100% rename from evals/parity/bin/squeue rename to solx/tests/mocks/bin/squeue 
diff --git a/evals/parity/bin/srun b/solx/tests/mocks/bin/srun similarity index 100% rename from evals/parity/bin/srun rename to solx/tests/mocks/bin/srun diff --git a/solx/tests/test_completions.py b/solx/tests/test_completions.py deleted file mode 100644 index 457391f..0000000 --- a/solx/tests/test_completions.py +++ /dev/null @@ -1,325 +0,0 @@ -"""Shape and syntax coverage for the static completion scripts. - -The scripts are fully static (no callback into solx at completion time), so -the tests assert on their text — every command listed, the right registration -footer per shell — pin the `COMMANDS` table to `main.py`'s argparse tree, and, -where the shell is installed, run its syntax checker over the emitted script -plus functional probes of the bash completer (simulated COMP_WORDS). -""" -from __future__ import annotations - -import argparse -import shlex -import shutil -import subprocess - -import pytest - -from solx import _completions -from solx import main as main_mod - -TOP_COMMANDS = [ - "init", "keep", "jump", "job", "config", "completions", "version", "help", -] -JOB_SUBCOMMANDS = ["list", "start", "stop", "jump", "time"] -CONFIG_SUBCOMMANDS = ["show", "edit", "import-solkeep"] - -SCRIPTS = { - "bash": _completions.bash_script, - "zsh": _completions.zsh_script, - "fish": _completions.fish_script, -} - - -# ---- golden-shape assertions --------------------------------------------- - - -def test_zsh_starts_with_compdef_tag() -> None: - assert _completions.zsh_script().startswith("#compdef solx\n") - - -def test_zsh_dual_mode_footer() -> None: - """The zsh script supports both install modes: autoloaded from fpath - (the `loadautofunc` branch calls the completer, so the first Tab of a - session completes) and eval/source (compdef registers it).""" - script = _completions.zsh_script() - assert "if [[ $zsh_eval_context[-1] == loadautofunc ]]; then" in script - assert '_solx "$@"' in script - assert "compdef _solx solx" in script - assert 
script.rstrip().endswith("fi") - - -def test_zsh_no_bare_compdef() -> None: - """A column-0 compdef would register-only on autoload installs, leaving - the first Tab of a session empty; the call must stay inside the guard.""" - for line in _completions.zsh_script().splitlines(): - assert not line.startswith("compdef") - - -def test_zsh_path_flags_complete_files() -> None: - script = _completions.zsh_script() - assert "_files" in script - assert "--csv-dir" in script - assert "--solkeep" in script - - -def test_bash_registers_completer() -> None: - script = _completions.bash_script() - assert "_solx()" in script - assert "complete -F _solx solx" in script - assert "COMP_WORDS" in script - assert "COMP_CWORD" in script - - -def test_fish_uses_complete_lines() -> None: - script = _completions.fish_script() - assert "complete -c solx" in script - assert "__fish_use_subcommand" in script - assert "__fish_seen_subcommand_from" in script - - -@pytest.mark.parametrize("shell", sorted(SCRIPTS)) -def test_all_commands_listed(shell: str) -> None: - script = SCRIPTS[shell]() - for cmd in TOP_COMMANDS: - assert cmd in script, f"{shell} script misses top-level command {cmd!r}" - for sub in JOB_SUBCOMMANDS: - assert sub in script, f"{shell} script misses job subcommand {sub!r}" - for sub in CONFIG_SUBCOMMANDS: - assert sub in script, f"{shell} script misses config subcommand {sub!r}" - - -# ---- COMMANDS table pinned to the argparse tree --------------------------- - - -def _subparsers_action(parser) -> argparse._SubParsersAction | None: - for action in parser._actions: - if isinstance(action, argparse._SubParsersAction): - return action - return None - - -def _optional_forms(parser) -> list[tuple[str, ...]]: - """Option-string tuples of every optional except the automatic -h/--help.""" - return [ - tuple(a.option_strings) - for a in parser._actions - if a.option_strings and tuple(a.option_strings) != ("-h", "--help") - ] - - -def _positional_dests(parser) -> list[str]: - 
"""Completable positionals: skip subparser actions and REMAINDER tails.""" - return [ - a.dest - for a in parser._actions - if not a.option_strings - and not isinstance(a, argparse._SubParsersAction) - and a.nargs != argparse.REMAINDER - ] - - -def _assert_leaf_matches(parser, spec: dict, label: str) -> None: - forms = [tuple(f[0]) for f in spec.get("flags", [])] - assert _optional_forms(parser) == forms, f"{label}: flags drifted" - pos = spec.get("positional") - dests = _positional_dests(parser) - if pos is None: - assert dests == [], f"{label}: parser has a positional COMMANDS misses" - else: - assert dests == [pos[0]], f"{label}: positional drifted" - - -def test_commands_table_pins_parser_tree() -> None: - """COMMANDS is a hand-written mirror of `main._build_parser()`; walk the - argparse tree and assert the two agree exactly, so neither the parser nor - the completion scripts can drift without failing here.""" - parser, _start = main_mod._build_parser() - root_sub = _subparsers_action(parser) - assert root_sub is not None - assert list(root_sub.choices) == list(_completions.COMMANDS) - - expected_root = [ - tuple(f[0]) for f in _completions.GLOBAL_FLAGS if tuple(f[0]) != ("-h", "--help") - ] - assert _optional_forms(parser) == expected_root - - for name, spec in _completions.COMMANDS.items(): - p = root_sub.choices[name] - sub_action = _subparsers_action(p) - if "sub" in spec: - assert sub_action is not None, f"{name}: parser is a leaf, table a group" - assert list(sub_action.choices) == list(spec["sub"]), name - assert _optional_forms(p) == [], f"{name}: group grew flags" - for sname, sspec in spec["sub"].items(): - _assert_leaf_matches(sub_action.choices[sname], sspec, f"{name} {sname}") - else: - assert sub_action is None, f"{name}: parser is a group, table a leaf" - _assert_leaf_matches(p, spec, name) - - -def test_stage_choices_pin_keep_module() -> None: - """STAGE_CHOICES mirrors what `solx keep --stage` accepts.""" - from solx import keep - - assert 
_completions.STAGE_CHOICES == ("all", *keep.STAGE_ORDER) - - -def test_shell_choices_pin_dispatcher() -> None: - """SHELL_CHOICES mirrors what `solx completions` accepts and renders.""" - assert _completions.SHELL_CHOICES == ("bash", "zsh", "fish") - assert set(_completions.SHELL_CHOICES) == set(SCRIPTS) - - -# ---- group-level and re-offer behavior (script text) ----------------------- - - -def test_zsh_groups_offer_help_flags() -> None: - """`solx job -` / `solx config -` offer -h/--help.""" - script = _completions.zsh_script() - help_spec = "'(-h --help)'{-h,--help}'[Show this help message and exit.]'" - for fn in ("_solx_job()", "_solx_config()"): - body = script.split(fn, 1)[1].split("\n}", 1)[0] - assert help_spec in body, f"{fn} lacks a group-level help spec" - - -def test_fish_groups_offer_help_flags() -> None: - script = _completions.fish_script() - assert ( - "complete -c solx -n '__fish_seen_subcommand_from job jobs; " - "and not __fish_seen_subcommand_from list start stop jump time' " - "-s h -l help" in script - ) - assert ( - "complete -c solx -n '__fish_seen_subcommand_from config; " - "and not __fish_seen_subcommand_from show edit import-solkeep' " - "-s h -l help" in script - ) - - -def test_fish_leaves_offer_help_flags() -> None: - script = _completions.fish_script() - assert "complete -c solx -n '__fish_seen_subcommand_from keep' -s h -l help" in script - - -def test_fish_does_not_reoffer_completions_shell() -> None: - """After `solx completions bash`, the shell names are not offered again.""" - script = _completions.fish_script() - assert ( - "-n '__fish_seen_subcommand_from completions; " - "and not __fish_seen_subcommand_from bash zsh fish' -a 'bash zsh fish'" - in script - ) - - -# ---- shell syntax checks -------------------------------------------------- - - -@pytest.mark.skipif(shutil.which("zsh") is None, reason="zsh not installed") -def test_zsh_syntax(tmp_path) -> None: - f = tmp_path / "_solx" - f.write_text(_completions.zsh_script()) 
- subprocess.run(["zsh", "-n", str(f)], check=True) - - -@pytest.mark.skipif(shutil.which("bash") is None, reason="bash not installed") -def test_bash_syntax(tmp_path) -> None: - f = tmp_path / "solx.bash" - f.write_text(_completions.bash_script()) - subprocess.run(["bash", "-n", str(f)], check=True) - - -@pytest.mark.skipif(shutil.which("fish") is None, reason="fish not installed") -def test_fish_syntax(tmp_path) -> None: - f = tmp_path / "solx.fish" - f.write_text(_completions.fish_script()) - subprocess.run(["fish", "--no-execute", str(f)], check=True) - - -# ---- functional bash probes (simulated COMP_WORDS) ------------------------- - -bash_required = pytest.mark.skipif( - shutil.which("bash") is None, reason="bash not installed" -) - - -def _bash_compreply( - tmp_path, - words: list[str], - *, - line: str | None = None, - point: int | None = None, - cwd: str | None = None, -) -> list[str]: - """Source the bash script, call `_solx` under a simulated completion - context, and return COMPREPLY one candidate per element.""" - script = tmp_path / "solx.bash" - script.write_text(_completions.bash_script()) - if line is None: - line = " ".join(words) - if point is None: - point = len(line) - quoted_words = " ".join(shlex.quote(w) for w in words) - probe = "\n".join( - [ - f"source {shlex.quote(str(script))}", - f"cd {shlex.quote(cwd)}" if cwd else ":", - f"COMP_WORDS=({quoted_words})", - f"COMP_CWORD={len(words) - 1}", - f"COMP_LINE={shlex.quote(line)}", - f"COMP_POINT={point}", - "_solx", - 'for r in "${COMPREPLY[@]}"; do printf "%s\\n" "$r"; done', - ] - ) - res = subprocess.run( - ["bash", "-c", probe], capture_output=True, text=True, check=True - ) - return res.stdout.splitlines() - - -@bash_required -def test_bash_solkeep_completes_path_with_spaces(tmp_path) -> None: - """A path containing spaces is one candidate, not one per word.""" - files = tmp_path / "files" - files.mkdir() - (files / "my keep list.txt").write_text("") - reply = _bash_compreply( - 
tmp_path, ["solx", "keep", "--solkeep", "my"], cwd=str(files) - ) - assert reply == ["my keep list.txt"] - - -@bash_required -def test_bash_solkeep_candidates_stay_literal(tmp_path) -> None: - """Candidates containing glob characters are not expanded against the cwd.""" - files = tmp_path / "files" - files.mkdir() - (files / "a*b").write_text("") - (files / "axxb").write_text("") - reply = _bash_compreply(tmp_path, ["solx", "keep", "--solkeep", "a"], cwd=str(files)) - assert sorted(reply) == ["a*b", "axxb"] - - -@bash_required -def test_bash_midword_completion_uses_cursor_prefix(tmp_path) -> None: - """Tab in the middle of `jox` (cursor after `jo`) completes `job`.""" - reply = _bash_compreply( - tmp_path, ["solx", "jox"], line="solx jox", point=len("solx jo") - ) - assert reply == ["job"] - - -@bash_required -def test_bash_group_offers_help_flags(tmp_path) -> None: - assert _bash_compreply(tmp_path, ["solx", "job", "-"]) == ["-h", "--help"] - - -@bash_required -def test_bash_completions_offers_shells_once(tmp_path) -> None: - line = "solx completions " - assert _bash_compreply( - tmp_path, ["solx", "completions", ""], line=line - ) == ["bash", "zsh", "fish"] - line = "solx completions bash " - assert _bash_compreply(tmp_path, ["solx", "completions", "bash", ""], line=line) == [] diff --git a/solx/tests/test_config.py b/solx/tests/test_config.py deleted file mode 100644 index 1c585df..0000000 --- a/solx/tests/test_config.py +++ /dev/null @@ -1,264 +0,0 @@ -from __future__ import annotations - -from pathlib import Path - -import pytest - -from solx import config as cfg -from solx.config import ConfigError -from tests.conftest import SAMPLE_CONFIG_TOML - - -def test_load_full_config(write_config) -> None: - p = write_config(SAMPLE_CONFIG_TOML) - c = cfg.load(p) - assert c.default_shell == "zsh" - assert c.default_template == "default" - assert c.start_timeout_seconds == 300 - assert set(c.templates) == {"default", "debug", "gpu"} - - gpu = c.templates["gpu"] - assert 
gpu.partition == "public" - assert gpu.gres == "gpu:a100:1" - assert gpu.time == "0-4" - assert gpu.qos is None - assert gpu.extra_args == ("--mem=64G", "--cpus-per-task=8") - - -def test_template_lookup_missing_raises(write_config) -> None: - c = cfg.load(write_config(SAMPLE_CONFIG_TOML)) - with pytest.raises(ConfigError, match="unknown job template"): - c.template("nonexistent") - - -def test_load_missing_file(tmp_path: Path) -> None: - with pytest.raises(ConfigError, match="run `solx init`"): - cfg.load(tmp_path / "absent.toml") - - -def test_invalid_toml(write_config) -> None: - p = write_config("default_shell = [unclosed array") - with pytest.raises(ConfigError, match="invalid TOML"): - cfg.load(p) - - -def test_required_default_shell(write_config) -> None: - p = write_config( - """default_template = "default" -[jobs.default] -partition = "x" -time = "1-0" -""" - ) - with pytest.raises(ConfigError, match="default_shell"): - cfg.load(p) - - -def test_required_default_template(write_config) -> None: - p = write_config( - """default_shell = "bash" -[jobs.default] -partition = "x" -time = "1-0" -""" - ) - with pytest.raises(ConfigError, match="default_template"): - cfg.load(p) - - -def test_at_least_one_jobs_table(write_config) -> None: - p = write_config('default_shell = "bash"\ndefault_template = "x"\n') - with pytest.raises(ConfigError, match="\\[jobs\\.\\] table"): - cfg.load(p) - - -def test_default_template_must_exist(write_config) -> None: - p = write_config( - """default_shell = "bash" -default_template = "missing" - -[jobs.default] -partition = "x" -time = "1-0" -""" - ) - with pytest.raises(ConfigError, match="not defined"): - cfg.load(p) - - -def test_template_required_keys(write_config) -> None: - p = write_config( - """default_shell = "bash" -default_template = "default" - -[jobs.default] -partition = "x" -""" - ) - with pytest.raises(ConfigError, match="`time`"): - cfg.load(p) - - -def test_extra_args_must_be_string_array(write_config) -> None: - p = 
write_config( - """default_shell = "bash" -default_template = "default" - -[jobs.default] -partition = "x" -time = "1-0" -extra_args = [1, 2] -""" - ) - with pytest.raises(ConfigError, match="extra_args"): - cfg.load(p) - - -def test_keep_match_include_only() -> None: - keep = cfg._parse_keep( - {"include": ["/scratch/sparky/proj-a/**"]}, source="t" - ) - assert keep is not None - assert keep.matches("/scratch/sparky/proj-a/data.csv") - assert not keep.matches("/scratch/sparky/proj-b/data.csv") - - -def test_keep_exclude_carve_out() -> None: - keep = cfg._parse_keep( - { - "include": ["/scratch/sparky/proj-a/**"], - "exclude": ["**/__pycache__/**", "**/.venv/**"], - }, - source="t", - ) - assert keep is not None - assert keep.matches("/scratch/sparky/proj-a/run/data.csv") - assert not keep.matches("/scratch/sparky/proj-a/run/__pycache__/x.pyc") - assert not keep.matches("/scratch/sparky/proj-a/.venv/lib/x.py") - - -def test_keep_requires_include() -> None: - with pytest.raises(ConfigError, match="non-empty array"): - cfg._parse_keep({"exclude": ["x"]}, source="t") - - -def test_keep_absent_returns_none() -> None: - assert cfg._parse_keep(None, source="t") is None - - -def test_parse_duration() -> None: - assert cfg.parse_duration("30s") == 30 - assert cfg.parse_duration("10m") == 600 - assert cfg.parse_duration("1h") == 3600 - assert cfg.parse_duration(" 5M ") == 300 - - -def test_parse_duration_invalid() -> None: - with pytest.raises(ConfigError): - cfg.parse_duration("never") - - -def test_config_path_honors_xdg(monkeypatch, tmp_path: Path) -> None: - monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "x")) - assert cfg.config_path() == tmp_path / "x" / "solx" / "config.toml" - - -def test_config_path_falls_back_to_home(monkeypatch, tmp_path: Path) -> None: - monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) - monkeypatch.setattr(Path, "home", classmethod(lambda cls: tmp_path)) - assert cfg.config_path() == tmp_path / ".config" / "solx" / "config.toml" - - 
-def test_starter_config_loads_clean(tmp_path: Path) -> None: - """The text `solx init` writes must round-trip through `load()`.""" - p = tmp_path / "starter.toml" - p.write_text(cfg.starter_config_text()) - c = cfg.load(p) - assert c.default_shell == "bash" - assert c.default_template == "default" - assert "default" in c.templates - assert "debug" in c.templates - assert c.keep is None # commented out in starter; user uncomments - - -def test_starter_config_no_maintainer_name() -> None: - """Public starter must use `sparky`, never the maintainer's name.""" - text = cfg.starter_config_text() - assert "swan16" not in text - assert "" not in text - assert "sparky" in text # in the commented [keep] example - - -def test_load_unreadable_raises_config_error(tmp_path: Path) -> None: - """A directory where a file is expected -> OSError -> clean ConfigError.""" - p = tmp_path / "config.toml" - p.mkdir() # exists() is True, but open('rb') raises IsADirectoryError - with pytest.raises(ConfigError, match="unable to read"): - cfg.load(p) - - -def test_load_solkeep(tmp_path: Path) -> None: - p = tmp_path / ".solkeep" - p.write_text( - "# comment\n" - "/scratch/sparky/proj\n" - "!/scratch/sparky/proj/**/__pycache__\n" - ) - rules = cfg.load_solkeep(p) - assert rules is not None - assert rules.matches("/scratch/sparky/proj/src") # kept (prefix) - assert not rules.matches("/scratch/sparky/proj/a/__pycache__") # negated - assert not rules.matches("/scratch/sparky/other") # not listed - - -def test_load_solkeep_missing(tmp_path: Path) -> None: - assert cfg.load_solkeep(tmp_path / "nope") is None - - -def test_load_solkeep_comments_only(tmp_path: Path) -> None: - p = tmp_path / ".solkeep" - p.write_text("# just a comment\n\n") - assert cfg.load_solkeep(p) is None - - -def test_import_solkeep_splits_include_exclude(tmp_path: Path) -> None: - p = tmp_path / ".solkeep" - p.write_text( - "# comment\n" - "/scratch/sparky/proj\n" - "/scratch/sparky/exp/**\n" - "!**/__pycache__\n" - ) - 
result = cfg.import_solkeep(p) - assert result is not None - include, exclude = result - assert include == ["/scratch/sparky/proj", "/scratch/sparky/exp/**"] - assert exclude == ["**/__pycache__"] - - -def test_import_solkeep_missing(tmp_path: Path) -> None: - assert cfg.import_solkeep(tmp_path / "nope") is None - - -def test_import_solkeep_no_includes(tmp_path: Path) -> None: - p = tmp_path / ".solkeep" - p.write_text("# only comments\n!**/__pycache__\n") # exclude-only: nothing to keep - assert cfg.import_solkeep(p) is None - - -def test_starter_config_with_imported_keep_round_trips(tmp_path: Path) -> None: - text = cfg.starter_config_text(keep=(["/scratch/sparky/proj"], ["**/__pycache__"])) - p = tmp_path / "config.toml" - p.write_text(text) - c = cfg.load(p) - assert c.keep is not None - assert c.keep.matches("/scratch/sparky/proj/src") - assert not c.keep.matches("/scratch/sparky/proj/a/__pycache__") - - -def test_starter_config_default_keeps_placeholder(tmp_path: Path) -> None: - text = cfg.starter_config_text() # no import - assert "sparky" in text - p = tmp_path / "config.toml" - p.write_text(text) - assert cfg.load(p).keep is None # [keep] is a commented placeholder diff --git a/solx/tests/test_init.py b/solx/tests/test_init.py deleted file mode 100644 index 40fbb28..0000000 --- a/solx/tests/test_init.py +++ /dev/null @@ -1,340 +0,0 @@ -from __future__ import annotations - -import json -import stat -from io import StringIO -from pathlib import Path - -import pytest -import rich.prompt -from rich.console import Console - -from solx import init as init_mod -from solx import config as cfg -from solx.output import Out - - -def make_out(*, json_mode: bool = False, interactive: bool = False) -> Out: - so = Console(file=StringIO(), force_terminal=False, width=200) - se = Console(file=StringIO(), force_terminal=False, width=200) - return Out(json_mode=json_mode, interactive=interactive, stdout=so, stderr=se) - - -def test_init_writes_fresh_config(tmp_path: Path) 
-> None: - p = tmp_path / "config.toml" - code = init_mod.cmd_init(path=p, force=False, out=make_out()) - assert code == 0 - assert p.exists() - # Round-trips via load: - loaded = cfg.load(p) - assert loaded.default_template == "default" - - -def test_init_creates_parent_dirs(tmp_path: Path) -> None: - p = tmp_path / "deep" / "config" / "solx" / "config.toml" - code = init_mod.cmd_init(path=p, force=False, out=make_out()) - assert code == 0 - assert p.exists() - - -def test_init_mode_0600(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - init_mod.cmd_init(path=p, force=False, out=make_out()) - mode = stat.S_IMODE(p.stat().st_mode) - assert mode == 0o600 - - -def test_init_json_mode(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - out = make_out(json_mode=True, interactive=False) - code = init_mod.cmd_init(path=p, force=False, out=out) - assert code == 0 - assert json.loads(out.stdout.file.getvalue()) == {"wrote": str(p)} - - -def test_init_refuses_existing_without_force(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - p.write_text("# existing user config\n") - code = init_mod.cmd_init( - path=p, - force=False, - out=make_out(interactive=True), - confirm_fn=lambda *a, **kw: False, - ) - assert code == 1 - assert p.read_text() == "# existing user config\n" # unchanged - - -def test_init_non_interactive_existing_refuses(tmp_path: Path) -> None: - """No TTY + existing config + no -f -> exit 2, never prompt, never overwrite.""" - p = tmp_path / "config.toml" - p.write_text("# existing user config\n") - out = make_out(interactive=False) - code = init_mod.cmd_init( - path=p, - force=False, - out=out, - confirm_fn=lambda *a, **kw: pytest.fail("must not prompt"), - ) - assert code == 2 - assert p.read_text() == "# existing user config\n" - assert "-f" in out.stderr.file.getvalue() - - -def test_init_overwrites_with_force(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - p.write_text("# old\n") - code = init_mod.cmd_init(path=p, force=True, 
out=make_out()) - assert code == 0 - assert "default_template" in p.read_text() - - -def test_init_overwrites_when_user_confirms(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - p.write_text("# old\n") - code = init_mod.cmd_init( - path=p, - force=False, - out=make_out(interactive=True), - confirm_fn=lambda *a, **kw: True, - walkthrough_fn=lambda out, sk: None, # skip the walkthrough - ) - assert code == 0 - assert "default_template" in p.read_text() - - -def test_init_walkthrough_picks_shell(tmp_path: Path) -> None: - """An interactive walkthrough that picks a shell sets default_shell.""" - p = tmp_path / "config.toml" - code = init_mod.cmd_init( - path=p, force=False, out=make_out(interactive=True), - walkthrough_fn=lambda out, sk: {"shell": "zsh", "keep": None}, - ) - assert code == 0 - assert cfg.load(p).default_shell == "zsh" - - -def test_init_walkthrough_declined_keeps_default(tmp_path: Path) -> None: - p = tmp_path / "config.toml" - init_mod.cmd_init( - path=p, force=False, out=make_out(interactive=True), - walkthrough_fn=lambda out, sk: None, # declined - ) - assert cfg.load(p).default_shell == "bash" - - -def test_init_no_walkthrough_when_noninteractive(tmp_path: Path) -> None: - """A non-interactive session never runs the walkthrough (no silent import).""" - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n") - p = tmp_path / "config.toml" - init_mod.cmd_init( - path=p, force=False, solkeep=solkeep, out=make_out(interactive=False), - walkthrough_fn=lambda out, sk: pytest.fail("walkthrough must not run"), - ) - c = cfg.load(p) - assert c.default_shell == "bash" - assert c.keep is None # nothing imported without the prompt - - -def test_init_walkthrough_imports_solkeep(tmp_path: Path) -> None: - """The walkthrough's import step carries ~/.solkeep into [keep].""" - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n!**/__pycache__\n") - cfgpath = tmp_path / "config.toml" - out = 
make_out(interactive=True) - code = init_mod.cmd_init( - path=cfgpath, force=False, solkeep=solkeep, out=out, - walkthrough_fn=lambda o, sk: {"shell": "bash", "keep": cfg.import_solkeep(sk)}, - ) - assert code == 0 - c = cfg.load(cfgpath) - assert c.keep is not None - assert c.keep.matches("/scratch/sparky/proj/x") - assert not c.keep.matches("/scratch/sparky/proj/x/__pycache__") - assert "imported" in out.stderr.file.getvalue() - - -def test_default_walkthrough_prompts_import_and_shell(tmp_path: Path, monkeypatch) -> None: - """The real walkthrough asks to import .solkeep, then picks a shell.""" - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n") - monkeypatch.setattr(rich.prompt.Confirm, "ask", lambda *a, **kw: True) # walkthrough + import - monkeypatch.setattr(rich.prompt.Prompt, "ask", lambda *a, **kw: "zsh") - res = init_mod._default_walkthrough(make_out(interactive=True), solkeep) - assert res == {"shell": "zsh", "keep": (["/scratch/sparky/proj"], [])} - - -def test_default_walkthrough_declines_import(tmp_path: Path, monkeypatch) -> None: - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n") - answers = iter([True, False]) # walkthrough yes, import no - monkeypatch.setattr(rich.prompt.Confirm, "ask", lambda *a, **kw: next(answers)) - monkeypatch.setattr(rich.prompt.Prompt, "ask", lambda *a, **kw: "bash") - res = init_mod._default_walkthrough(make_out(interactive=True), solkeep) - assert res == {"shell": "bash", "keep": None} - - -def test_default_walkthrough_declined(monkeypatch) -> None: - monkeypatch.setattr(rich.prompt.Confirm, "ask", lambda *a, **kw: False) - assert init_mod._default_walkthrough(make_out(interactive=True), None) is None - - -def test_init_no_solkeep_keeps_placeholder(tmp_path: Path) -> None: - """With no walkthrough/import, the starter keeps the commented [keep] placeholder.""" - cfgpath = tmp_path / "config.toml" - init_mod.cmd_init( - path=cfgpath, force=False, solkeep=tmp_path / 
"absent", out=make_out() - ) - c = cfg.load(cfgpath) - assert c.keep is None - assert "sparky" in cfgpath.read_text() - - -# ---- config import-solkeep (the .solkeep -> [keep] migration) ------------ - -_CONFIG_NO_KEEP = """\ -default_shell = "bash" -default_template = "default" - -[jobs.default] -partition = "lightwork" -time = "1-0" -""" - - -def test_import_solkeep_appends_keep_block(tmp_path: Path) -> None: - """A config without [keep] + a ~/.solkeep -> a [keep] block is appended.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n!**/__pycache__\n") - - out = make_out() - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=out) - assert code == 0 - c = cfg.load(cfgpath) - assert c.keep is not None - assert c.keep.matches("/scratch/sparky/proj/x") - assert not c.keep.matches("/scratch/sparky/proj/x/__pycache__") - assert "imported" in out.stderr.file.getvalue() - - -def test_import_solkeep_refuses_when_keep_exists(tmp_path: Path) -> None: - """A config that already has [keep] is left alone (a 2nd table is invalid TOML).""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text( - _CONFIG_NO_KEEP + '\n[keep]\ninclude = ["/scratch/sparky/existing"]\n' - ) - before = cfgpath.read_text() - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n") - - out = make_out() - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=out) - assert code == 2 - assert cfgpath.read_text() == before # untouched - assert "already has" in out.stderr.file.getvalue() - - -def test_import_solkeep_no_config_exits_2(tmp_path: Path) -> None: - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n") - out = make_out() - code = init_mod.cmd_import_solkeep( - path=tmp_path / "absent.toml", solkeep=solkeep, out=out - ) - assert code == 2 - assert "solx init" in out.stderr.file.getvalue() - - -def 
test_import_solkeep_no_patterns_exits_2(tmp_path: Path) -> None: - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text("# just a comment\n\n") - out = make_out() - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=out) - assert code == 2 - assert cfg.load(cfgpath).keep is None # nothing appended - - -def test_import_solkeep_escapes_control_char(tmp_path: Path) -> None: - """A pattern with a control byte is escaped, not left to corrupt the config.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/a\x1bb\n") # interior ESC - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=make_out()) - assert code == 0 - c = cfg.load(cfgpath) # must still parse — no corruption on disk - assert c.keep is not None - assert "/scratch/sparky/a\x1bb" in c.keep.raw_include - - -_ORDER_SENSITIVE_SOLKEEP = ( - "/scratch/sparky/proj\n" - "!/scratch/sparky/proj/big\n" - "/scratch/sparky/proj/big/keep\n" # re-include AFTER the carve-out -) - - -def test_import_solkeep_order_sensitive_refuses_without_force(tmp_path: Path) -> None: - """A lossy re-include is refused (exit 2, nothing written) unless -f.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text(_ORDER_SENSITIVE_SOLKEEP) - out = make_out() - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=out) - assert code == 2 - assert "carve-out" in out.stderr.file.getvalue() - assert cfg.load(cfgpath).keep is None # nothing written - - -def test_import_solkeep_order_sensitive_force_writes_with_warning(tmp_path: Path) -> None: - """With -f the lossy import proceeds but warns that ordering isn't preserved.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - 
solkeep.write_text(_ORDER_SENSITIVE_SOLKEEP) - out = make_out() - code = init_mod.cmd_import_solkeep( - path=cfgpath, solkeep=solkeep, force=True, out=out - ) - assert code == 0 - assert "warning" in out.stderr.file.getvalue() - assert cfg.load(cfgpath).keep is not None - - -def test_import_solkeep_faithful_shape_no_warn(tmp_path: Path) -> None: - """Includes-then-carve-outs (the safe shape) migrates without a warning.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n!**/__pycache__\n") - out = make_out() - init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=out) - assert "warning" not in out.stderr.file.getvalue() - - -def test_import_solkeep_bare_bang_dropped(tmp_path: Path) -> None: - """A bare `!` carves nothing and must not become an empty exclude pattern.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / ".solkeep" - solkeep.write_text("/scratch/sparky/proj\n! 
\n") - code = init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=make_out()) - assert code == 0 - c = cfg.load(cfgpath) - assert "" not in (c.keep.raw_exclude or ()) - - -def test_import_solkeep_records_source_path(tmp_path: Path) -> None: - """Importing from a non-default path records that path in the block comment.""" - cfgpath = tmp_path / "config.toml" - cfgpath.write_text(_CONFIG_NO_KEEP) - solkeep = tmp_path / "mykeep.txt" - solkeep.write_text("/scratch/sparky/proj\n") - init_mod.cmd_import_solkeep(path=cfgpath, solkeep=solkeep, out=make_out()) - assert str(solkeep) in cfgpath.read_text() # provenance comment names the real source diff --git a/solx/tests/test_jobs.py b/solx/tests/test_jobs.py deleted file mode 100644 index 2af5130..0000000 --- a/solx/tests/test_jobs.py +++ /dev/null @@ -1,397 +0,0 @@ -from __future__ import annotations - -import json -from io import StringIO - -import pytest -from rich.console import Console - -from solx import jobs as jobs_mod -from solx.config import Config, JobTemplate -from solx.output import Out - - -# ---- helpers ------------------------------------------------------------- - - -def make_out(*, json_mode: bool = False, interactive: bool = True) -> Out: - so = Console(file=StringIO(), force_terminal=False, width=200) - se = Console(file=StringIO(), force_terminal=False, width=200) - return Out(json_mode=json_mode, interactive=interactive, stdout=so, stderr=se) - - -def make_runner(*, code: int = 0, stdout: str = "", stderr: str = ""): - captured: list[list[str]] = [] - - def runner(argv): - captured.append(list(argv)) - return code, stdout, stderr - - return runner, captured - - -def routed_runner(*, jobs_out: str = "", time_out: str = "00:10:00\n", scancel_code: int = 0): - """A runner that returns different output per Slurm subcommand. - - Needed when one command makes several calls (e.g. `time` does squeue -u for - resolution, then squeue -O TimeLeft). 
- """ - captured: list[list[str]] = [] - - def runner(argv): - captured.append(list(argv)) - if "-O" in argv: # squeue ... -O TimeLeft - return 0, time_out, "" - if argv[:1] == ["squeue"]: # squeue -u $USER - return 0, jobs_out, "" - if argv[:1] == ["scancel"]: - return scancel_code, "", ("scancel error" if scancel_code else "") - return 0, "", "" - - return runner, captured - - -def basic_config() -> Config: - return Config( - default_shell="zsh", - default_template="default", - start_timeout_seconds=600, - templates={ - "default": JobTemplate(name="default", partition="lightwork", time="1-0", qos="public"), - "debug": JobTemplate(name="debug", partition="htc", time="0-1"), - }, - ) - - -TWO_RUNNING = ( - "12345|solx-default|RUNNING|00:01:00|00:59:00|lightwork|sg045\n" - "67890|notebook|RUNNING|00:01:00|00:59:00|htc|sg010\n" -) - - -# ---- list ---------------------------------------------------------------- - - -def test_list_empty() -> None: - runner, _ = make_runner(stdout="") - assert jobs_mod.cmd_list(runner=runner, out=make_out()) == 0 - - -def test_list_renders_jobs() -> None: - runner, _ = make_runner(stdout=TWO_RUNNING) - out = make_out() - assert jobs_mod.cmd_list(runner=runner, out=out) == 0 - assert "67890" in out.stdout.file.getvalue() - - -def test_list_json() -> None: - runner, _ = make_runner(stdout=TWO_RUNNING) - out = make_out(json_mode=True) - assert jobs_mod.cmd_list(runner=runner, out=out) == 0 - data = json.loads(out.stdout.file.getvalue()) - assert [j["job_id"] for j in data] == ["12345", "67890"] - assert data[0]["state"] == "RUNNING" - - -def test_list_propagates_squeue_failure() -> None: - runner, _ = make_runner(code=1, stderr="slurmctld is down") - assert jobs_mod.cmd_list(runner=runner, out=make_out()) == 1 - - -# ---- start --------------------------------------------------------------- - - -def test_start_dry_run_prints_argv() -> None: - out = make_out() - code = jobs_mod.cmd_start( - config=basic_config(), template_name="debug", 
dry_run=True, - timeout_override=None, passthrough=[], out=out, - ) - assert code == 0 - assert "salloc" in out.stdout.file.getvalue() - - -def test_start_dry_run_json() -> None: - out = make_out(json_mode=True) - jobs_mod.cmd_start( - config=basic_config(), template_name="debug", dry_run=True, - timeout_override=None, passthrough=[], out=out, - ) - data = json.loads(out.stdout.file.getvalue()) - assert data["dry_run"] is True - assert data["argv"][0] == "salloc" - assert data["template"] == "debug" - - -def test_start_uses_default_template_when_none() -> None: - captured: dict = {} - - def fake_runner(argv): - captured["argv"] = argv - return 0, "", "salloc: Granted job allocation 99999\n" - - code = jobs_mod.cmd_start( - config=basic_config(), template_name=None, dry_run=False, - timeout_override=None, passthrough=[], salloc_runner=fake_runner, out=make_out(), - ) - assert code == 0 - assert "-J" in captured["argv"] and "solx-default" in captured["argv"] - - -def test_start_json_emits_jobid() -> None: - def fake_runner(argv): - return 0, "", "salloc: Granted job allocation 99999\n" - - out = make_out(json_mode=True) - jobs_mod.cmd_start( - config=basic_config(), template_name="debug", dry_run=False, - timeout_override=None, passthrough=[], salloc_runner=fake_runner, out=out, - ) - assert json.loads(out.stdout.file.getvalue()) == {"jobid": "99999", "template": "debug"} - - -def test_start_unknown_template() -> None: - code = jobs_mod.cmd_start( - config=basic_config(), template_name="nope", dry_run=True, - timeout_override=None, passthrough=[], out=make_out(), - ) - assert code == 1 - - -def test_start_passthrough_appended() -> None: - captured: dict = {} - - def fake_runner(argv): - captured["argv"] = argv - return 0, "", "salloc: Granted job allocation 11111\n" - - jobs_mod.cmd_start( - config=basic_config(), template_name="debug", dry_run=False, - timeout_override=None, passthrough=["--mem=128G"], salloc_runner=fake_runner, out=make_out(), - ) - assert 
captured["argv"][-1] == "--mem=128G" - - -def test_start_salloc_failure() -> None: - def fake_runner(argv): - return 1, "", "salloc: error: invalid partition\n" - - code = jobs_mod.cmd_start( - config=basic_config(), template_name="debug", dry_run=False, - timeout_override=None, passthrough=[], salloc_runner=fake_runner, out=make_out(), - ) - assert code == 1 - - -# ---- stop ---------------------------------------------------------------- - - -def test_stop_yes_and_dry_run_mutually_exclusive() -> None: - runner, _ = make_runner() - code = jobs_mod.cmd_stop(jobid_arg="12345", yes=True, dry_run=True, runner=runner, out=make_out()) - assert code == 2 - - -def test_stop_dry_run() -> None: - runner, captured = make_runner() - code = jobs_mod.cmd_stop(jobid_arg="12345", yes=False, dry_run=True, runner=runner, out=make_out()) - assert code == 0 - assert captured == [] - - -def test_stop_with_yes_skips_prompt() -> None: - runner, captured = make_runner() - confirms: list = [] - code = jobs_mod.cmd_stop( - jobid_arg="12345", yes=True, dry_run=False, runner=runner, out=make_out(), - confirm_fn=lambda *a, **kw: confirms.append(True) or True, - ) - assert code == 0 - assert confirms == [] - assert captured == [["scancel", "12345"]] - - -def test_stop_prompts_and_proceeds() -> None: - runner, captured = make_runner() - code = jobs_mod.cmd_stop( - jobid_arg="12345", yes=False, dry_run=False, runner=runner, - out=make_out(interactive=True), confirm_fn=lambda *a, **kw: True, - ) - assert code == 0 - assert captured == [["scancel", "12345"]] - - -def test_stop_prompts_and_aborts() -> None: - runner, captured = make_runner() - code = jobs_mod.cmd_stop( - jobid_arg="12345", yes=False, dry_run=False, runner=runner, - out=make_out(interactive=True), confirm_fn=lambda *a, **kw: False, - ) - assert code == 1 - assert captured == [] - - -def test_stop_non_interactive_refuses() -> None: - """No TTY on stdin + no -y/-n -> refuse, exit 2, never prompt or cancel.""" - runner, captured = 
make_runner() - out = make_out(interactive=False) - code = jobs_mod.cmd_stop( - jobid_arg="12345", yes=False, dry_run=False, runner=runner, out=out, - confirm_fn=lambda *a, **kw: pytest.fail("must not prompt"), - ) - assert code == 2 - assert captured == [] - assert "non-interactive" in out.stderr.file.getvalue() - - -def test_stop_ambiguous_jobs_no_autopick() -> None: - runner, captured = routed_runner(jobs_out=TWO_RUNNING) - out = make_out() - code = jobs_mod.cmd_stop(jobid_arg=None, yes=True, dry_run=False, runner=runner, out=out) - assert code == 2 - # never cancelled anything - assert not any(a[:1] == ["scancel"] for a in captured) - assert "multiple jobs" in out.stderr.file.getvalue() - - -def test_stop_ambiguous_json_lists_candidates() -> None: - runner, _ = routed_runner(jobs_out=TWO_RUNNING) - out = make_out(json_mode=True) - code = jobs_mod.cmd_stop(jobid_arg=None, yes=True, dry_run=False, runner=runner, out=out) - assert code == 2 - data = json.loads(out.stdout.file.getvalue()) - assert {j["job_id"] for j in data["jobs"]} == {"12345", "67890"} - - -def test_stop_self_cancel_warns(monkeypatch) -> None: - monkeypatch.setenv("SLURM_JOB_ID", "55555") - runner, captured = routed_runner() - out = make_out() - code = jobs_mod.cmd_stop(jobid_arg=None, yes=True, dry_run=False, runner=runner, out=out) - assert code == 0 - assert ["scancel", "55555"] in captured - err = out.stderr.file.getvalue() - assert "55555" in err and "allocation you're inside" in err - - -def test_stop_self_cancel_warns_in_dry_run(monkeypatch) -> None: - """Dry-run preview must still surface that the target is the current session.""" - monkeypatch.setenv("SLURM_JOB_ID", "55555") - runner, captured = routed_runner() - out = make_out(json_mode=True) - code = jobs_mod.cmd_stop(jobid_arg=None, yes=False, dry_run=True, runner=runner, out=out) - assert code == 0 - assert not any(a[:1] == ["scancel"] for a in captured) # nothing cancelled - assert "allocation you're inside" in 
out.stderr.file.getvalue() - assert json.loads(out.stdout.file.getvalue())["inside_allocation"] is True - - -def test_stop_json_cancelled() -> None: - runner, _ = make_runner() - out = make_out(json_mode=True) - jobs_mod.cmd_stop(jobid_arg="12345", yes=True, dry_run=False, runner=runner, out=out) - assert json.loads(out.stdout.file.getvalue()) == {"cancelled": "12345"} - - -# ---- jump ---------------------------------------------------------------- - - -def test_jump_builds_correct_argv() -> None: - runner, _ = make_runner() - captured: list = [] - code = jobs_mod.cmd_jump( - config=basic_config(), jobid_arg="12345", runner=runner, - exec_fn=lambda argv: captured.append(argv), out=make_out(), - ) - assert code == 0 - assert captured == [["srun", "--jobid=12345", "--overlap", "--pty", "zsh"]] - - -def test_jump_from_inside_warns_and_proceeds(monkeypatch) -> None: - """Attach is non-destructive: warn about nesting but still attach (exit 0).""" - monkeypatch.setenv("SLURM_JOB_ID", "99999") - runner, _ = make_runner() - captured: list = [] - out = make_out() - code = jobs_mod.cmd_jump( - config=basic_config(), jobid_arg=None, runner=runner, - exec_fn=lambda argv: captured.append(argv), out=out, - ) - assert code == 0 - assert captured == [["srun", "--jobid=99999", "--overlap", "--pty", "zsh"]] - assert "already inside job 99999" in out.stderr.file.getvalue() - - -def test_jump_inside_quiet_suppresses_warning(monkeypatch) -> None: - monkeypatch.setenv("SLURM_JOB_ID", "99999") - runner, _ = make_runner() - captured: list = [] - out = make_out() - code = jobs_mod.cmd_jump( - config=basic_config(), jobid_arg=None, quiet=True, runner=runner, - exec_fn=lambda argv: captured.append(argv), out=out, - ) - assert code == 0 - assert captured == [["srun", "--jobid=99999", "--overlap", "--pty", "zsh"]] - assert out.stderr.file.getvalue() == "" - - -def test_jump_most_recent_running() -> None: - runner, _ = routed_runner(jobs_out=TWO_RUNNING) - captured: list = [] - out = 
make_out() - code = jobs_mod.cmd_jump( - config=basic_config(), jobid_arg=None, runner=runner, - exec_fn=lambda argv: captured.append(argv), out=out, - ) - assert code == 0 - # highest jobid (67890) is "most recent" - assert captured == [["srun", "--jobid=67890", "--overlap", "--pty", "zsh"]] - assert "most recent" in out.stderr.file.getvalue() - - -def test_jump_no_running_job() -> None: - pending = "12345|nb|PENDING|00:00|01:00|htc|(Resources)\n" - runner, _ = routed_runner(jobs_out=pending) - captured: list = [] - out = make_out() - code = jobs_mod.cmd_jump( - config=basic_config(), jobid_arg=None, runner=runner, - exec_fn=lambda argv: captured.append(argv), out=out, - ) - assert code == 1 - assert captured == [] - assert "no running job" in out.stderr.file.getvalue() - - -# ---- time ---------------------------------------------------------------- - - -def test_time_prints_left() -> None: - runner, _ = make_runner(stdout="00:42:13\n") - out = make_out() - code = jobs_mod.cmd_time(jobid_arg="12345", runner=runner, out=out) - assert code == 0 - assert "00:42:13" in out.stdout.file.getvalue() - - -def test_time_json() -> None: - runner, _ = make_runner(stdout="00:42:13\n") - out = make_out(json_mode=True) - jobs_mod.cmd_time(jobid_arg="12345", runner=runner, out=out) - assert json.loads(out.stdout.file.getvalue()) == {"jobid": "12345", "time_left": "00:42:13"} - - -def test_time_most_recent() -> None: - runner, _ = routed_runner(jobs_out=TWO_RUNNING, time_out="01:00:00\n") - out = make_out() - code = jobs_mod.cmd_time(jobid_arg=None, runner=runner, out=out) - assert code == 0 - data_line = out.stdout.file.getvalue() - assert "01:00:00" in data_line - assert "most recent 67890" in out.stderr.file.getvalue() - - -def test_time_squeue_error() -> None: - runner, _ = make_runner(code=1, stderr="invalid jobid") - assert jobs_mod.cmd_time(jobid_arg="12345", runner=runner, out=make_out()) == 1 diff --git a/solx/tests/test_keep.py b/solx/tests/test_keep.py deleted file 
mode 100644 index 574a41f..0000000 --- a/solx/tests/test_keep.py +++ /dev/null @@ -1,592 +0,0 @@ -from __future__ import annotations - -import os -import time -from io import StringIO -from pathlib import Path - -import pathspec -import pytest -from rich.console import Console - -from solx import keep as keep_mod -from solx.config import Config, JobTemplate, KeepRules -from solx.output import Out - - -def make_out(*, json_mode: bool = False, interactive: bool = True) -> Out: - so = Console(file=StringIO(), force_terminal=False, width=200) - se = Console(file=StringIO(), force_terminal=False, width=200) - return Out(json_mode=json_mode, interactive=interactive, stdout=so, stderr=se) - - -def make_config(*, keep: KeepRules | None = None) -> Config: - return Config( - default_shell="bash", - default_template="default", - start_timeout_seconds=600, - templates={ - "default": JobTemplate(name="default", partition="lightwork", time="1-0") - }, - keep=keep, - ) - - -def make_keep(*, include: list[str], exclude: list[str] | None = None) -> KeepRules: - return KeepRules( - include=pathspec.GitIgnoreSpec.from_lines(include), - exclude=pathspec.GitIgnoreSpec.from_lines(exclude or []), - raw_include=tuple(include), - raw_exclude=tuple(exclude or []), - ) - - -def write_csv(path: Path, dirs: list[str]) -> None: - lines = ["Directory,LastAccess,Size"] - lines += [f"{d},2026-01-01,1G" for d in dirs] - path.write_text("\n".join(lines) + "\n") - - -# A stub execute_fn that records which directories the plan would touch and -# returns (files_touched, failures). Replaces the real process pool. 
-def recording_execute(record: list[str], *, files_each: int = 1, failures: int = 0): - def _execute(plan, jobs_n, out): - record.extend(d for _, d in plan.kept) - return len(plan.kept) * files_each, failures - - return _execute - - -# ---- planning ------------------------------------------------------------ - - -def test_load_csv_dirs(tmp_path: Path) -> None: - p = tmp_path / "scratch-dirs-pending-removal.csv" - write_csv(p, ["/scratch/sparky/a", "/scratch/sparky/b"]) - assert keep_mod.load_csv_dirs(p) == [ - "/scratch/sparky/a", - "/scratch/sparky/b", - ] - - -def test_load_csv_dirs_missing(tmp_path: Path) -> None: - assert keep_mod.load_csv_dirs(tmp_path / "absent.csv") == [] - - -def test_build_plan_filters_by_keep(tmp_path: Path) -> None: - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/proj-a", "/scratch/sparky/proj-z"], - ) - write_csv( - tmp_path / "scratch-dirs-over-90days.csv", - ["/scratch/sparky/proj-b"], - ) - keep = make_keep( - include=["/scratch/sparky/proj-a", "/scratch/sparky/proj-b"], - ) - plan = keep_mod.build_plan(tmp_path, list(keep_mod.STAGE_ORDER), keep) - assert {d for _, d in plan.kept} == { - "/scratch/sparky/proj-a", - "/scratch/sparky/proj-b", - } - assert {d for _, d in plan.skipped} == {"/scratch/sparky/proj-z"} - - -def test_build_plan_dedupes_across_stages(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - write_csv(tmp_path / "scratch-dirs-over-90days.csv", ["/scratch/sparky/a"]) - keep = make_keep(include=["/scratch/sparky/a"]) - plan = keep_mod.build_plan(tmp_path, list(keep_mod.STAGE_ORDER), keep) - assert len(plan.kept) == 1 - - -def test_build_plan_exclude_carve_out(tmp_path: Path) -> None: - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - [ - "/scratch/sparky/proj/run-1", - "/scratch/sparky/proj/__pycache__", - ], - ) - keep = make_keep( - include=["/scratch/sparky/proj/**"], - exclude=["**/__pycache__"], - ) - plan = 
keep_mod.build_plan(tmp_path, ["pending"], keep) - assert {d for _, d in plan.kept} == {"/scratch/sparky/proj/run-1"} - assert {d for _, d in plan.skipped} == {"/scratch/sparky/proj/__pycache__"} - - -# ---- shard / enumerate / touch (the renewal mechanism) ------------------- - - -def test_shard_even_batches() -> None: - files = [bytes([i]) for i in range(0, 10)] - batches = keep_mod.shard(files, batch_size=3) - assert [len(b) for b in batches] == [3, 3, 3, 1] - assert sum(batches, []) == files - - -def test_shard_empty() -> None: - assert keep_mod.shard([]) == [] - - -def test_enumerate_dir_lists_all_including_hidden_and_ignored(tmp_path: Path) -> None: - (tmp_path / "a.txt").write_text("x") - (tmp_path / ".hidden").write_text("x") - sub = tmp_path / "sub" - sub.mkdir() - (sub / "b.txt").write_text("x") - # A .gitignore plus an ignored file: --no-ignore must still list it. - (tmp_path / ".gitignore").write_text("ignored.txt\n") - (tmp_path / "ignored.txt").write_text("x") - - directory, files, msg = keep_mod.enumerate_dir(str(tmp_path)) - assert msg == "ok" - assert all(os.path.isfile(p) for p in files) - # 5 regular files: a.txt, .hidden, sub/b.txt, .gitignore, ignored.txt - assert len(files) == 5 - - -def test_enumerate_dir_not_a_directory(tmp_path: Path) -> None: - missing = tmp_path / "nope" - _, files, msg = keep_mod.enumerate_dir(str(missing)) - assert files == [] - assert msg.startswith("skipped") - - -def test_touch_files_refreshes_mtime(tmp_path: Path) -> None: - f = tmp_path / "stale.txt" - f.write_text("x") - old = time.time() - 60 * 60 * 24 * 100 # 100 days ago - os.utime(f, (old, old)) - assert f.stat().st_mtime < time.time() - 1000 - - attempted, errors, msg = keep_mod.touch_files([str(f).encode()]) - assert errors == 0 - assert attempted == 1 - assert f.stat().st_mtime > time.time() - 10 - - -def test_touch_files_empty_batch() -> None: - assert keep_mod.touch_files([]) == (0, 0, "ok") - - -def test_execute_survives_raising_enumerate() -> None: - 
"""A worker that raises is counted as a failure, not an uncaught crash.""" - plan = keep_mod.Plan(kept=[("pending", "/scratch/sparky/a")]) - - def boom(_d): - raise RuntimeError("worker died") - - total, failures = keep_mod._execute(plan, 1, make_out(), enumerate_fn=boom) - assert (total, failures) == (0, 1) - - -def test_execute_survives_raising_touch() -> None: - plan = keep_mod.Plan(kept=[("pending", "/scratch/sparky/a")]) - - def enum_ok(d): - return (d, [b"/scratch/sparky/a/f1"], "ok") - - def boom(_batch): - raise RuntimeError("touch died") - - total, failures = keep_mod._execute( - plan, 1, make_out(), enumerate_fn=enum_ok, touch_fn=boom - ) - assert (total, failures) == (0, 1) - - -# ---- cmd_keep ------------------------------------------------------------ - - -def test_keep_yes_and_dry_run_mutually_exclusive(tmp_path: Path) -> None: - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=True, verbose=False, out=make_out(), - ) - assert code == 2 - - -def test_keep_bad_csv_dir_exits_2(tmp_path: Path) -> None: - """A --csv-dir that isn't a directory is surfaced, not silently empty.""" - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - bad = tmp_path / "does-not-exist" - out = make_out() - code = keep_mod.cmd_keep( - config=cfg, csv_dir=bad, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=out, - ) - assert code == 2 - assert "not a directory" in out.stderr.file.getvalue() - - -def test_keep_no_keep_block_exits_2(tmp_path: Path, monkeypatch) -> None: - monkeypatch.setenv("HOME", str(tmp_path)) # no ~/.solkeep here - cfg = make_config(keep=None) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=False, verbose=False, out=make_out(), - ) - assert code == 2 - - -def test_keep_explicit_solkeep(tmp_path: Path) -> None: - """--solkeep uses a gitignore-style 
keep-list (skill compatibility).""" - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/proj/run", "/scratch/sparky/other"], - ) - solkeep = tmp_path / "mykeep" - solkeep.write_text("/scratch/sparky/proj\n") # bare path = dir + everything under - touched: list[str] = [] - out = make_out() - code = keep_mod.cmd_keep( - config=make_config(keep=None), csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=out, solkeep=solkeep, - execute_fn=recording_execute(touched), - ) - assert code == 0 - assert touched == ["/scratch/sparky/proj/run"] - # An explicit --solkeep is a deliberate choice, not the deprecated fallback. - assert "deprecated" not in out.stderr.file.getvalue() - - -def test_keep_explicit_solkeep_empty_exits_2(tmp_path: Path) -> None: - solkeep = tmp_path / "empty" - solkeep.write_text("# only a comment\n\n") - code = keep_mod.cmd_keep( - config=make_config(keep=None), csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), solkeep=solkeep, - ) - assert code == 2 - - -def test_keep_autodetects_home_solkeep(tmp_path: Path, monkeypatch) -> None: - """With no [keep] block, an existing ~/.solkeep is picked up automatically.""" - home = tmp_path / "home" - home.mkdir() - (home / ".solkeep").write_text("/scratch/sparky/proj\n!**/__pycache__\n") - monkeypatch.setenv("HOME", str(home)) - csvdir = tmp_path / "csv" - csvdir.mkdir() - write_csv( - csvdir / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/proj/run", "/scratch/sparky/proj/__pycache__", "/scratch/sparky/x"], - ) - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=make_config(keep=None), csv_dir=csvdir, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), - execute_fn=recording_execute(touched), - ) - assert code == 0 - assert touched == ["/scratch/sparky/proj/run"] # negation carved out __pycache__ - - -def 
test_keep_solkeep_fallback_warns_deprecated(tmp_path: Path, monkeypatch) -> None: - """Falling back to ~/.solkeep emits a deprecation warning naming the cutoff.""" - home = tmp_path / "home" - home.mkdir() - (home / ".solkeep").write_text("/scratch/sparky/proj\n") - monkeypatch.setenv("HOME", str(home)) - csvdir = tmp_path / "csv" - csvdir.mkdir() - write_csv(csvdir / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/proj/run"]) - out = make_out() - keep_mod.cmd_keep( - config=make_config(keep=None), csv_dir=csvdir, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=out, - execute_fn=recording_execute([]), - ) - err = out.stderr.file.getvalue() - assert "deprecated" in err - assert keep_mod.SOLKEEP_REMOVED_IN in err - assert "import-solkeep" in err - - -def test_keep_config_keep_emits_no_deprecation(tmp_path: Path) -> None: - """Using a config [keep] block (the supported path) warns about nothing.""" - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - out = make_out() - keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=out, - execute_fn=recording_execute([]), - ) - assert "deprecated" not in out.stderr.file.getvalue() - - -def test_keep_config_precedence_over_solkeep(tmp_path: Path, monkeypatch) -> None: - """A config [keep] block wins over a present ~/.solkeep.""" - home = tmp_path / "home" - home.mkdir() - (home / ".solkeep").write_text("/scratch/sparky/from-solkeep\n") - monkeypatch.setenv("HOME", str(home)) - csvdir = tmp_path / "csv" - csvdir.mkdir() - write_csv( - csvdir / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/from-config", "/scratch/sparky/from-solkeep"], - ) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/from-config"])) - touched: list[str] = [] - keep_mod.cmd_keep( - config=cfg, csv_dir=csvdir, stage="all", jobs_n=1, - yes=True, 
dry_run=False, verbose=False, out=make_out(), - execute_fn=recording_execute(touched), - ) - assert touched == ["/scratch/sparky/from-config"] - - -def test_keep_dry_run_does_not_execute(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=True, verbose=False, out=make_out(), - execute_fn=recording_execute(touched), - ) - assert code == 0 - assert touched == [] - - -def test_keep_executes_with_yes(tmp_path: Path) -> None: - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/a", "/scratch/sparky/b"], - ) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/**"])) - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), - execute_fn=recording_execute(touched, files_each=5), - ) - assert code == 0 - assert sorted(touched) == ["/scratch/sparky/a", "/scratch/sparky/b"] - - -def test_keep_prompts_and_aborts(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=False, verbose=False, out=make_out(interactive=True), - confirm_fn=lambda *a, **kw: False, - execute_fn=recording_execute(touched), - ) - assert code == 1 - assert touched == [] - - -def test_keep_prompts_and_proceeds(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, 
stage="all", jobs_n=1, - yes=False, dry_run=False, verbose=False, out=make_out(interactive=True), - confirm_fn=lambda *a, **kw: True, - execute_fn=recording_execute(touched), - ) - assert code == 0 - assert touched == ["/scratch/sparky/a"] - - -def test_keep_non_interactive_refuses(tmp_path: Path) -> None: - """No TTY on stdin and no -y/-n -> refuse with exit 2, never prompt.""" - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - touched: list[str] = [] - out = make_out(interactive=False) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=False, verbose=False, out=out, - confirm_fn=lambda *a, **kw: pytest.fail("must not prompt"), - execute_fn=recording_execute(touched), - ) - assert code == 2 - assert touched == [] - assert "non-interactive" in out.stderr.file.getvalue() - - -def test_keep_no_matches_no_prompt(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/z"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - confirms: list = [] - touched: list[str] = [] - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=False, verbose=False, out=make_out(), - confirm_fn=lambda *a, **kw: confirms.append(True) or True, - execute_fn=recording_execute(touched), - ) - assert code == 0 - assert confirms == [] - assert touched == [] - - -def test_keep_specific_stage_only(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/p"]) - write_csv(tmp_path / "scratch-dirs-over-90days.csv", ["/scratch/sparky/o"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/**"])) - touched: list[str] = [] - keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="pending", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), - 
execute_fn=recording_execute(touched), - ) - assert touched == ["/scratch/sparky/p"] - - -def test_keep_propagates_failures(tmp_path: Path) -> None: - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), - execute_fn=lambda plan, jobs_n, out: (0, 1), - ) - assert code == 1 - - -def test_keep_json_summary(tmp_path: Path) -> None: - import json as _json - - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - out = make_out(json_mode=True, interactive=False) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=out, - execute_fn=lambda plan, jobs_n, out: (7, 0), - ) - assert code == 0 - data = _json.loads(out.stdout.file.getvalue()) - assert data["files_touched"] == 7 - assert data["dirs"] == 1 - assert data["kept"] == ["/scratch/sparky/a"] - - -def test_keep_dry_run_json_plan(tmp_path: Path) -> None: - import json as _json - - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - ["/scratch/sparky/a", "/scratch/sparky/z"], - ) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - out = make_out(json_mode=True, interactive=False) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=True, verbose=False, out=out, - ) - assert code == 0 - data = _json.loads(out.stdout.file.getvalue()) - assert data["dry_run"] is True - assert [k["dir"] for k in data["kept"]] == ["/scratch/sparky/a"] - assert [s["dir"] for s in data["skipped"]] == ["/scratch/sparky/z"] - assert data["kept_count"] == 1 and data["skipped_count"] == 1 - assert data["kept_truncated"] is False - - -def 
test_keep_json_plan_bounded(tmp_path: Path) -> None: - """Sol can flag thousands of dirs; JSON inlines a capped sample + exact counts.""" - import json as _json - - n = keep_mod.JSON_LIST_CAP + 50 - dirs = [f"/scratch/sparky/proj/run-{i:04d}" for i in range(n)] - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", dirs) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/proj/**"])) - out = make_out(json_mode=True, interactive=False) - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=True, verbose=False, out=out, - ) - assert code == 0 - data = _json.loads(out.stdout.file.getvalue()) - assert data["kept_count"] == n # exact total - assert data["kept_truncated"] is True - assert len(data["kept"]) == keep_mod.JSON_LIST_CAP # sample is capped - # full detail spilled to a temp file whose path is returned - full_path = data["full_plan_path"] - assert os.path.exists(full_path) - full = _json.load(open(full_path)) - assert len(full["kept"]) == n # complete list on disk - os.unlink(full_path) - - -def test_keep_json_plan_small_no_spill(tmp_path: Path) -> None: - """A small plan stays inline with no temp file.""" - import json as _json - - write_csv(tmp_path / "scratch-dirs-pending-removal.csv", ["/scratch/sparky/a"]) - cfg = make_config(keep=make_keep(include=["/scratch/sparky/a"])) - out = make_out(json_mode=True, interactive=False) - keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=False, dry_run=True, verbose=False, out=out, - ) - data = _json.loads(out.stdout.file.getvalue()) - assert "full_plan_path" not in data - - -# ---- end-to-end: real filesystem mutation (real-touch over a real tree) -- - - -def test_keep_end_to_end_real_touch(tmp_path: Path) -> None: - """Build a real scratch tree with stale mtimes; cmd_keep refreshes the - kept files recursively and leaves carve-outs / non-kept dirs alone. 
- - Sol flags *leaf* directories, so the CSV lists leaves — never a parent - that contains another flagged row. A kept dir is walked recursively, so a - carve-out only protects a tree when it is its own flagged row (a sibling - leaf). This is the L2 renewal coverage that the standalone renewal eval - used to provide. - """ - scratch = tmp_path / "scratch" - src = scratch / "proj" / "src" - pycache = scratch / "proj" / "__pycache__" - other = scratch / "other" - (src / "nested").mkdir(parents=True) - pycache.mkdir(parents=True) - other.mkdir(parents=True) - - kept_file = src / "keep-me.bin" - nested_file = src / "nested" / "deep.bin" # recursion within a kept leaf - carve_file = pycache / "skip.pyc" # carve-out sibling leaf - other_file = other / "not-flagged.bin" # never in [keep] - for f in (kept_file, nested_file, carve_file, other_file): - f.write_text("x") - - stale = time.time() - 60 * 60 * 24 * 100 - for f in (kept_file, nested_file, carve_file, other_file): - os.utime(f, (stale, stale)) - - # Leaves only: kept tree, the carve-out sibling, and an unkept dir. - write_csv( - tmp_path / "scratch-dirs-pending-removal.csv", - [str(src), str(pycache), str(other)], - ) - cfg = make_config( - keep=make_keep(include=[f"{scratch}/proj/**"], exclude=["**/__pycache__"]) - ) - - code = keep_mod.cmd_keep( - config=cfg, csv_dir=tmp_path, stage="all", jobs_n=1, - yes=True, dry_run=False, verbose=False, out=make_out(), - ) - assert code == 0 - - now = time.time() - assert kept_file.stat().st_mtime > now - 30 # kept leaf renewed - assert nested_file.stat().st_mtime > now - 30 # recursion renewed - assert carve_file.stat().st_mtime < now - 1000 # carve-out untouched - assert other_file.stat().st_mtime < now - 1000 # non-kept untouched diff --git a/solx/tests/test_main.py b/solx/tests/test_main.py deleted file mode 100644 index 36bde2f..0000000 --- a/solx/tests/test_main.py +++ /dev/null @@ -1,624 +0,0 @@ -"""CLI dispatch + alias coverage for the argparse entry point. 
- -These tests verify the wiring (subcommand routing, alias paths, flag -parsing). Behavior of each command body is tested in test_jobs.py / -test_keep.py / test_init.py / test_config.py. We mock `require_sol` here -so the suite passes off-Sol. -""" -from __future__ import annotations - -import json -import subprocess -import sys - -import pytest - -from solx import __version__ -from solx import config as cfg -from solx import main as main_mod -from solx import side -from solx.config import Config, JobTemplate - - -@pytest.fixture(autouse=True) -def _force_on_sol(monkeypatch): - """Skip the side guard so every test runs as if on Sol.""" - monkeypatch.setattr(side, "require_sol", lambda: None) - - -def invoke(argv: list[str]) -> int: - """Run main(argv) and return the exit code carried by SystemExit.""" - with pytest.raises(SystemExit) as excinfo: - main_mod.main(argv) - code = excinfo.value.code - return 0 if code is None else int(code) - - -def fake_config() -> Config: - return Config( - default_shell="zsh", - default_template="default", - start_timeout_seconds=600, - templates={"default": JobTemplate(name="default", partition="x", time="1-0")}, - ) - - -# ---- top level ---------------------------------------------------------- - - -def test_version(capsys) -> None: - assert invoke(["--version"]) == 0 - assert capsys.readouterr().out.strip() == __version__ - - -def test_version_flag_after_other_root_options(capsys) -> None: - assert invoke(["--json", "--version"]) == 0 - assert capsys.readouterr().out.strip() == __version__ - - -def test_help_lists_commands(capsys) -> None: - assert invoke(["--help"]) == 0 - out = capsys.readouterr().out - for cmd in ("init", "keep", "jump", "job", "jobs", "config", "completions"): - assert cmd in out - - -def test_version_subcommand_aliases_flag(capsys) -> None: - """`solx version` matches `solx --version`.""" - assert invoke(["version"]) == 0 - assert capsys.readouterr().out.strip() == __version__ - - -def 
test_help_subcommand_aliases_flag(capsys) -> None: - """`solx help` shows the root help, same as `solx --help`.""" - assert invoke(["help"]) == 0 - out = capsys.readouterr().out - for cmd in ("init", "keep", "job", "config", "completions"): - assert cmd in out - - -def test_version_subcommand_rejects_extra_args(capsys) -> None: - """`solx version bogus` is a usage error, not a version print.""" - assert invoke(["version", "bogus"]) == 2 - assert __version__ not in capsys.readouterr().out - - -def test_version_subcommand_help(capsys) -> None: - """`solx version --help` shows the command's help, not the version.""" - assert invoke(["version", "--help"]) == 0 - assert "usage: solx version" in capsys.readouterr().out - - -def test_version_flag_with_unknown_option_before(capsys) -> None: - assert invoke(["--bogus", "--version"]) == 2 - assert __version__ not in capsys.readouterr().out - - -def test_version_flag_with_unknown_option_after(capsys) -> None: - assert invoke(["--version", "--bogus"]) == 2 - assert __version__ not in capsys.readouterr().out - - -def test_help_subcommand_rejects_extra_args(capsys) -> None: - """`solx help` takes no arguments: `solx help job` is a usage error.""" - assert invoke(["help", "job"]) == 2 - - -def test_solx_complete_env_exits_silently(monkeypatch, capsys) -> None: - """With _SOLX_COMPLETE set (a <=0.4.0 completion script calling back in), - solx exits 0 with no output, so the script offers zero candidates.""" - monkeypatch.setenv("_SOLX_COMPLETE", "complete_zsh") - assert invoke(["job", "list"]) == 0 - out, err = capsys.readouterr() - assert out == "" - assert err == "" - - -def test_no_args_prints_help_and_exits_2(capsys) -> None: - assert invoke([]) == 2 - assert "usage: solx" in capsys.readouterr().out - - -def test_job_group_no_args_prints_help_and_exits_2(capsys) -> None: - assert invoke(["job"]) == 2 - assert "usage: solx job" in capsys.readouterr().out - - -def test_unknown_command_exits_2(capsys) -> None: - assert 
invoke(["frobnicate"]) == 2 - assert "usage" in capsys.readouterr().err - - -def test_no_option_abbreviation(monkeypatch, capsys) -> None: - """Option prefixes are never expanded (`--dry` must not match --dry-run).""" - from solx import keep as keep_mod - - monkeypatch.setattr(keep_mod, "cmd_keep", lambda **kw: 0) - assert invoke(["keep", "--dry"]) == 2 - - -# ---- alias coverage ----------------------------------------------------- - - -def test_jobs_alias_routes_to_job_group(monkeypatch) -> None: - """`solx jobs list` should dispatch the same as `solx job list`.""" - called: list[str] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_list", lambda **kw: called.append("list") or 0) - assert invoke(["jobs", "list"]) == 0 - assert called == ["list"] - - -def test_ls_alias(monkeypatch) -> None: - called: list[str] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_list", lambda **kw: called.append("list") or 0) - assert invoke(["job", "ls"]) == 0 - assert called == ["list"] - - -def test_jobs_alias_after_root_json(monkeypatch) -> None: - """The alias rewrite also applies after root options: `--json jobs list`.""" - called: list[str] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_list", lambda **kw: called.append("list") or 0) - assert invoke(["--json", "jobs", "list"]) == 0 - assert called == ["list"] - - -def test_top_level_jump_routes_to_job_jump(monkeypatch) -> None: - """`solx jump` should run the same body as `solx job jump`.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_jump", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["jump", "12345", "--quiet"]) == 0 - assert captured[0]["jobid_arg"] == "12345" - assert captured[0]["quiet"] is True - - -# ---- global output flags ------------------------------------------------ - - 
-def test_global_json_forces_json(monkeypatch, capsys) -> None: - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - # global --json before the subcommand; config show has no local --json here - assert invoke(["--json", "config", "show"]) == 0 - assert json.loads(capsys.readouterr().out)["default_shell"] == "zsh" - - -# ---- job subcommands ---------------------------------------------------- - - -def test_job_start_passthrough(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "default", "--", "--mem=128G"]) == 0 - assert captured[0]["template_name"] == "default" - assert captured[0]["passthrough"] == ["--mem=128G"] - - -def test_job_start_dry_run_flag(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "--dry-run"]) == 0 - assert captured[0]["dry_run"] is True - - -def test_job_start_timeout_override(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "--timeout", "5m"]) == 0 - assert captured[0]["timeout_override"] == 300 - - -def test_job_start_timeout_equals_form(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert 
invoke(["job", "start", "--timeout=5m", "-n"]) == 0 - assert captured[0]["timeout_override"] == 300 - assert captured[0]["dry_run"] is True - - -def test_job_start_invalid_timeout(monkeypatch) -> None: - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - assert invoke(["job", "start", "--timeout", "never"]) == 2 - - -def test_job_start_template_after_double_dash(monkeypatch) -> None: - """The first token not consumed by a known option names the template, - including tokens after `--`.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "-n", "--", "--mem=128G"]) == 0 - assert captured[0]["template_name"] == "--mem=128G" - assert captured[0]["passthrough"] == [] - assert captured[0]["dry_run"] is True - - -def test_job_start_double_dash_shields_dry_run_flag(monkeypatch) -> None: - """With the template set, a `-n` after `--` is salloc passthrough, not - a dry-run flag.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "gpu", "--", "-n"]) == 0 - assert captured[0]["template_name"] == "gpu" - assert captured[0]["dry_run"] is False - assert captured[0]["passthrough"] == ["-n"] - - -def test_job_start_double_dash_shields_flag_with_value(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "gpu", "--", "-n", "4"]) == 0 - assert captured[0]["dry_run"] is False - assert 
captured[0]["passthrough"] == ["-n", "4"] - - -def test_job_start_double_dash_shields_timeout(monkeypatch) -> None: - """With the template unset, the first token after `--` names the - template even when it looks like a known flag.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "--", "--timeout", "30s"]) == 0 - assert captured[0]["template_name"] == "--timeout" - assert captured[0]["timeout_override"] is None - assert captured[0]["passthrough"] == ["30s"] - - -def test_job_start_second_double_dash_forwarded(monkeypatch) -> None: - """The first `--` is consumed; later `--` tokens are forwarded literally.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "gpu", "-n", "--", "--mem=1G", "--", "-c", "2"]) == 0 - assert captured[0]["template_name"] == "gpu" - assert captured[0]["dry_run"] is True - assert captured[0]["passthrough"] == ["--mem=1G", "--", "-c", "2"] - - -def test_job_start_bundled_shorts_expand(monkeypatch) -> None: - """`-nn` unbundles to `-n -n` when every letter is a known short flag.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "-nn"]) == 0 - assert captured[0]["dry_run"] is True - assert captured[0]["template_name"] is None - assert captured[0]["passthrough"] == [] - - -def test_job_start_bundle_with_unknown_letter_is_passthrough(monkeypatch) -> None: - """A short bundle with any unknown letter is 
forwarded whole to salloc.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "-nx"]) == 0 - assert captured[0]["dry_run"] is False - assert captured[0]["template_name"] is None - assert captured[0]["passthrough"] == ["-nx"] - - -def test_job_start_dry_run_rejects_explicit_value(capsys) -> None: - """`--dry-run=true` is a usage error: the flag takes no value.""" - assert invoke(["job", "start", "--dry-run=true"]) == 2 - assert "--dry-run" in capsys.readouterr().err - - -def test_job_start_mixed_passthrough_order(monkeypatch) -> None: - """Known options are consumed wherever they appear; everything else is - passthrough in its original order.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "gpu", "-n", "--mem=128G", "-c", "8"]) == 0 - assert captured[0]["template_name"] == "gpu" - assert captured[0]["passthrough"] == ["--mem=128G", "-c", "8"] - assert captured[0]["dry_run"] is True - - -def test_job_start_trailing_json_is_passthrough(monkeypatch) -> None: - """A --json after `job start` belongs to the salloc passthrough.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["job", "start", "gpu", "--json"]) == 0 - assert captured[0]["template_name"] == "gpu" - assert captured[0]["passthrough"] == ["--json"] - - -def test_root_json_before_job_start(monkeypatch) -> None: - """The root --json still applies on the `job start` path.""" - 
captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_start", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["--json", "job", "start", "-n"]) == 0 - assert captured[0]["dry_run"] is True - assert captured[0]["out"].json_mode is True - - -def test_job_stop_yes_flag(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_stop", lambda **kw: captured.append(kw) or 0) - assert invoke(["job", "stop", "12345", "-y"]) == 0 - assert captured[0]["yes"] is True - assert captured[0]["dry_run"] is False - - -def test_job_stop_force_is_alias_for_yes(monkeypatch) -> None: - """`-f`/`--force` is interchangeable with `-y`/`--yes` for skipping the prompt.""" - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_stop", lambda **kw: captured.append(kw) or 0) - assert invoke(["job", "stop", "12345", "--force"]) == 0 - assert captured[0]["yes"] is True - - -def test_job_time_no_arg(monkeypatch) -> None: - captured: list[dict] = [] - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_time", lambda **kw: captured.append(kw) or 0) - assert invoke(["job", "time"]) == 0 - assert captured[0]["jobid_arg"] is None - - -# ---- keep --------------------------------------------------------------- - - -def test_keep_dry_run(monkeypatch) -> None: - captured: list[dict] = [] - from solx import keep as keep_mod - - monkeypatch.setattr(keep_mod, "cmd_keep", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert invoke(["keep", "-n"]) == 0 - assert captured[0]["dry_run"] is True - - -def test_keep_invalid_stage(capsys) -> None: - assert invoke(["keep", "--stage", "bogus"]) == 2 - captured = capsys.readouterr() - assert "invalid --stage" in captured.err 
or "invalid --stage" in captured.out - - -@pytest.mark.parametrize("jobs", ["0", "-2"]) -def test_keep_jobs_below_one_exits_2(jobs: str, capsys) -> None: - assert invoke(["keep", "-n", "-j", jobs]) == 2 - captured = capsys.readouterr() - assert "invalid --jobs" in captured.err or "invalid --jobs" in captured.out - - -def test_keep_solkeep_flag_and_missing_config(monkeypatch, tmp_path) -> None: - """`solx keep --solkeep ...` works with no config.toml (config passed as None).""" - captured: list[dict] = [] - from solx import keep as keep_mod - - monkeypatch.setattr(keep_mod, "cmd_keep", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(cfg, "config_path", lambda: tmp_path / "absent.toml") - assert invoke(["keep", "--solkeep", "/tmp/mk", "-y"]) == 0 - assert str(captured[0]["solkeep"]) == "/tmp/mk" - assert captured[0]["config"] is None # missing config tolerated for keep - - -def test_keep_full_flag_set(monkeypatch, tmp_path) -> None: - captured: list[dict] = [] - from solx import keep as keep_mod - - monkeypatch.setattr(keep_mod, "cmd_keep", lambda **kw: captured.append(kw) or 0) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - - assert ( - invoke( - [ - "keep", - "--stage", "pending", - "--csv-dir", str(tmp_path), - "-j", "4", - "-y", - "-v", - ] - ) - == 0 - ) - kw = captured[0] - assert kw["stage"] == "pending" - assert kw["csv_dir"] == tmp_path - assert kw["jobs_n"] == 4 - assert kw["yes"] is True - assert kw["verbose"] is True - - -# ---- init --------------------------------------------------------------- - - -def test_init_default(monkeypatch) -> None: - captured: list[dict] = [] - from solx import init as init_mod - - monkeypatch.setattr(init_mod, "cmd_init", lambda **kw: captured.append(kw) or 0) - assert invoke(["init"]) == 0 - assert captured[0]["force"] is False - - -def test_init_force(monkeypatch) -> None: - captured: list[dict] = [] - from solx import init as init_mod - - monkeypatch.setattr(init_mod, 
"cmd_init", lambda **kw: captured.append(kw) or 0) - assert invoke(["init", "-f"]) == 0 - assert captured[0]["force"] is True - - -def test_init_yes_is_alias_for_force(monkeypatch) -> None: - captured: list[dict] = [] - from solx import init as init_mod - - monkeypatch.setattr(init_mod, "cmd_init", lambda **kw: captured.append(kw) or 0) - assert invoke(["init", "-y"]) == 0 - assert captured[0]["force"] is True - - -# ---- config ------------------------------------------------------------- - - -def test_config_show(monkeypatch, capsys) -> None: - config = Config( - default_shell="zsh", - default_template="default", - start_timeout_seconds=600, - templates={ - "default": JobTemplate( - name="default", partition="lightwork", time="1-0", qos="public" - ) - }, - ) - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: config) - assert invoke(["config", "show"]) == 0 - assert "lightwork" in capsys.readouterr().out - - -def test_config_show_json(monkeypatch, capsys) -> None: - monkeypatch.setattr(main_mod, "_load_or_exit", lambda *a, **kw: fake_config()) - assert invoke(["config", "show", "--json"]) == 0 - data = json.loads(capsys.readouterr().out) - assert data["default_shell"] == "zsh" - assert "default" in data["templates"] - - -def test_config_edit_no_config(monkeypatch, tmp_path, capsys) -> None: - monkeypatch.setattr(cfg, "config_path", lambda: tmp_path / "absent.toml") - assert invoke(["config", "edit"]) == 2 - assert "no config at" in capsys.readouterr().err - - -def test_config_edit_splits_editor_flags(monkeypatch, tmp_path) -> None: - """$EDITOR with flags (e.g. 
`code --wait`) is split into argv, not one binary.""" - cfgfile = tmp_path / "config.toml" - cfgfile.write_text("default_shell = 'bash'\n") - monkeypatch.setattr(cfg, "config_path", lambda: cfgfile) - monkeypatch.setenv("EDITOR", "myed --wait") - captured: dict = {} - - def fake_call(argv): - captured["argv"] = argv - return 0 - - monkeypatch.setattr(subprocess, "call", fake_call) - assert invoke(["config", "edit"]) == 0 - assert captured["argv"] == ["myed", "--wait", str(cfgfile)] - - -def test_config_import_solkeep_wiring(monkeypatch) -> None: - """`solx config import-solkeep --solkeep F` routes to init.cmd_import_solkeep.""" - captured: list[dict] = [] - from solx import init as init_mod - - monkeypatch.setattr( - init_mod, "cmd_import_solkeep", lambda **kw: captured.append(kw) or 0 - ) - assert invoke(["config", "import-solkeep", "--solkeep", "/tmp/mk", "-f"]) == 0 - assert str(captured[0]["solkeep"]) == "/tmp/mk" - assert captured[0]["force"] is True - - -# ---- completions -------------------------------------------------------- - - -def test_completions_invalid_shell(capsys) -> None: - assert invoke(["completions", "tcsh"]) == 2 - assert "unknown shell 'tcsh'" in capsys.readouterr().err - - -def test_completions_bash_emits_script(capsys) -> None: - assert invoke(["completions", "bash"]) == 0 - out = capsys.readouterr().out - assert "complete -F _solx solx" in out - - -# ---- import hygiene ------------------------------------------------------- - - -def test_dispatch_never_imports_typer(monkeypatch) -> None: - from solx import jobs as jobs_mod - - monkeypatch.setattr(jobs_mod, "cmd_list", lambda **kw: 0) - assert invoke(["job", "list"]) == 0 - assert "typer" not in sys.modules - - -def test_importing_main_is_lean() -> None: - """`import solx.main` must not pull in rich (or any CLI framework).""" - code = ( - "import solx.main, sys; " - "assert 'rich' not in sys.modules; " - "assert 'typer' not in sys.modules" - ) - subprocess.run([sys.executable, "-c", 
code], check=True) - - -def test_python_m_solx_version() -> None: - res = subprocess.run( - [sys.executable, "-m", "solx", "--version"], capture_output=True, text=True - ) - assert res.returncode == 0 - assert res.stdout.strip() == __version__ diff --git a/solx/tests/test_output.py b/solx/tests/test_output.py deleted file mode 100644 index 8552a0a..0000000 --- a/solx/tests/test_output.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import annotations - -import json -from io import StringIO - -from rich.console import Console - -from solx.output import Out, _Plain, _plain - - -def make_out(*, json_mode: bool = False, interactive: bool = True) -> Out: - so = Console(file=StringIO(), force_terminal=False, width=200) - se = Console(file=StringIO(), force_terminal=False, width=200) - return Out(json_mode=json_mode, interactive=interactive, stdout=so, stderr=se) - - -# ---- force / auto-detect ------------------------------------------------- - - -def test_force_json() -> None: - out = Out.auto(force="json", stdout=Console(file=StringIO(), force_terminal=True)) - assert out.json_mode is True - - -def test_force_plain_overrides_non_tty() -> None: - # Non-TTY stdout would auto-detect JSON; --plain forces human. 
- out = Out.auto(force="plain", stdout=Console(file=StringIO(), force_terminal=False)) - assert out.json_mode is False - - -def test_auto_non_tty_is_json() -> None: - out = Out.auto(stdout=Console(file=StringIO(), force_terminal=False), interactive=False) - assert out.json_mode is True - - -def test_auto_tty_is_human() -> None: - out = Out.auto(stdout=Console(file=StringIO(), force_terminal=True), interactive=True) - assert out.json_mode is False - - -# ---- streams ------------------------------------------------------------- - - -def test_status_goes_to_stderr_not_stdout() -> None: - out = make_out(json_mode=True) - out.status("hello") - assert out.stderr.file.getvalue().strip() == "hello" - assert out.stdout.file.getvalue() == "" - - -def test_error_goes_to_stderr() -> None: - out = make_out(json_mode=True) - out.error("boom") - assert "boom" in out.stderr.file.getvalue() - assert out.stdout.file.getvalue() == "" - - -def test_json_is_clean_parseable() -> None: - out = make_out(json_mode=True) - out.json({"jobid": "123", "state": "RUNNING"}) - payload = out.stdout.file.getvalue() - assert json.loads(payload) == {"jobid": "123", "state": "RUNNING"} - - -def test_emit_json_mode() -> None: - out = make_out(json_mode=True) - out.emit(data={"n": 1}, human=lambda: "human-text") - assert json.loads(out.stdout.file.getvalue()) == {"n": 1} - - -def test_emit_human_mode() -> None: - out = make_out(json_mode=False) - out.emit(data={"n": 1}, human=lambda: "human-text") - assert "human-text" in out.stdout.file.getvalue() - assert out.stdout.file.getvalue().strip() != '{"n": 1}' - - -# ---- no-rich (agent) path ------------------------------------------------ - - -def test_plain_strips_rich_markup() -> None: - assert _plain(r"[red]error:[/] bad \[keep] thing") == "error: bad [keep] thing" - # interpolated punctuation (a TOML error) is not mistaken for markup - assert _plain("oops (at line 11, column 21)") == "oops (at line 11, column 21)" - - -def 
test_plain_writer_writes_plain_text() -> None: - buf = StringIO() - _Plain(buf).print("[yellow]warning:[/] x") - assert buf.getvalue().strip() == "warning: x" - - -def test_auto_json_uses_plain_writer() -> None: - """The JSON path builds a _Plain writer, so nothing imports rich.Console.""" - out = Out.auto(force="json") - assert isinstance(out.stdout, _Plain) - assert isinstance(out.stderr, _Plain) - assert out.json_mode is True diff --git a/solx/tests/test_side.py b/solx/tests/test_side.py deleted file mode 100644 index ea21c36..0000000 --- a/solx/tests/test_side.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -import pytest - -from solx import side - - -@pytest.mark.parametrize( - "hostname_output, expected", - [ - ("login02.sol.rc.asu.edu", "sol"), - ("sg045.sol.rc.asu.edu", "sol"), - ("sg045 sg045.sol.rc.asu.edu sg045-ib", "sol"), - ("my-laptop.local", "not-sol"), - ("", "not-sol"), - ("login02.example.com", "not-sol"), - ], -) -def test_detect_branches(hostname_output: str, expected: str) -> None: - assert side.detect(_runner=lambda: hostname_output) == expected - - -def test_detect_uses_runner_injection() -> None: - """The runner is what determines the result, not the live host.""" - sentinel = "fake-host.sol.rc.asu.edu" - assert side.detect(_runner=lambda: sentinel) == "sol" - - -def test_current_node_returns_short_name() -> None: - """Best-effort, non-crashing on any host.""" - node = side.current_node() - assert isinstance(node, str) - assert "." 
not in node # short form, no FQDN - - -def test_require_sol_off_sol_exits_2(monkeypatch, capsys) -> None: - monkeypatch.setattr(side, "detect", lambda: "not-sol") - with pytest.raises(SystemExit) as excinfo: - side.require_sol() - assert excinfo.value.code == 2 - assert "Sol-only" in capsys.readouterr().err - - -def test_require_sol_on_sol_passes(monkeypatch) -> None: - monkeypatch.setattr(side, "detect", lambda: "sol") - assert side.require_sol() is None diff --git a/solx/tests/test_slurm.py b/solx/tests/test_slurm.py deleted file mode 100644 index aa21734..0000000 --- a/solx/tests/test_slurm.py +++ /dev/null @@ -1,242 +0,0 @@ -from __future__ import annotations - -import pytest - -from solx.config import JobTemplate -from solx import slurm -from solx.slurm import Job, SlurmError - - -# ---- runner helper ------------------------------------------------------- - - -def make_runner(*, code: int = 0, stdout: str = "", stderr: str = ""): - captured: dict = {} - - def runner(argv): - captured["argv"] = argv - return code, stdout, stderr - - return runner, captured - - -# ---- squeue -------------------------------------------------------------- - - -def test_squeue_user_jobs_parses_rows() -> None: - out = ( - "12345|solx-default|RUNNING|00:05:23|00:54:37|lightwork|sg045\n" - "12346|notebook|PENDING|00:00:00|01:00:00|htc|(Resources)\n" - ) - runner, cap = make_runner(stdout=out) - jobs = slurm.squeue_user_jobs(user="sparky", runner=runner) - assert len(jobs) == 2 - assert jobs[0] == Job( - job_id="12345", - name="solx-default", - state="RUNNING", - time_used="00:05:23", - time_left="00:54:37", - partition="lightwork", - node_list="sg045", - ) - assert "-u" in cap["argv"] and "sparky" in cap["argv"] - - -def test_squeue_user_jobs_empty() -> None: - runner, _ = make_runner(stdout="") - assert slurm.squeue_user_jobs(user="sparky", runner=runner) == [] - - -def test_squeue_user_jobs_failure() -> None: - runner, _ = make_runner(code=1, stderr="slurmctld is down") - with 
pytest.raises(SlurmError, match="slurmctld is down"): - slurm.squeue_user_jobs(user="sparky", runner=runner) - - -# ---- resolve_jobid ------------------------------------------------------- - - -TWO_RUNNING = ( - "12345|solx-default|RUNNING|00:01:00|00:59:00|lightwork|sg045\n" - "67890|notebook|RUNNING|00:01:00|00:59:00|htc|sg010\n" -) - - -def test_resolve_jobid_arg_wins() -> None: - runner, cap = make_runner() - res = slurm.resolve_jobid( - "99999", verb=slurm.VERB_STOP, env={"SLURM_JOB_ID": "11111"}, runner=runner - ) - assert res.job_id == "99999" - assert res.source == "arg" - assert res.inside is True and res.inside_job_id == "11111" - assert "argv" not in cap # never queried squeue - - -def test_resolve_jobid_uses_env_on_compute_node() -> None: - runner, cap = make_runner() - res = slurm.resolve_jobid(None, verb=slurm.VERB_TIME, env={"SLURM_JOB_ID": "55555"}, runner=runner) - assert res.job_id == "55555" - assert res.source == "inside" - assert res.acting_on_current is True - assert "argv" not in cap - - -def test_resolve_jobid_single_running_job() -> None: - out = "12345|solx-default|RUNNING|00:01:00|00:59:00|lightwork|sg045\n" - runner, _ = make_runner(stdout=out) - res = slurm.resolve_jobid(None, verb=slurm.VERB_STOP, env={}, user="sparky", runner=runner) - assert res.job_id == "12345" - assert res.source == "single" - assert res.ambiguous is False - - -def test_resolve_jobid_zero_jobs() -> None: - runner, _ = make_runner(stdout="") - res = slurm.resolve_jobid(None, verb=slurm.VERB_TIME, env={}, user="sparky", runner=runner) - assert res.job_id is None - assert res.error and "no jobs found" in res.error - - -def test_resolve_jobid_stop_ambiguous_no_autopick() -> None: - runner, _ = make_runner(stdout=TWO_RUNNING) - res = slurm.resolve_jobid(None, verb=slurm.VERB_STOP, env={}, user="sparky", runner=runner) - assert res.job_id is None - assert res.ambiguous is True - assert {j.job_id for j in res.candidates} == {"12345", "67890"} - - -def 
test_resolve_jobid_time_picks_most_recent() -> None: - runner, _ = make_runner(stdout=TWO_RUNNING) - res = slurm.resolve_jobid(None, verb=slurm.VERB_TIME, env={}, user="sparky", runner=runner) - assert res.job_id == "67890" # highest jobid == most recent - assert res.source == "most-recent" - assert res.ambiguous is False - - -def test_resolve_jobid_jump_filters_running_only() -> None: - out = ( - "12345|a|RUNNING|00:01|00:59|p|sg045\n" - "67890|b|PENDING|00:00|01:00|p|(Resources)\n" - ) - runner, _ = make_runner(stdout=out) - res = slurm.resolve_jobid(None, verb=slurm.VERB_JUMP, env={}, user="sparky", runner=runner) - # only the RUNNING job is an attach candidate -> unambiguous - assert res.job_id == "12345" - assert res.source == "single" - - -def test_resolve_jobid_jump_no_running() -> None: - out = "67890|b|PENDING|00:00|01:00|p|(Resources)\n" - runner, _ = make_runner(stdout=out) - res = slurm.resolve_jobid(None, verb=slurm.VERB_JUMP, env={}, user="sparky", runner=runner) - assert res.job_id is None - assert res.error and "no running job" in res.error - - -def test_most_recent_highest_jobid() -> None: - jobs = [ - Job("100", "a", "RUNNING", "", "", "p"), - Job("9999", "b", "RUNNING", "", "", "p"), - Job("250", "c", "RUNNING", "", "", "p"), - ] - assert slurm.most_recent(jobs).job_id == "9999" - - -def test_most_recent_array_ids() -> None: - jobs = [Job("100_1", "a", "R", "", "", "p"), Job("100_7", "b", "R", "", "", "p")] - assert slurm.most_recent(jobs).job_id == "100_7" - - -# ---- argv builders ------------------------------------------------------- - - -def test_salloc_argv_minimal() -> None: - t = JobTemplate(name="default", partition="lightwork", time="1-0") - argv = slurm.salloc_argv(t) - assert argv == [ - "salloc", - "--no-shell", - "-J", - "solx-default", - "-p", - "lightwork", - "-t", - "1-0", - ] - - -def test_salloc_argv_full() -> None: - t = JobTemplate( - name="gpu", - partition="public", - time="0-4", - qos="public", - gres="gpu:a100:1", - 
extra_args=("--mem=64G", "--cpus-per-task=8"), - ) - argv = slurm.salloc_argv(t, passthrough=["--mail-type=END"]) - assert argv == [ - "salloc", "--no-shell", "-J", "solx-gpu", - "-p", "public", - "-t", "0-4", - "-q", "public", - "--gres=gpu:a100:1", - "--mem=64G", "--cpus-per-task=8", - "--mail-type=END", - ] - - -def test_scancel_argv() -> None: - assert slurm.scancel_argv("12345") == ["scancel", "12345"] - - -def test_srun_pty_argv() -> None: - # --overlap lets the step share the allocation's busy resources. - assert slurm.srun_pty_argv("12345", "zsh") == [ - "srun", - "--jobid=12345", - "--overlap", - "--pty", - "zsh", - ] - - -def test_squeue_time_left_argv() -> None: - argv = slurm.squeue_time_left_argv("12345") - assert argv == ["squeue", "-h", "-j", "12345", "-O", "TimeLeft"] - - -# ---- salloc parse + run -------------------------------------------------- - - -def test_parse_granted_jobid() -> None: - text = ( - "salloc: Pending job allocation 51642835\n" - "salloc: job 51642835 queued and waiting for resources\n" - "salloc: job 51642835 has been allocated resources\n" - "salloc: Granted job allocation 51642835\n" - ) - assert slurm.parse_granted_jobid(text) == "51642835" - - -def test_parse_granted_jobid_missing() -> None: - with pytest.raises(SlurmError, match="could not parse"): - slurm.parse_granted_jobid("salloc: error: queue down\n") - - -def test_run_salloc_success_via_runner() -> None: - runner, cap = make_runner( - stderr="salloc: Granted job allocation 99999\n", - ) - argv = ["salloc", "--no-shell"] - jid = slurm.run_salloc(argv, timeout_seconds=60, runner=runner) - assert jid == "99999" - assert cap["argv"] == argv - - -def test_run_salloc_failure_via_runner() -> None: - runner, _ = make_runner(code=1, stderr="salloc: error: invalid partition\n") - with pytest.raises(SlurmError, match="invalid partition"): - slurm.run_salloc(["salloc"], timeout_seconds=60, runner=runner) diff --git a/solx/uv.lock b/solx/uv.lock deleted file mode 100644 index 
a9c1311..0000000 --- a/solx/uv.lock +++ /dev/null @@ -1,238 +0,0 @@ -version = 1 -revision = 3 -requires-python = ">=3.10" - -[[package]] -name = "colorama" -version = "0.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, -] - -[[package]] -name = "exceptiongroup" -version = "1.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, -] - -[[package]] -name = "iniconfig" -version = "2.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, 
upload-time = "2025-10-18T21:55:43.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, -] - -[[package]] -name = "markdown-it-py" -version = "4.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, -] - -[[package]] -name = "mdurl" -version = "0.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, -] - -[[package]] -name = "packaging" -version = "26.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" }, -] - -[[package]] -name = "pathspec" -version = "1.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = "2026-04-27T01:46:07.06Z" }, -] - -[[package]] -name = "pluggy" -version = "1.6.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, -] - -[[package]] -name = 
"pygments" -version = "2.20.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, -] - -[[package]] -name = "pytest" -version = "9.0.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "pygments" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, -] - -[[package]] -name = "rich" -version = "15.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" }, -] - -[[package]] -name = "ruff" -version = "0.15.16" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a6/bd/5f7ec371001337d8fa61701c186ff8b613ecac1651848c5950f4c4d5f2e9/ruff-0.15.16.tar.gz", hash = "sha256:d05e78d38c78caf020b03789e25106c93017db5a0cb6e2819885018c61343b78", size = 4714267, upload-time = "2026-06-04T16:33:09.974Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/42/53ef1c3953f157956db9bf7861e3bc50b9b887ce93300aa48cdba8336fe6/ruff-0.15.16-py3-none-linux_armv6l.whl", hash = "sha256:6ac3c0b3969cc6cf6b158c4e2f8f682acb58e7d700d8a44b65ecdc72d66ab0b2", size = 10709025, upload-time = "2026-06-04T16:32:51.935Z" }, - { url = "https://files.pythonhosted.org/packages/93/9a/a79159346f19134a956607754e57d8d128f7a4c00f4ad2f7514d224c172c/ruff-0.15.16-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:197c207ed75ffba54a0dec23db4aa939a27a3053073e085e0042433cbdc58e4a", size = 11063550, upload-time = "2026-06-04T16:32:42.24Z" }, - { url = "https://files.pythonhosted.org/packages/bc/72/3ce2ac000a5299ec238e01f51397b3b653c93b077d9b1bfe8715bb895f20/ruff-0.15.16-py3-none-macosx_11_0_arm64.whl", hash = "sha256:3a39fec45ab316cc23e7558f23fea4a70403ddb5648ea9a4a3854a16973d0071", size = 10421345, upload-time = "2026-06-04T16:32:37.251Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/c2/cc7fad3ec9169373f5b6a18f1917b91080feec40c3f9658334a1d28e2f03/ruff-0.15.16-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba93191d79003116b95128c9d306e045200fdbd0bccb782b110f3cd1d4abc5cf", size = 10757217, upload-time = "2026-06-04T16:32:54.722Z" }, - { url = "https://files.pythonhosted.org/packages/69/d2/3474009eaa0a65b31fa7152a2fad5e2f050c640ceb1e6b02ee6922e94c82/ruff-0.15.16-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c6ee4b90520630120ef032aa5cc10db483852dff950e78b1d717e2993a61ac8d", size = 10507035, upload-time = "2026-06-04T16:33:05.343Z" }, - { url = "https://files.pythonhosted.org/packages/ca/81/b7ae6ccbd11f0c8dc3d5d67fc4be9b57ff57ca86ba56152021378e1277f2/ruff-0.15.16-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4e4215bc938bc3c8215c1472c1aa437e310fee20cd427335fec9d7e609563628", size = 11255291, upload-time = "2026-06-04T16:32:49.49Z" }, - { url = "https://files.pythonhosted.org/packages/d9/e1/46e526f1a7cc90857ce6ddf25fbb77eb6568651ac38d71b033af07076dd5/ruff-0.15.16-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7c8d26be963b090f10e29abc8b3e74a2a321f6fa34e02424e30b5af89350ecbb", size = 12124922, upload-time = "2026-06-04T16:33:07.821Z" }, - { url = "https://files.pythonhosted.org/packages/1a/da/5c791b088b596b24d0deb967fa28ae02ad751a140c0b9ea81c5ab915d6c0/ruff-0.15.16-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f198cf4123602a2280ed46c307bcbafe41758d6fee5b456b6b6058ca1514b3b4", size = 11332186, upload-time = "2026-06-04T16:33:02.971Z" }, - { url = "https://files.pythonhosted.org/packages/72/11/5da87abe20047c8962361473923ebb2f62b595250126aadfad8c20649c1e/ruff-0.15.16-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb27515fa6240fb586ae82b901a59e67d24acff86f2190b433dc542fe0435aeb", size = 11373541, upload-time = "2026-06-04T16:32:47.007Z" }, - { url = 
"https://files.pythonhosted.org/packages/fe/2a/8554754c23a854ae3fd6b507e36ad61ddb121e298c6d5d617dec94ed0f14/ruff-0.15.16-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a267c46ba1593fc26b8eecbea050b39d40c0b6bb7781ee11c90a02cd10032951", size = 11353014, upload-time = "2026-06-04T16:32:34.795Z" }, - { url = "https://files.pythonhosted.org/packages/62/25/62ea41529ec89f742ea3fed9cb1059c72877ec7cf9b9e99ac9cf3294d1d9/ruff-0.15.16-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:528c68f39a91498a8d50e91ff5985df3d105782bab49cc378e73ac26bff083e8", size = 10737467, upload-time = "2026-06-04T16:32:26.348Z" }, - { url = "https://files.pythonhosted.org/packages/90/17/334d3ad9de4d40f9dd58fdd09e35ce64553bb501e2f19a839e2fb6be14fc/ruff-0.15.16-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7ed55c58950df60589a9a7a5d2f8fa5f54ebd287163be805adfe6ee95a9de123", size = 10521910, upload-time = "2026-06-04T16:32:32.54Z" }, - { url = "https://files.pythonhosted.org/packages/4d/bd/3ac7c6ae77a885c1004b3dda2446ea401768d24f851c14b4ad4b24f6639c/ruff-0.15.16-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d482feaf51512b50f9790ceb417a56a61dd1e9d9bf967662b9ed27c01b34f53a", size = 10979190, upload-time = "2026-06-04T16:32:57.492Z" }, - { url = "https://files.pythonhosted.org/packages/33/d7/609546e6a413c3f216fbf2a50c928f97c80939154f6a0503114094a86191/ruff-0.15.16-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1e15bc8c94513dae2a40cc9ef07c94fdd4ecc9e29dabebeebe170f952322c9e3", size = 11477014, upload-time = "2026-06-04T16:32:44.687Z" }, - { url = "https://files.pythonhosted.org/packages/74/0d/f2cd247ad32633a5c36e97141a2c21b11c6279f7957bc2ff360b1e08fddd/ruff-0.15.16-py3-none-win32.whl", hash = "sha256:580378f7bd4aa25f72e74aa54948a9622f142b1e509521dd10902e886681cc1e", size = 10735541, upload-time = "2026-06-04T16:32:30.145Z" }, - { url = "https://files.pythonhosted.org/packages/8b/9e/02e845ef151b1dee585e55c4739f8e1734ae1d9f1221dff65761c162208b/ruff-0.15.16-py3-none-win_amd64.whl", hash = 
"sha256:408256017284eddf98fff77b29aa4fb30f586042d535b2d9befc6512f400aaec", size = 11843403, upload-time = "2026-06-04T16:32:39.76Z" }, - { url = "https://files.pythonhosted.org/packages/15/19/016553f86f207450aebebc2b2b5088d086b901cc8186c02ac4284db3bd88/ruff-0.15.16-py3-none-win_arm64.whl", hash = "sha256:8cd61783afb39638a7133ef0d2dfb1e91277593962f81b5a8423eb0b888a6121", size = 11134555, upload-time = "2026-06-04T16:33:00.136Z" }, -] - -[[package]] -name = "solx" -version = "0.5.1" -source = { editable = "." } -dependencies = [ - { name = "pathspec" }, - { name = "rich" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] - -[package.dev-dependencies] -dev = [ - { name = "pytest" }, - { name = "ruff" }, -] - -[package.metadata] -requires-dist = [ - { name = "pathspec", specifier = ">=0.12" }, - { name = "rich", specifier = ">=13" }, - { name = "tomli", marker = "python_full_version < '3.11'", specifier = ">=2.0" }, -] - -[package.metadata.requires-dev] -dev = [ - { name = "pytest", specifier = ">=8" }, - { name = "ruff", specifier = ">=0.13" }, -] - -[[package]] -name = "tomli" -version = "2.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" }, - { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" }, - { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" }, - { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" }, - { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" }, - { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" }, - { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" }, - { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", 
size = 108084, upload-time = "2026-03-25T20:21:18.978Z" }, - { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" }, - { url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" }, - { url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" }, - { url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" }, - { url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" }, - { url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" }, - { 
url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" }, - { url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" }, - { url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" }, - { url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" }, - { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" }, - { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" }, - { url = 
"https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" }, - { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" }, - { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" }, - { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" }, - { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" }, - { url = 
"https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" }, - { url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" }, - { url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" }, - { url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" }, - { url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" }, - { url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" }, - { url = 
"https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" }, - { url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" }, - { url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" }, - { url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" }, - { url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" }, - { url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" }, - { url = 
"https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" }, - { url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" }, - { url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" }, - { url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" }, - { url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" }, - { url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" }, - { url = 
"https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" }, - { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" }, -] - -[[package]] -name = "typing-extensions" -version = "4.15.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, -]