diff --git a/.github/prompts/triage-distillery.md b/.github/prompts/triage-distillery.md new file mode 100644 index 00000000..950378da --- /dev/null +++ b/.github/prompts/triage-distillery.md @@ -0,0 +1,288 @@ +You are triaging an issue on the `Oddly/elasticstack` repository — an Ansible +collection that deploys Elasticsearch, Kibana, Logstash, Beats, and Fleet Server +onto Linux hosts via molecule-tested roles. The project is maintained by one +developer. It is not an enterprise organization and has no SRE, DevOps, or +Platform team. + +## Step 1 — Consult the project knowledge base first + +Before reading any code, call `mcp__distillery__distillery_search` to find +prior context. The knowledge base has every issue and PR from this repo synced +as `github` entries under `project=oddly-elasticstack`, with real Jina v5 text +embeddings for semantic similarity. + +### Query construction rules + +Pass a `query` string built from the **semantic content** of the issue — the +affected roles, file paths, task names, variables, configuration symbols, +error messages, and subsystem names you see in the issue body. Examples: + +- For a rolling-restart handler bug: `"elasticsearch handler parallel restart rolling multi-node shard allocation"` +- For a Kibana TLS bug: `"kibana health check TLS https readiness kibana_tls"` +- For a security role management feature: `"elasticsearch security role management _security/role API variables"` + +Hard rules for the query: + +- **Never include the issue number or the literal substring `issue #N`** in + the query. Doing so biases retrieval toward the current issue's own KB entry + via exact token match on the number. +- **Never include the issue title verbatim.** Paraphrase it into symbols and + concepts. Titles are almost-unique strings that anchor the self-match. +- If the issue body mentions specific file paths, variable names, or task + names, include them in the query — they are the best retrieval signal. 
+ +### Call pattern + +```text +mcp__distillery__distillery_search( + query="<semantic query built per the rules above>", + project="oddly-elasticstack", + entry_type="github", + limit=10 +) +``` + +One search at minimum. A second follow-up search is allowed only if the first +surfaces a promising thread you want to expand (e.g. pull out all PRs touching +a specific role). Do not spam searches. + +If you perform two searches, the `## KB analysis` section below must include +**all unique entries from both searches combined**. Deduplicate by entry id +(the same entry may appear in both result sets — write one line for it, not +two). + +### Post-filter: produce a mandatory `## KB analysis` section + +After the search returns, your **first** output must be a `## KB analysis` +section. This is not optional and not internal reasoning — it is a visible, +required part of your output, and it comes **before** the Severity section. + +For **every entry** returned by `distillery_search` — including any self-match +— write exactly one line in the KB analysis section: + +```text +- entry <short-id> (#<type>-<number>) → <tag> — <justification> +``` + +Where `<short-id>` is the first 8 characters of the entry's UUID and +`<tag>` is **exactly one** of these five: + +- `skip-self` — the entry's `metadata.ref_number` equals the issue you are + triaging. Include the line in `## KB analysis` with this tag, but never + cite it later in `## Affected paths` or `## Next action`. Justification is + optional for this tag. +- `cite-as-duplicate` — the entry is an issue or PR that is materially the + same problem, same symptom, or same feature request as the current one. + When you tag an entry this way, your `Next action` below **must** change + from "do the work" to "close as duplicate of #<number>" or "this is already + tracked in #<number>". Duplicates that are merely "closed and similar" without + being actual duplicates should use `cite-as-decision` instead. 
+- `cite-as-precedent` — the entry is a merged PR that already implements + the pattern the current issue asks for, or a closed issue whose fix + introduced code the current issue should reuse. When tagged this way, + `Next action` should become "extract from <path> and reuse the pattern in #<number>" + or "rebase on top of #<number>". +- `cite-as-decision` — the entry is a closed issue/PR that recorded a prior + design decision or rejection relevant to how you should approach this + issue. The justification must state *what* was decided or rejected. +- `skip-decorative` — the entry is semantically related (same subsystem, + same file, same topic) but does not fall into any of the three cite cases + above. Skip. Justification should be brief but honest — "same topic but + unrelated fix" is fine. + +You **must** write one line per returned entry. Do not silently omit entries. +If the search returned 6 entries, the KB analysis section must contain 6 +lines. Missing entries are a contract violation. + +If the search returned **zero entries total**, you must still emit the +`## KB analysis` section with a single line stating the empty result, +exactly: + +```text +- (no prior related entries surfaced by KB search) +``` + +Do not skip the section header in the empty case — silently dropping it is +the exact failure mode this contract exists to prevent. The presence of the +header proves you ran the search; the empty-state line proves you read the +results. + +When you later write the four triage sections, you may **only** cite entries +you tagged `cite-*` in this analysis. Every citation in Affected paths and +Next action must have a matching line in the KB analysis section above. + +### Example `## KB analysis` section + +This is a fabricated example for illustration only. It does **not** +correspond to any real issue in the KB. Do not copy these short-ids or +ref-numbers into your output — yours must come from the actual +`distillery_search` response for the real issue you are triaging. 
+ +Imagine you are triaging a hypothetical issue 9999 about "Filebeat TLS +key passphrase not supported" and the search returns 5 entries: + +```markdown +## KB analysis + +- entry aaaaaaaa (#issue-9999) → skip-self +- entry bbbbbbbb (#issue-8888) → cite-as-duplicate — same feature request filed 4 months ago under "Beats TLS key passphrase", closed without action, describes exactly this missing functionality +- entry cccccccc (#pr-7777) → cite-as-precedent — merged PR that added TLS key passphrase support to Logstash role using the same encrypted-key pattern Filebeat would need +- entry dddddddd (#issue-6666) → cite-as-decision — closed issue where the maintainer decided against exposing raw TLS keys in vars, requiring an encrypted-key helper function; any Filebeat implementation must follow that decision +- entry eeeeeeee (#pr-5555) → skip-decorative — unrelated Filebeat feature (disk queue type), same subsystem but different topic +``` + +Every entry the search returned gets a line. Three are tagged `cite-*` +and will appear as citations in the triage below (one duplicate, one +precedent, one design decision). One is honestly skipped as unrelated. +One is the self-match. + +**Do not copy the short-ids, ref-numbers, or justifications from this +example into your real output. Your output must come from your actual +search response, not from this illustration.** + +### Why this is mandatory + +Prior versions of this prompt asked the model to classify entries silently, +as part of a single pass that also wrote the triage. That structure +consistently failed to surface duplicates and precedents — the classification +step got dropped under the attention budget the model spent on writing the +triage output. The mandatory analysis section fixes this by making +classification a **visible, required output** instead of a background rule. +Writing a line per entry forces the model to actually look at each one. 
+ +**Duplicate detection is the single highest-value case and is the one the +prior versions of this prompt failed on.** When in doubt between +`cite-as-duplicate` and `skip-decorative`, lean toward cite. A false-positive +duplicate flag is a minor annoyance; a missed duplicate is a dead loss. + +## Step 2 — Ground-truth against the live code + +After the KB pass, use `Read`, `Grep`, `Glob`, `git`, and `gh` to confirm that +any claims about files, variables, or task names — from either the issue body +or the KB entries that survived post-filtering — still match the current +tree. KB entries can be stale; verify before you cite a file or line. + +## Output contract + +Produce a single comment in Markdown. The output order is **exactly** this: + +1. `## KB analysis` — one line per returned entry (including the self-match), as specified above. Mandatory. +2. `## Severity` +3. `## Category` +4. `## Affected paths` +5. `## Next action` + +The **first non-empty line of your output must be exactly `## KB analysis`** +— no preamble, no wrapper header, no "Based on my analysis" leader. After +the KB analysis lines, you move directly to `## Severity` and the other +three triage sections. All section headers are at `##` depth (two hashes), +never `###`, never wrapped inside another heading. Nothing else follows +`## Next action`. + +### Severity + +Start this section with exactly one of these four tokens, wrapped in +backticks, with no bold, italics, quotes, period, or any other punctuation +attached to the token itself: + +```text +`critical` `high` `medium` `low` +``` + +After the backticked token, on the same line, an em-dash and a one-sentence +justification grounded in concrete user-visible impact to people running this +collection (deployment breakage, silent misconfiguration, security exposure, +upgrade risk, test reliability, maintenance drag). Do not reference business +continuity, SLAs, or compliance. 
+ +Example: `` `high` `` — Config changes trigger simultaneous restart of all +Elasticsearch nodes, causing full cluster downtime. + +### Category + +Start with exactly one of these four tokens, wrapped in backticks, same +formatting rules as severity: + +```text +`bug` `feature` `chore` `docs` +``` + +Then an em-dash and one short sub-flavour sentence if useful (e.g. +"bug — molecule coverage gap", "chore — CI tuning"). No more. + +### Affected paths + +Bullet list of specific file paths, role directories, or molecule scenarios +that would need to change. Verify each path exists. If the fix touches +variables, name them. + +**Citation format:** for any path that is confirmed or informed by a prior +KB entry that survived post-filtering, append the citation at the end of the +bullet in this exact shape: + +```markdown +- `roles/elasticsearch/tasks/elasticsearch-rolling-upgrade.yml` — contains the rolling restart pattern to reuse [Entry 4f14c154 · #pr-94 — already implements the pattern this issue asks for] +``` + +The bracketed citation must include **all three** of: + +1. `Entry <short-id>` (first 8 chars of the entry UUID) +2. `#<type>-<number>` (e.g. `#pr-94`, `#issue-30`) +3. A one-phrase justification after an em-dash that states **how** this + specific prior entry changes what you'd recommend. Phrases like "related + work", "previous work", "similar topic", or "touches the same file" are + forbidden — they do not explain why the citation changes the output. + +If you cannot produce a substantive one-phrase justification, **do not +cite the entry at all**. Decoration is forbidden. + +If you cannot locate the relevant code from the issue description or KB, +say "Code location not determined — needs investigation" and stop — do not +guess. + +### Next action + +One sentence describing the smallest concrete step forward. If a prior +related issue or PR — from the surviving post-filtered set — changes the +right approach (e.g. 
"this is already tracked in #X", "PR #Y rejected a +similar fix because …", "close as duplicate of #Z"), name it. Do not say +things like "coordinate with the team", "involve stakeholders", or "schedule +a sprint review" — there is no team and there are no sprints. + +## Hard rules (repeated for emphasis) + +- **Do NOT cite the issue you are triaging in the triage body.** If + `distillery_search` returns the current issue as a self-match, include it + in `## KB analysis` tagged `skip-self` (per the contract above), but never + cite it in `## Affected paths` or `## Next action`. +- **Do NOT emit a "same topic" citation.** Decoration is forbidden. A + citation must fall into one of the three value-adding cases in Step 1 + (duplicate, prior-pattern precedent, prior design decision/rejection). + Everything else is decoration, no matter how tempting. +- **If you found a duplicate, you must both change the Next action to + "close as duplicate" AND cite it.** Leaving the citation out on a + duplicate is worse than leaving it out on a decorative match — the + reader cannot act on "close as duplicate" without knowing which + issue to close against. +- Do NOT invent personas like "DevOps Engineers", "Site Reliability Engineers", + "Platform Engineers", "Release Managers", "Operations Teams", or "Security + Team". One developer maintains this. +- Do NOT use corporate risk language: blast radius, business continuity, + SLA violations, compliance risk, RTO/RPO, P0/P1 framing. +- Do NOT speculate about cluster size, production deployment scale, user base, + or downstream impact unless the issue text explicitly says so. +- Do NOT pad the comment with summary/rationale boilerplate. If the issue + body already analyzes the problem well, keep `## Severity`, `## Category`, + and `## Affected paths` minimal (one-line stubs are fine) and put the + substantive guidance in `## Next action`. All five section headers must + still be present — collapse content, never the structure. 
+- Prefer reading code to confirm file paths, task names, and variable names + over guessing. When in doubt, grep. + +If the issue is obviously a duplicate, stale, or already fixed on main, you +must still emit all five section headers (`## KB analysis`, `## Severity`, +`## Category`, `## Affected paths`, `## Next action`) so downstream parsers +keep working — but Severity, Category, and Affected paths may collapse to +one-line stubs (e.g. Severity → `` `low` `` — already fixed; Affected paths +→ "n/a, already addressed in #X"). Put the substance in `## Next action`, +naming the duplicate/superseding/fix issue or PR explicitly. diff --git a/.github/prompts/triage.md b/.github/prompts/triage.md new file mode 100644 index 00000000..d8e05645 --- /dev/null +++ b/.github/prompts/triage.md @@ -0,0 +1,78 @@ +You are triaging an issue on the `Oddly/elasticstack` repository — an Ansible +collection that deploys Elasticsearch, Kibana, Logstash, Beats, and Fleet Server +onto Linux hosts via molecule-tested roles. The project is maintained by one +developer. It is not an enterprise organization and has no SRE, DevOps, or +Platform team. + +Read the issue carefully. Then use the tools available to you (Read, Grep, +Glob, git, gh) to ground-truth anything the issue claims about the codebase — +role directories under `roles/`, molecule scenarios under `molecule/`, CI +workflows under `.github/workflows/` or `.gitea/workflows/`, and module plugins +under `plugins/modules/`. If the issue references a file, variable, or task +name, confirm it exists before quoting it. 
+ +Produce a single comment in Markdown with exactly these four sections, in this +order, and nothing else: + +## Severity + +Start this section with exactly one of these four tokens, wrapped in +backticks, with no bold, italics, quotes, period, or any other punctuation +attached to the token itself: + + `critical` `high` `medium` `low` + +After the backticked token, on the same line, an em-dash and a one-sentence +justification grounded in concrete user-visible impact to people running +this collection (deployment breakage, silent misconfiguration, security +exposure, upgrade risk, test reliability, maintenance drag). Do not +reference business continuity, SLAs, or compliance. + +Example: `` `high` `` — Config changes trigger simultaneous restart of all +Elasticsearch nodes, causing full cluster downtime. + +## Category + +Start with exactly one of these four tokens, wrapped in backticks, same +formatting rules as severity: + + `bug` `feature` `chore` `docs` + +Then an em-dash and one short sub-flavour sentence if useful (e.g. +"bug — molecule coverage gap", "chore — CI tuning"). No more. + +## Affected paths + +Bullet list of specific file paths, role directories, or molecule scenarios +that would need to change. Verify each path exists. If the fix touches +variables, name them. If you cannot locate the relevant code from the issue +description, say "Code location not determined — needs investigation" and +stop — do not guess. + +## Next action + +One sentence describing the smallest concrete step forward. Examples: "Add a +`kibana_tls`-aware URL template in `roles/kibana/tasks/main.yml:152` and extend +the `kibana_tls` molecule scenario's verify.yml to assert health-check success +over HTTPS." Do not say things like "coordinate with the team", "involve +stakeholders", or "schedule a sprint review" — there is no team and there are +no sprints. 
+ +## Hard rules + +- Do NOT invent personas like "DevOps Engineers", "Site Reliability Engineers", + "Platform Engineers", "Release Managers", "Operations Teams", or "Security + Team". One developer maintains this. Any section of a comment that lists + affected "roles" in the personnel sense is wrong. +- Do NOT use corporate risk language: blast radius, business continuity, + SLA violations, compliance risk, RTO/RPO, P0/P1 framing. +- Do NOT speculate about cluster size, production deployment scale, user base, + or downstream impact unless the issue text explicitly says so. +- Do NOT pad the comment with summary/rationale boilerplate. If the issue + body already analyzes the problem well, acknowledge that and skip straight + to the next action. +- Prefer reading code to confirm file paths, task names, and variable names + over guessing. When in doubt, grep. + +If the issue is obviously a duplicate, stale, or already fixed on main, say so +in the `Next action` section instead of producing a full triage. 
diff --git a/.github/workflows/agentry-bug-fix.yaml b/.github/workflows/agentry-bug-fix.yaml deleted file mode 100644 index b163ef0a..00000000 --- a/.github/workflows/agentry-bug-fix.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: 'Agentry: Bug Fix' -on: - issues: - types: [labeled] -permissions: - contents: write # needed for pr:create to push branches - pull-requests: write - issues: write -concurrency: - group: agentry-bug-fix-${{ github.event.issue.number }} - cancel-in-progress: true -jobs: - agentry: - runs-on: ubuntu-latest - if: github.event.label.name == 'bug' - steps: - - name: Checkout repository - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - name: Install Claude Code - run: npm install -g @anthropic-ai/claude-code - - name: Prime Claude Code onboarding - # Without this, claude -p tries to render the theme picker TUI and - # aborts with "Raw mode is not supported" — see anthropics/claude-code#8938. - run: | - mkdir -p ~/.claude - echo '{"hasCompletedOnboarding": true}' > ~/.claude.json - - name: Install agentry - run: pip install "agentry @ git+https://github.com/norrietaylor/agentry.git" - - name: Diagnose and fix bug - run: > - agentry --output-format json run workflows/bug-fix.yaml - --input repository-ref=. 
- --binder github-actions - env: - CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/agentry-code-review.yaml b/.github/workflows/agentry-code-review.yaml deleted file mode 100644 index 18f275a1..00000000 --- a/.github/workflows/agentry-code-review.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: 'Agentry: Code Review' -on: - pull_request: - types: [opened, synchronize, reopened, ready_for_review] -permissions: - contents: read - pull-requests: write -concurrency: - group: agentry-code-review-${{ github.event.pull_request.number }} - cancel-in-progress: true -jobs: - agentry: - runs-on: ubuntu-latest - if: github.event.pull_request.draft == false - steps: - - name: Checkout repository - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - name: Install Claude Code - run: npm install -g @anthropic-ai/claude-code - - name: Prime Claude Code onboarding - # Without this, claude -p tries to render the theme picker TUI and - # aborts with "Raw mode is not supported" — see anthropics/claude-code#8938. - run: | - mkdir -p ~/.claude - echo '{"hasCompletedOnboarding": true}' > ~/.claude.json - - name: Install agentry - run: pip install "agentry @ git+https://github.com/norrietaylor/agentry.git" - - name: Run code review - run: > - agentry --output-format json run workflows/code-review.yaml - --input diff=${{ github.event.pull_request.head.sha }}~1 - --input codebase=. 
- --binder github-actions - env: - CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/agentry-feature-implement.yaml b/.github/workflows/agentry-feature-implement.yaml deleted file mode 100644 index fa1bc862..00000000 --- a/.github/workflows/agentry-feature-implement.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: 'Agentry: Feature Implement' -on: - issues: - types: [labeled] -permissions: - contents: write # needed for pr:create to push branches - issues: write - pull-requests: write -concurrency: - group: agentry-feature-implement-${{ github.event.issue.number }} - cancel-in-progress: true -jobs: - agentry: - if: github.event.label.name == 'feature' - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - with: - fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - name: Install Claude Code - run: npm install -g @anthropic-ai/claude-code - - name: Prime Claude Code onboarding - # Without this, claude -p tries to render the theme picker TUI and - # aborts with "Raw mode is not supported" — see anthropics/claude-code#8938. - run: | - mkdir -p ~/.claude - echo '{"hasCompletedOnboarding": true}' > ~/.claude.json - - name: Install agentry - run: pip install "agentry @ git+https://github.com/norrietaylor/agentry.git" - - name: Implement feature - run: > - agentry --output-format json run workflows/feature-implement.yaml - --input repository-ref=. 
- --binder github-actions - env: - CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/agentry-triage.yaml b/.github/workflows/agentry-triage.yaml deleted file mode 100644 index 99a386e1..00000000 --- a/.github/workflows/agentry-triage.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: 'Agentry: Planning Pipeline' -on: - issues: - types: [opened, reopened] -permissions: - contents: read - issues: write -concurrency: - group: agentry-triage-${{ github.event.issue.number }} - cancel-in-progress: true -jobs: - agentry: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - - name: Set up Python - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 - with: - python-version: '3.12' - - name: Install Claude Code - run: npm install -g @anthropic-ai/claude-code - - name: Prime Claude Code onboarding - # Without this, claude -p tries to render the theme picker TUI and - # aborts with "Raw mode is not supported" — see anthropics/claude-code#8938. - run: | - mkdir -p ~/.claude - echo '{"hasCompletedOnboarding": true}' > ~/.claude.json - - name: Install agentry - run: pip install "agentry @ git+https://github.com/norrietaylor/agentry.git" - - name: Run planning pipeline - run: > - agentry --output-format json run workflows/planning-pipeline.yaml - --input repository-ref=. 
-          --binder github-actions
-        env:
-          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/claude-triage.yaml b/.github/workflows/claude-triage.yaml
new file mode 100644
index 00000000..0092dd36
--- /dev/null
+++ b/.github/workflows/claude-triage.yaml
@@ -0,0 +1,69 @@
+name: 'Claude: Triage'
+
+on:
+  issues:
+    types: [opened, reopened]
+  workflow_dispatch:
+    inputs:
+      issue_number:
+        description: 'Issue number to (re-)triage'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+  issues: write
+  # NOTE(review): claude-code-action requests a GitHub OIDC token to obtain
+  # its app token when no `github_token` input is passed, and this workflow
+  # passes none. Confirm against the pinned action version.
+  id-token: write
+
+concurrency:
+  group: claude-triage-${{ github.event.issue.number || inputs.issue_number }}
+  # Workflow-level concurrency is resolved when a run is queued, before the
+  # job-level `if:` below is evaluated. A plain `true` would let a run that is
+  # about to be gated out (someone else reopening the issue) cancel an
+  # in-flight triage. Only runs that pass the same actor gate may cancel.
+  cancel-in-progress: ${{ github.actor == 'Oddly' }}
+
+jobs:
+  triage:
+    # Only Oddly (the maintainer) can cause this workflow to do any work —
+    # issue opens by anyone else, or workflow_dispatch invocations by any
+    # other collaborator, get gated out. Combined with the `issues: [opened]`
+    # trigger this means: "maintainer-opened issues auto-triage, no one else
+    # can force it." Manual re-triage of someone else's issue still goes
+    # through `gh workflow run` but must be run by Oddly.
+    if: github.actor == 'Oddly'
+    # Self-hosted runners live on incus-ci LXC 305 inside the lab's
+    # 172.30.0.0/16 network and can reach the internal distillery instance on
+    # LXC 800 (172.30.0.62:8000). github.com-hosted runners cannot.
+    runs-on: self-hosted
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          fetch-depth: 1
+
+      - name: Load triage prompt
+        id: prompt
+        # NOTE(review): the body of this step was garbled in this copy of the
+        # patch — everything between `body<<` and `>> "$GITHUB_OUTPUT"` was
+        # lost. It is reconstructed here with the GITHUB_OUTPUT multiline
+        # delimiter syntax. Confirm the prompt path (triage-distillery.md is
+        # assumed because this job registers the distillery MCP server) and
+        # the appended issue line against the committed workflow.
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script so a workflow_dispatch input can never be parsed as shell.
+          ISSUE_NUMBER: ${{ github.event.issue.number || inputs.issue_number }}
+        run: |
+          {
+            echo 'body<<TRIAGE_PROMPT_EOF'
+            cat .github/prompts/triage-distillery.md
+            echo
+            echo "Issue to triage: #${ISSUE_NUMBER}"
+            echo 'TRIAGE_PROMPT_EOF'
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Write distillery MCP config
+        run: |
+          cat > "${RUNNER_TEMP}/distillery-mcp.json" <<'EOF'
+          {
+            "mcpServers": {
+              "distillery": {
+                "type": "http",
+                "url": "http://172.30.0.62:8000/mcp"
+              }
+            }
+          }
+          EOF
+
+      - name: Run Claude
+        uses: anthropics/claude-code-action@b47fd721da662d48c5680e154ad16a73ed74d2e0 # v1
+        with:
+          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+          prompt: ${{ steps.prompt.outputs.body }}
+          claude_args: --mcp-config ${{ runner.temp }}/distillery-mcp.json --allowedTools "Read,Grep,Glob,Bash(git:*),Bash(gh issue:*),Bash(gh search:*),Bash(gh api repos/Oddly/elasticstack/contents/*),mcp__distillery__distillery_search"
diff --git a/.github/workflows/claude.yaml b/.github/workflows/claude.yaml
new file mode 100644
index 00000000..329cd177
--- /dev/null
+++ b/.github/workflows/claude.yaml
@@ -0,0 +1,67 @@
+name: 'Claude Code'
+
+# Mention-gated interactive workflow: @claude in an issue body, an issue
+# comment, a PR review comment, or a PR review fires Claude in free-form
+# mode. Pattern ported from norrietaylor/distillery .github/workflows/claude.yml
+# — kept deliberately close to the upstream shape so we get the benefit of
+# whatever Norrie tunes there over time.
+#
+# Paired with `claude-triage.yaml` which auto-triages issues the maintainer
+# opens, without requiring an explicit @claude mention. See that workflow's
+# `if:` gate for the author check.
+
+on:
+  issue_comment:
+    types: [created]
+  pull_request_review_comment:
+    types: [created]
+  issues:
+    types: [opened, assigned]
+  pull_request_review:
+    types: [submitted]
+
+# Suppress duplicate runs when @claude is mentioned multiple times in rapid
+# succession on the same issue/PR/comment thread.
Cancels in-flight runs so
+# only the most recent invocation completes — keeps token spend bounded if
+# you accidentally double-ping.
+concurrency:
+  group: claude-code-${{ github.event.issue.number || github.event.pull_request.number || github.event.comment.id || github.run_id }}
+  # Concurrency is resolved when a run is queued, before the job-level `if:`
+  # below is evaluated. With a plain `true`, ANY event in the same group — a
+  # comment without an @claude mention, or one from another user — would
+  # cancel an in-flight Claude run and then skip its own job. Cancel only
+  # when the new run is itself a real invocation. This expression mirrors the
+  # job gate; keep the two in sync.
+  cancel-in-progress: ${{ github.actor == 'Oddly' && ((github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))) }}
+
+jobs:
+  claude:
+    # Only Oddly can invoke Claude. The @claude mention is still required —
+    # someone (only Oddly) has to explicitly ask — but on top of that the
+    # actor check means anyone else dropping @claude in a comment, issue, or
+    # review gets silently ignored. No LLM spend from drive-by mentions.
+    if: |
+      github.actor == 'Oddly' && (
+        (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
+        (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
+        (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
+        (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
+      )
+    # Self-hosted runners (incus-ci LXC 305) so @claude has access to the
+    # internal lab network — gitea, distillery (LXC 800), and any other
+    # 172.30.0.0/16 service. Distillery isn't auto-registered here because
+    # the interactive workflow doesn't load a triage prompt; if you want
+    # KB access in a free-form @claude session, mention it explicitly.
+    runs-on: self-hosted
+    permissions:
+      contents: read
+      pull-requests: read
+      issues: read
+      # NOTE(review): with only read scopes on issues/PRs and no
+      # `github_token` input, claude-code-action has to obtain its own app
+      # token, which it does by requesting a GitHub OIDC token. Confirm
+      # against the pinned action version.
+      id-token: write
+      actions: read # Required for Claude to read CI results on PRs
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        with:
+          fetch-depth: 1
+
+      - name: Run Claude Code
+        id: claude
+        uses: anthropics/claude-code-action@b47fd721da662d48c5680e154ad16a73ed74d2e0 # v1
+        with:
+          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
+          additional_permissions: |
+            actions: read
diff --git a/scripts/claude-triage-dry-run.sh b/scripts/claude-triage-dry-run.sh
new file mode 100755
index 00000000..635a594a
--- /dev/null
+++ b/scripts/claude-triage-dry-run.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# claude-triage-dry-run.sh — preview what the Claude: Triage workflow would
+# post on an issue, without touching it.
+#
+# Usage:
+#   scripts/claude-triage-dry-run.sh <issue-number>
+#   REPO=Oddly/elasticstack scripts/claude-triage-dry-run.sh 121
+#
+# Runs claude-code locally in print mode (-p) against the SAME prompt the
+# production workflow uses (.github/prompts/triage.md), seeded with the issue
+# metadata pulled from gh. Claude has read-only access to the repo and to gh
+# issue/search/api. It writes its proposed comment to stdout and exits.
+#
+# NOTE(review): the Claude: Triage workflow registers the distillery MCP
+# server and allows mcp__distillery__distillery_search, which this script
+# does not. Confirm whether production loads triage.md or
+# triage-distillery.md; if the latter, this preview exercises a different
+# prompt than the workflow.
+#
+# Nothing is posted to the issue. This is strictly a client-side preview.
+#
+# Requirements:
+#   - claude CLI on PATH, authenticated (subscription or CLAUDE_CODE_OAUTH_TOKEN)
+#   - gh CLI on PATH, authenticated for the target repo
+#   - Run from inside a checkout of the repo (so Claude can grep/read files)
+
+set -euo pipefail
+
+# Abort with a usage message when the issue number argument is missing.
+issue_number="${1:?usage: $0 <issue-number>}"
+repo="${REPO:-Oddly/elasticstack}"
+
+# Resolve the repo root from this script's own location so the prompt path
+# does not depend on the caller's working directory.
+script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+repo_root="$(cd -- "$script_dir/.." && pwd)"
+prompt_file="$repo_root/.github/prompts/triage.md"
+
+if [[ !
-f "$prompt_file" ]]; then + echo "error: prompt file not found at $prompt_file" >&2 + exit 1 +fi + +# Claude needs to read/grep the repo from its root for the triage to be +# code-grounded. Anchor cwd here regardless of where the user invoked us. +cd "$repo_root" + +command -v claude >/dev/null || { echo "error: claude CLI not on PATH" >&2; exit 1; } +command -v gh >/dev/null || { echo "error: gh CLI not on PATH" >&2; exit 1; } + +issue_json=$(gh issue view "$issue_number" --repo "$repo" \ + --json number,title,body,labels,author,createdAt,state) \ + || { echo "error: could not fetch issue #$issue_number from $repo" >&2; exit 1; } + +# Build the same prompt the workflow builds, plus an explicit "dry run, do not +# try to post" directive so claude doesn't attempt gh issue comment. +prompt=$(mktemp) +trap 'rm -f "$prompt"' EXIT +{ + cat "$prompt_file" + echo + echo "Issue to triage: #$issue_number" + echo + echo "--- dry-run mode ---" + echo "This is a client-side preview. Write the triage comment to stdout" + echo "as plain Markdown. Do NOT invoke 'gh issue comment' or any other" + echo "tool that would post to GitHub." + echo + echo "Issue metadata (from gh issue view --json):" + echo '```json' + echo "$issue_json" + echo '```' +} > "$prompt" + +echo "==> Dry-running triage for $repo#$issue_number" >&2 +echo "==> Prompt: $prompt_file" >&2 +echo "==> Claude will have read access to the repo and read-only gh/git tools." >&2 +echo "==> Nothing will be posted." >&2 +echo >&2 + +claude -p \ + --model claude-sonnet-4-20250514 \ + --allowedTools 'Read,Grep,Glob,Bash(git:*),Bash(gh issue view:*),Bash(gh search:*),Bash(gh api repos/'"${repo}"'/contents/*)' \ + < "$prompt" diff --git a/workflows/bug-fix.yaml b/workflows/bug-fix.yaml deleted file mode 100644 index 102e4bab..00000000 --- a/workflows/bug-fix.yaml +++ /dev/null @@ -1,68 +0,0 @@ -identity: - name: bug-fix - version: 1.0.0 - description: Diagnose a bug, implement a fix with molecule test coverage, and open a PR. 
- -inputs: - issue-description: - type: string - required: true - description: The bug report to diagnose. - source: issue.body - fallback: issue.title - repository-ref: - type: repository-ref - required: true - description: The repository to fix. - -tools: - capabilities: - - repository:read - - shell:execute - - pr:create - - issue:comment - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - system_prompt: prompts/bug-fix.md - max_iterations: 20 - -safety: - trust: elevated - resources: - timeout: 300 - -output: - schema: - type: object - required: [diagnosis, root_cause, suggested_fix, confidence] - properties: - diagnosis: - type: string - root_cause: - type: string - suggested_fix: - type: object - required: [file, line, change] - properties: - file: - type: string - line: - type: integer - change: - type: string - confidence: - type: number - minimum: 0.0 - maximum: 1.0 - side_effects: - - type: terminal - description: Print diagnosis summary to stdout - output_paths: - - bug-fix-result.json - budget: - max_findings: 1 - -composition: - steps: [] diff --git a/workflows/code-review.yaml b/workflows/code-review.yaml deleted file mode 100644 index 1ef24271..00000000 --- a/workflows/code-review.yaml +++ /dev/null @@ -1,69 +0,0 @@ -identity: - name: code-review - version: 1.0.0 - description: > - Review pull request diffs for Ansible best practices, security issues, - idempotency problems, and test coverage gaps in the elasticstack collection. - -inputs: - diff: - type: git-diff - required: true - ref: HEAD~1 - description: The git diff to review. - codebase: - type: repository-ref - required: true - description: The repository to review. 
- -tools: - capabilities: - - repository:read - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - system_prompt: prompts/code-review.md - max_iterations: 20 - -safety: - trust: elevated - resources: - timeout: 300 - -output: - schema: - type: object - properties: - findings: - type: array - items: - type: object - properties: - file: - type: string - line: - type: integer - severity: - type: string - enum: [critical, warning, info] - category: - type: string - enum: [security, performance, style, correctness] - description: - type: string - suggestion: - type: string - required: [file, line, severity, category, description] - summary: - type: string - confidence: - type: number - minimum: 0 - maximum: 1 - required: [findings, summary, confidence] - side_effects: [] - output_paths: - - review.json - budget: - max_findings: 10 diff --git a/workflows/feature-implement.yaml b/workflows/feature-implement.yaml deleted file mode 100644 index bef0b93a..00000000 --- a/workflows/feature-implement.yaml +++ /dev/null @@ -1,61 +0,0 @@ -identity: - name: feature-implement - version: 1.0.0 - description: Implement a feature or decompose it into sub-issues if too large. - -inputs: - issue-description: - type: string - required: true - description: The feature request to implement. - source: issue.body - fallback: issue.title - repository-ref: - type: repository-ref - required: true - description: The repository in which to implement the feature. 
- -tools: - capabilities: - - repository:read - - shell:execute - - pr:create - - issue:comment - - issue:label - - issue:create - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - system_prompt: prompts/feature-implement.md - max_iterations: 10 - -safety: - trust: elevated - resources: - timeout: 600 - -output: - schema: - type: object - required: [action, reasoning] - properties: - action: - type: string - enum: [implemented, decomposed] - pr_url: - type: string - sub_issues: - type: array - items: - type: string - reasoning: - type: string - side_effects: - - type: terminal - description: Print implementation or decomposition summary to stdout - output_paths: - - feature-implement-result.json - -composition: - steps: [] diff --git a/workflows/planning-pipeline.yaml b/workflows/planning-pipeline.yaml deleted file mode 100644 index 064b2874..00000000 --- a/workflows/planning-pipeline.yaml +++ /dev/null @@ -1,50 +0,0 @@ -identity: - name: planning-pipeline - version: 1.0.0 - description: Full issue planning — triage, decompose into tasks, post summary. - -inputs: - issue-description: - type: string - required: true - description: The issue to plan. - source: issue.body - fallback: issue.title - repository-ref: - type: repository-ref - required: true - description: The repository to inspect. - -tools: - capabilities: - - issue:comment - - issue:label - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - -safety: - trust: elevated - resources: - timeout: 600 - -output: - schema: - type: object - description: Composed planning result with triage and task decomposition. 
- output_paths: - - planning-result.json - -composition: - steps: - - name: triage - workflow: triage.yaml - depends_on: [] - inputs: {} - - name: task-decompose - workflow: task-decompose.yaml - depends_on: - - triage - inputs: - triage_result: triage.output diff --git a/workflows/prompts/bug-fix.md b/workflows/prompts/bug-fix.md deleted file mode 100644 index 66127ef2..00000000 --- a/workflows/prompts/bug-fix.md +++ /dev/null @@ -1,48 +0,0 @@ -# Bug Fix Agent — Elasticstack Ansible Collection - -You are an expert Ansible developer fixing bugs in a collection that deploys the -Elastic Stack. You diagnose, fix, and test bugs following the project's fix workflow. - -## Fix Workflow (from CLAUDE.md) - -1. Before implementing, identify which molecule scenario covers this code path. -2. If no existing scenario catches the bug, add a verify assertion to the closest - existing scenario rather than creating a new one. New scenarios add ~10 min to CI. -3. Prefer the lightest test that proves the fix: a config assertion in verify.yml - beats a full multi-node deployment. -4. The test should fail without the fix and pass with it. - -## Molecule Scenarios - -Scenarios live under `molecule/`. Key scenarios: -- `elasticstack_default` — full-stack deployment (ES + Kibana + Logstash + Beats) -- `elasticsearch_*` — ES-specific scenarios (cluster, security, rolling upgrade) -- `kibana_*`, `logstash_*`, `beats_*` — role-specific scenarios - -Each scenario has: -- `converge.yml` — the playbook that applies roles -- `verify.yml` — assertions that validate the deployment -- `molecule.yml` — platform and provisioner config - -## Common Pitfalls - -- `failed_when: false` does NOT survive `until`/`retries` exhaustion in Ansible 2.19+ -- `ansible_facts.packages` needs explicit `package_facts` gather in each play -- Rolling upgrade plays MUST use `serial: 1` -- `_elasticstack_role_imported` guards must be reset in combined playbooks - -## Your Process - -1. 
Diagnose the bug from the issue description and code inspection. -2. Identify the root cause with file path and line number. -3. Implement a minimal fix. -4. Add or extend a molecule verify assertion that catches the bug. -5. Run relevant tests to confirm the fix. -6. Commit with a message referencing the issue number. -7. Open a pull request with the `agent-proposed` label. -8. Comment on the original issue linking to the PR. - -## Output - -JSON object with keys: `diagnosis`, `root_cause`, `suggested_fix`, `confidence`. -`suggested_fix` contains `file`, `line`, and `change` sub-fields. diff --git a/workflows/prompts/code-review.md b/workflows/prompts/code-review.md deleted file mode 100644 index 7ca5c3ae..00000000 --- a/workflows/prompts/code-review.md +++ /dev/null @@ -1,76 +0,0 @@ -# Code Review Agent — Elasticstack Ansible Collection - -You are an expert reviewer for an Ansible collection that deploys Elasticsearch, -Kibana, Logstash, and Beats (Elastic Stack 8.x/9.x). Review pull request diffs -for correctness, security, idempotency, and test coverage. - -## Domain Knowledge - -This collection uses: -- Ansible roles under `roles/` (elasticsearch, kibana, logstash, beats, repos) -- Jinja2 templates under `roles/*/templates/` -- Molecule test scenarios under `molecule/` -- Shared handlers and defaults per role -- Rolling upgrade logic with `serial: 1` for multi-node clusters -- systemd service management with health-check retries - -## Review Focus Areas - -### Ansible-specific -- **Idempotency**: tasks should produce no changes on second run. Watch for - `ansible.builtin.command`/`shell` without `creates`/`removes` guards. -- **Handlers**: changes to config files must notify the correct handler. Missing - `notify:` is a common bug. -- **Defaults**: new variables must have defaults in `defaults/main.yml`. -- **Conditionals**: `when:` clauses on platform-specific tasks (Debian vs RHEL). 
-- **`failed_when: false`** does NOT survive `until`/`retries` exhaustion in - Ansible 2.19+ — use `ignore_errors: true` instead. - -### Security -- No secrets in defaults or templates. Passwords should use `no_log: true`. -- TLS certificate handling — paths, permissions, ownership. -- Elasticsearch security setup (users, roles, API keys). - -### Test coverage -- Every bug fix should be covered by a molecule scenario. If no existing scenario - covers the code path, a verify assertion should be added to the closest one. -- New scenarios are expensive (~10 min CI each) — prefer extending existing ones. -- The test should fail without the fix and pass with it. - -### Rolling upgrades -- Rolling upgrade plays MUST use `serial: 1` for multi-node clusters. -- `until:` retry conditions need `| default()` for safe attribute access during - mixed-version clusters. -- Elasticsearch 8.x to 9.x has compatibility constraints around index versions. - -## Output Format - -Respond with a JSON object: - -```json -{ - "findings": [ - { - "file": "", - "line": , - "severity": "", - "category": "", - "description": "", - "suggestion": "" - } - ], - "summary": "", - "confidence": <0.0 to 1.0> -} -``` - -## Severity - -- **critical**: Security vulnerabilities, data loss risks, broken idempotency - that will cause outages, missing `serial: 1` on rolling upgrades. -- **warning**: Missing test coverage, incorrect conditionals, handler issues, - tasks that will fail on specific platforms. -- **info**: Style, naming conventions, minor improvements. - -Limit findings to 10 maximum. Prioritize critical and warning over info. -Always return valid JSON without markdown fences. 
diff --git a/workflows/prompts/feature-implement.md b/workflows/prompts/feature-implement.md deleted file mode 100644 index 75cb1304..00000000 --- a/workflows/prompts/feature-implement.md +++ /dev/null @@ -1,46 +0,0 @@ -# Feature Implement Agent — Elasticstack Ansible Collection - -You are an expert Ansible developer implementing features for a collection that -deploys the Elastic Stack (Elasticsearch, Kibana, Logstash, Beats). - -## Decision: Implement or Decompose - -**Implement directly** if the change touches 5 or fewer files and requires 500 or -fewer lines of new or modified code. - -**Decompose** if the change spans more than 5 files, more than 500 lines, or -requires coordinated changes across multiple roles that would be risky in one PR. - -When in doubt, prefer decomposition to keep PRs reviewable. - -## If Implementing - -1. Read the relevant role files to understand existing patterns and conventions. -2. Implement the feature with appropriate molecule test coverage. -3. Follow the fix workflow from CLAUDE.md: prefer extending existing molecule - scenarios over creating new ones (each adds ~10 min CI). -4. Commit with a message referencing the issue number. -5. Open a PR with label `agent-proposed`. -6. Comment on the original issue linking to the PR. - -## If Decomposing - -1. Break the feature into sub-tasks, each implementable in a single PR. -2. Respect role boundaries — separate tasks per role when possible. -3. Create a GitHub issue for each sub-task with label `agent-decomposed`. -4. Apply `agent-decomposed` label to the parent issue. -5. Comment on the parent issue listing the sub-issues. 
- -## Project Conventions - -- Roles: elasticsearch, kibana, logstash, beats, repos -- Templates in `roles/*/templates/`, defaults in `roles/*/defaults/main.yml` -- Molecule scenarios in `molecule/` — prefer extending existing verify.yml -- CI runs scenarios across Debian 11/12, Ubuntu 22.04/24.04, Rocky 9 -- Rolling upgrade plays must use `serial: 1` -- New variables need defaults in `defaults/main.yml` - -## Output - -JSON object with `action` ("implemented" or "decomposed"), `reasoning`, and -either `pr_url` or `sub_issues` array. diff --git a/workflows/prompts/task-decompose.md b/workflows/prompts/task-decompose.md deleted file mode 100644 index bb5858b8..00000000 --- a/workflows/prompts/task-decompose.md +++ /dev/null @@ -1,40 +0,0 @@ -# Task Decomposition Agent — Elasticstack Ansible Collection - -You are a project lead breaking down issues for an Ansible collection that deploys -the Elastic Stack across multiple Linux distributions. - -## Decomposition Guidelines - -### Role Boundaries -Each Ansible role (elasticsearch, kibana, logstash, beats, repos) is independently -testable. When an issue spans multiple roles, create separate tasks per role to -enable parallel work. - -### Test Impact -Every task that changes role behaviour must include test work. Consider: -- Which molecule scenario covers the changed code path? -- Can an existing verify.yml be extended, or is a new assertion needed? -- Each new molecule scenario adds ~10 min to CI — avoid creating new ones unless - the bug genuinely requires it. - -### CI Runtime -The CI matrix runs scenarios across Debian 11/12, Ubuntu 22.04/24.04, and -Rocky 9. Changes affecting platform-specific code paths need testing on -all affected platforms. - -### Complexity Signals -- Changes to `tasks/main.yml` in any role are high-impact (execution flow). -- Changes to `handlers/` affect restart behaviour — test idempotency. -- Changes to `templates/` affect config files — verify with config assertions. 
-- Changes to `molecule/shared/` affect all scenarios. - -## Output - -JSON object with a `tasks` array. Each task contains: -- `title`: Brief, actionable (5-10 words) -- `description`: 2-3 sentences covering what to change and how to test -- `priority`: critical, high, medium, or low -- `estimated_effort`: small (2-4h), medium (4-8h), large (8-16h), xl (16h+) - -Break issues into 3-7 tasks. Keep descriptions sufficient for implementation -without back-and-forth. diff --git a/workflows/prompts/triage.md b/workflows/prompts/triage.md deleted file mode 100644 index fd82fec4..00000000 --- a/workflows/prompts/triage.md +++ /dev/null @@ -1,45 +0,0 @@ -# Triage Agent — Elasticstack Ansible Collection - -You are a triage specialist for an Ansible collection that deploys the Elastic -Stack (Elasticsearch, Kibana, Logstash, Beats) on Debian, Ubuntu, and Rocky Linux. - -## Your Task - -Classify and triage a reported issue so it can be prioritised and routed. - -## Severity Definitions - -- **critical**: Deployment failure, data loss, security breach, or broken rolling - upgrade that leaves a cluster in a split-brain or unrecoverable state. -- **high**: Role fails on a supported platform, no workaround. Broken idempotency - causing service restarts on every run. -- **medium**: Feature partially broken, workaround exists. Test gap for an - existing code path. -- **low**: Cosmetic issue, documentation gap, or enhancement request. 
- -## Categories - -- `bug` — existing functionality is broken -- `feature` — new capability requested -- `test` — missing or broken test coverage -- `docs` — documentation issue -- `security` — credential handling, TLS, permissions -- `ci` — CI/CD pipeline, molecule, GitHub Actions - -## Affected Roles - -Identify which role(s) are affected from this list: -- `elasticsearch` — cluster setup, security, rolling upgrades -- `kibana` — dashboard server, TLS, spaces -- `logstash` — pipeline config, queue settings -- `beats` — filebeat, metricbeat, packetbeat, heartbeat -- `repos` — package repository configuration -- `shared` — cross-role concerns (package_facts, common handlers) - -## Output - -JSON object with keys: `severity`, `category`, `affected_roles`, `reasoning`. - -- Default to `medium` severity when evidence is ambiguous. -- List at most 3 affected roles. -- Keep reasoning to 2-4 sentences. diff --git a/workflows/task-decompose.yaml b/workflows/task-decompose.yaml deleted file mode 100644 index 3f3589df..00000000 --- a/workflows/task-decompose.yaml +++ /dev/null @@ -1,59 +0,0 @@ -identity: - name: task-decompose - version: 1.0.0 - description: Break a triaged issue into implementation tasks with role boundaries and test strategy. - -inputs: - triage_result: - type: string - required: true - description: Triage result JSON with severity, category, affected roles, and reasoning. - repository-ref: - type: repository-ref - required: true - description: The repository to inspect for role structure and test scenarios. 
- -tools: - capabilities: - - repository:read - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - system_prompt: prompts/task-decompose.md - max_iterations: 10 - -safety: - trust: elevated - resources: - timeout: 180 - -output: - schema: - type: object - required: [tasks] - properties: - tasks: - type: array - items: - type: object - required: [title, description, priority, estimated_effort] - properties: - title: - type: string - description: - type: string - priority: - type: string - enum: [critical, high, medium, low] - estimated_effort: - type: string - enum: [small, medium, large, xl] - side_effects: - - type: terminal - description: Print task decomposition to stdout - output_paths: - - task-decompose-result.json - -composition: - steps: [] diff --git a/workflows/triage.yaml b/workflows/triage.yaml deleted file mode 100644 index 74e27478..00000000 --- a/workflows/triage.yaml +++ /dev/null @@ -1,58 +0,0 @@ -identity: - name: triage - version: 1.0.0 - description: Classify and triage an elasticstack issue by severity, category, and affected roles. - -inputs: - issue-description: - type: string - required: true - description: The issue body to triage. - source: issue.body - fallback: issue.title - repository-ref: - type: repository-ref - required: true - description: The repository to inspect for role structure. 
- -tools: - capabilities: - - repository:read - - issue:comment - - issue:label - -agent: - runtime: claude-code - model: claude-sonnet-4-20250514 - system_prompt: prompts/triage.md - max_iterations: 10 - -safety: - trust: elevated - resources: - timeout: 120 - -output: - schema: - type: object - required: [severity, category, affected_roles, reasoning] - properties: - severity: - type: string - enum: [critical, high, medium, low] - category: - type: string - affected_roles: - type: array - items: - type: string - reasoning: - type: string - side_effects: - - type: terminal - description: Print triage summary to stdout - output_paths: - - triage-result.json - -composition: - steps: []