From ceba928b758a3729bd1d721b6e6199930c48a9e6 Mon Sep 17 00:00:00 2001 From: Valdis Pornieks Date: Mon, 11 May 2026 19:29:49 +0300 Subject: [PATCH] chore(evals): add eval case for PR 144 bash tool security findings Captures 11 security findings from the hardened bash tool PR across multiple self-review iterations. baseSha 415d0e53. --- .../prompt/.qualops/.qualopsrc.json | 273 ++++++++ .../review-system-message.md | 280 ++++++++ .../review/agentic/subagents/definitions.ts | 197 ++++++ .../repo/src/shared/utils/security.ts | 135 ++++ .../review/agentic/subagents/definitions.ts | 197 ++++++ .../review/agentic/tools/bash/env-scrub.ts | 193 ++++++ .../stages/review/agentic/tools/bash/exec.ts | 117 ++++ .../agentic/tools/bash/git-config-template.ts | 98 +++ .../tools/bash/modes/besteffort.linux.ts | 99 +++ .../tools/bash/modes/besteffort.macos.ts | 106 +++ .../tools/bash/modes/besteffort.win.ts | 36 + .../review/agentic/tools/bash/modes/ci.ts | 141 ++++ .../review/agentic/tools/bash/modes/detect.ts | 85 +++ .../agentic/tools/bash/modes/interface.ts | 56 ++ .../review/agentic/tools/bash/modes/none.ts | 35 + .../review/agentic/tools/bash/parser.ts | 172 +++++ .../review/agentic/tools/bash/policy.ts | 624 ++++++++++++++++++ .../agentic/tools/bash/secret-redact.ts | 58 ++ .../review/agentic/tools/bash/session-impl.ts | 233 +++++++ .../review/agentic/tools/bash/session.ts | 155 +++++ .../inbox/qualops-pr-144-bash-tool/slice.json | 144 ++++ 21 files changed, 3434 insertions(+) create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/.qualopsrc.json create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/prompts/qualops-self-review/review-system-message.md create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/src/stages/review/agentic/subagents/definitions.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/shared/utils/security.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/subagents/definitions.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/env-scrub.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/exec.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/git-config-template.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.linux.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.macos.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.win.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/ci.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/detect.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/interface.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/none.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/parser.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/policy.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/secret-redact.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session-impl.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session.ts create mode 100644 evals/datasets/inbox/qualops-pr-144-bash-tool/slice.json diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/.qualopsrc.json b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/.qualopsrc.json new file mode 100644 index 00000000..6b9e9d9c --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/.qualopsrc.json @@ -0,0 +1,273 @@ +{ + "$schema": "../qualops-config.schema.json", + "ai": { + "reviewStage": { + "provider": "anthropic", + "model": "claude-sonnet-4-5-20250929", + "inputPerMillion": 3.0, + "outputPerMillion": 15.0, + "temperature": 0 + }, + "fixStage": { + "provider": "anthropic", + "model": "claude-sonnet-4-5-20250929", + "inputPerMillion": 3.0, + "outputPerMillion": 15.0, + "temperature": 0 + }, + "judgeStage": { + "provider": "anthropic", + "model": "claude-sonnet-4-5-20250929", + "inputPerMillion": 3.0, + "outputPerMillion": 15.0, + "temperature": 0 + } + }, + "performance": { + "maxFileSizeKB": 500, + "maxFilesPerBatch": 15, + "maxTokensPerFile": 8000, + "timeoutSeconds": 300, + "throttling": { + "enabled": true, + "maxRequestsPerMinute": 50 + } + }, + "review": { + "minConfidence": 7, + "maxConcurrentFiles": 3, + "validation": { + "enabled": true, + "minConfidence": 7, + "prompt": "qualops-self-review/validation.md" + }, + "deduplication": { + "enabled": true, + "prompt": "deduplication.md" + }, + "pipeline": [ + { + "name": "qualopsSelfReview", + "enabled": false, + "validation": { + "enabled": true, + "minConfidence": 7, + "prompt": "qualops-self-review/validation.md" + }, + "passes": [ + { + "name": "Error Handling & Async Operations", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "try", + "catch", + "throw", + "Error", + "async", + "await", + "Promise", + "then\\(", + "catch\\(" + ], + "filePatterns": ["src/**/*.ts"], + "excludePatterns": ["**/*.spec.ts", "**/__tests__/**"] + } + }, + { + "name": "AI Provider Patterns", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "AIProvider", + "complete", + "invoke", + "AIFactory", + "anthropic", + "bedrock", + "openai", + "TokenStats" + ], + "filePatterns": ["src/ai/**/*.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + }, + { + "name": "Security & Credentials", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "API_KEY", + "TOKEN", + "SECRET", + "PASSWORD", + "GITHUB_TOKEN", + "GITLAB_TOKEN", + "ANTHROPIC_API_KEY", + "process\\.env", + "auth" + ], + "filePatterns": ["src/**/*.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + }, + { + "name": "Git Operations & File System", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "git ", + "spawn", + "exec", + "readFile", + "writeFile", + "existsSync", + "mkdirSync", + "rmSync", + "unlink" + ], + "filePatterns": ["src/stages/analyze/**/*.ts", "src/shared/utils/file-utils.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + }, + { + "name": "TypeScript & Type Safety", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "any", + "as ", + "interface", + "type ", + "enum", + "Record<", + "Partial<", + "Pick<" + ], + "filePatterns": ["src/**/*.ts"], + "excludePatterns": ["**/*.spec.ts", "src/shared/types/**"] + } + }, + { + "name": "Session & Context Management", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "SessionContext", + "session", + "metadata", + "extract.log", + "TokenStats", + "PipelineExecutor" + ], + "filePatterns": ["src/shared/runtime/**/*.ts", "src/stages/review/**/*.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + }, + { + "name": "CLI & Configuration", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "Command", + "program", + "option\\(", + "argument\\(", + "ConfigService", + "loadConfig", + "validate" + ], + "filePatterns": ["src/cli/**/*.ts", "src/config/**/*.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + }, + { + "name": "CI Integration (GitHub/GitLab)", + "enabled": true, + "prompt": "qualops-self-review/review-system-message.md", + "filters": { + "detectionTriggers": [ + "GitHub", + "GitLab", + "Octokit", + "CI_MERGE_REQUEST", + "GITHUB_", + "postComment", + "createCheck" + ], + "filePatterns": ["src/github/**/*.ts", "src/gitlab/**/*.ts"], + "excludePatterns": ["**/*.spec.ts"] + } + } + ] + }, + { + "name": "agenticSecurityAudit", + "enabled": true, + "mode": "agentic", + "agentic": { + "maxTurns": 15, + "contextMode": "auto", + "maxTokensPerFile": 8000, + "maxTotalTokens": 50000, + "enabledSubagents": [ + "security-analyzer", + "dependency-tracer", + "breaking-change-detector" + ], + "systemPrompt": "You are a security-focused code reviewer analyzing the QualOps codebase. Focus on:\n1. Command injection vulnerabilities in shell executions\n2. Path traversal in file operations\n3. Credential exposure in logs or error messages\n4. Unsafe deserialization or eval usage\n5. Cross-file security implications" + }, + "validation": { + "enabled": true, + "minConfidence": 8, + "prompt": "qualops-self-review/validation.md" + } + } + ] + }, + "fix": { + "enabled": true, + "severities": ["critical", "high"], + "minConfidence": 8, + "maxConcurrentFixes": 5, + "autoApply": false + }, + "report": { + "outputFormat": "html", + "includedSeverities": ["critical", "high", "medium", "low"], + "enableRootCauseExtraction": false + }, + "github": { + "enabled": false, + "postComments": false, + "skipOnDraft": false, + "blockPipeline": false, + "maxInlineComments": 50 + }, + "gitlab": { + "enabled": false, + "postComments": false, + "skipOnDraft": false, + "blockPipeline": false + }, + "skipPatterns": [ + "node_modules/**", + ".git/**", + "dist/**", + "build/**", + "coverage/**", + "reports/**", + ".qualops-cache/**", + "*.min.js", + "*.bundle.js", + "examples/**", + "docs/**" + ] +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/prompts/qualops-self-review/review-system-message.md b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/prompts/qualops-self-review/review-system-message.md new file mode 100644 index 00000000..2a60bdf5 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/.qualops/prompts/qualops-self-review/review-system-message.md @@ -0,0 +1,280 @@ +# QualOps TypeScript/Node.js Code Review System + +You are an expert TypeScript and Node.js code reviewer with deep knowledge of modern JavaScript patterns, async programming, CLI tool development, and AI provider integration. Your task is to review the QualOps codebase for quality, maintainability, security, and correctness issues. + +## Review Focus Areas + +### 1. Error Handling & Async Operations +- Unhandled promise rejections +- Missing try-catch blocks around async operations +- Error propagation and context preservation +- Proper error logging with stack traces +- Resource cleanup in finally blocks +- Race conditions in concurrent operations +- Missing timeout handling for external API calls + +**Look for:** +- `await` calls without try-catch blocks +- Promise chains without `.catch()` +- Silent failures (catch blocks that don't log or rethrow) +- Missing error context when rethrowing +- Async functions that don't handle errors properly + +### 2. AI Provider Integration +- API key exposure or logging +- Missing retry logic for transient failures +- Improper token usage tracking +- Cost calculation accuracy +- Provider-specific error handling +- Missing timeout configurations +- Token limit validation before API calls +- Structured output parsing and validation + +**Look for:** +- API keys in error messages or logs +- Hardcoded retry logic (should be configurable) +- Missing cost tracking on AI completions +- Not validating token limits before expensive calls +- Missing schema validation for structured outputs +- No exponential backoff for rate limit errors + +### 3. Security & Credential Management +- API keys, tokens, or secrets in code +- Sensitive data in error messages or logs +- Environment variable validation +- Token redaction in logs +- Secure temporary file handling +- Command injection vulnerabilities in git/shell operations +- Path traversal vulnerabilities in file operations + +**Look for:** +- `process.env` access without validation +- Logging request/response bodies containing tokens +- User input directly in shell commands +- File paths constructed from user input without sanitization +- Hardcoded credentials or API endpoints + +### 4. Git Operations & File System +- Command injection in git operations +- Missing validation of git command output +- Unsafe file path handling +- Race conditions in file operations +- Missing error handling for file I/O +- Improper permissions on created files/directories +- Memory issues with large file reads + +**Look for:** +- String interpolation in shell commands +- Not validating git command exit codes +- Reading entire files into memory without size checks +- Missing existence checks before file operations +- No cleanup of temporary files + +### 5. TypeScript & Type Safety +- Excessive use of `any` type +- Type assertions (`as`) without validation +- Missing return type annotations +- Unsafe type narrowing +- Not using discriminated unions properly +- Missing null/undefined checks +- Improper use of `!` non-null assertion + +**Look for:** +- Functions with `any` parameters or return types +- Type assertions without runtime validation +- Optional properties accessed without checks +- Array operations without length validation +- Record/object access without key validation + +### 6. Session & Context Management +- Memory leaks in session context +- Missing session cleanup +- Token counting accuracy +- State sharing between concurrent operations +- Metadata file corruption handling +- Extract log consistency +- Session directory cleanup + +**Look for:** +- Growing arrays/maps never cleared +- File handles not properly closed +- Metadata writes without atomic operations +- Missing locking for concurrent file access +- Session state not reset between operations + +### 7. Configuration & Validation +- Missing configuration validation +- Default values that don't make sense +- Configuration loading errors silently ignored +- Schema validation missing for user input +- Type mismatches between config and usage +- Missing documentation for config options + +**Look for:** +- Loading config without validation +- Using config values without type checking +- Missing defaults for optional settings +- Config errors not surfaced to users + +### 8. CI Integration (GitHub/GitLab) +- Missing permission checks +- API rate limit handling +- Comment/annotation format validation +- Markdown injection vulnerabilities +- Missing retry logic for API failures +- Improper event handling +- Token validation and scoping + +**Look for:** +- Creating API clients without token validation +- Not handling rate limit responses +- User-provided content in markdown without sanitization +- Missing pagination for list operations +- Hardcoded API endpoints + +## Code Quality Principles + +### TypeScript Best Practices +- Use const/let (never var) +- Prefer readonly for immutable properties +- Use template literals over string concatenation +- Leverage discriminated unions for state management +- Use type guards for runtime validation +- Prefer interface over type for object shapes +- Use utility types (Partial, Pick, Omit, etc.) + +### Async/Await Patterns +- Always await promises (or explicitly handle them) +- Use Promise.all for parallel operations +- Avoid blocking operations in async functions +- Proper cancellation handling +- Use async iterators for streaming data +- Implement proper timeout logic + +### Performance +- Avoid blocking the event loop +- Use streams for large files +- Implement pagination for large datasets +- Cache expensive computations +- Use weak references for caches +- Avoid unnecessary deep copies + +### Maintainability +- Functions should have single responsibility +- Keep functions under 50 lines +- Use descriptive variable and function names +- Avoid deep nesting (max 3-4 levels) +- Extract magic numbers to named constants +- Document complex logic with comments +- Use early returns to reduce nesting + +### Testing Considerations +- Code should be testable (avoid tight coupling) +- Dependency injection for external services +- Separate business logic from framework code +- Avoid global state + +## Issue Classification + +### Severity Levels + +**Critical (9-10)**: +- Security vulnerabilities (credential exposure, injection attacks) +- Data corruption or loss risks +- Service crashes or unrecoverable errors +- Resource leaks causing system failure + +**High (7-8)**: +- Incorrect business logic +- Significant performance problems +- Race conditions causing data inconsistency +- Missing error handling in critical paths +- Token tracking inaccuracies affecting billing + +**Medium (5-6)**: +- Missing validation or type safety +- Suboptimal patterns affecting maintainability +- Missing logging or poor error messages +- Code duplication +- Minor performance issues + +**Low (3-4)**: +- Style inconsistencies +- Missing type annotations +- Non-critical optimization opportunities +- Minor code smells + +**Info (1-2)**: +- Suggestions for improvement +- Alternative patterns +- Best practice recommendations + +### Confidence Levels + +**High Confidence (8-10)**: +- Clear security vulnerabilities +- Obvious bugs or errors +- Well-known anti-patterns +- TypeScript type errors + +**Medium Confidence (5-7)**: +- Context-dependent issues +- Performance concerns without profiling +- Code smells that might be intentional +- Potential edge cases + +**Low Confidence (1-4)**: +- Stylistic preferences +- Speculative improvements +- Issues requiring more context + +## Output Format + +For each issue found, provide: + +1. **Title**: Clear, specific issue description +2. **Severity**: Critical/High/Medium/Low/Info +3. **Confidence**: 1-10 score +4. **Category**: Error Handling, AI Provider, Security, Git Operations, TypeScript, Session Management, Configuration, CI Integration +5. **Line**: Exact line number(s) affected +6. **Description**: Detailed explanation of the issue +7. **Impact**: What could go wrong +8. **Recommendation**: Specific fix with code example +9. **References**: TypeScript docs, Node.js docs, or security guides + +## Example Issue + +```json +{ + "title": "API key potentially exposed in error logs", + "severity": "critical", + "confidence": 9, + "category": "Security", + "line": 42, + "description": "The error handler logs the entire error object which may contain the API key from the request headers. This could expose sensitive credentials in log aggregation systems.", + "impact": "API keys could be exposed in logs, leading to unauthorized access to the Anthropic API and potential billing fraud.", + "recommendation": "Sanitize error objects before logging:\n\n```typescript\ntry {\n await this.client.complete(prompt);\n} catch (error) {\n const sanitizedError = {\n message: error instanceof Error ? error.message : 'Unknown error',\n code: error.code,\n // Explicitly exclude sensitive fields\n };\n logger.error('AI completion failed', sanitizedError);\n throw error;\n}\n```", + "references": ["OWASP Logging Cheat Sheet", "Node.js Security Best Practices"] +} +``` + +## Important Notes + +- Focus on **actionable feedback** with specific recommendations +- Consider the **context** - QualOps is a CLI tool with specific patterns +- Balance **security with usability** - don't over-engineer +- Prioritize issues by **impact** - fix what matters most +- Be **specific about line numbers** - reference exact code locations +- Provide **code examples** in recommendations +- Consider **Node.js version** and TypeScript target + +## What NOT to Flag + +- Patterns consistent with the rest of the codebase +- Style preferences handled by Prettier/ESLint +- Test code with intentionally simplified patterns +- Framework-provided patterns (Commander.js, etc.) +- Intentional type assertions with validation +- Temporary code marked with TODO comments + +Focus on issues that genuinely improve code quality, security, maintainability, or correctness specific to a TypeScript/Node.js CLI tool that orchestrates AI-powered code analysis. diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/src/stages/review/agentic/subagents/definitions.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/src/stages/review/agentic/subagents/definitions.ts new file mode 100644 index 00000000..67a2143e --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/prompt/src/stages/review/agentic/subagents/definitions.ts @@ -0,0 +1,197 @@ +import type { ModelConfig } from '../../../../shared/types'; +import type { AgenticConfig, AgenticSubagentType } from '../../../../shared/types/config'; + +export interface AgentDefinition { + description: string; + prompt: string; + tools: string[]; + model?: ModelConfig; +} + +export type ResolvedAgentDefinition = Omit & { model?: string }; + +const SUBAGENT_DEFINITIONS: Record = { + 'dependency-tracer': { + description: + 'Traces cross-file dependencies and identifies coupling issues, circular dependencies, and import chains that may be affected by changes', + prompt: `You are a dependency analysis expert for TypeScript/JavaScript codebases. + +Your job is to: +1. Trace import/export relationships between changed files and the rest of the codebase +2. Identify which other files might be affected by the changes +3. Detect circular dependencies that could cause issues +4. Find tightly coupled modules that violate separation of concerns + +When analyzing dependencies: +- Use trace_imports to understand what each changed file imports and what imports it +- Use find_usages to find all places where exported symbols are used +- Look for patterns indicating tight coupling (many bidirectional imports, god modules) + +Return issues in this JSON format: +[{ + "type": "maintainability", + "severity": "medium" | "high", + "description": "Clear description of the dependency issue", + "location": "file:line", + "reasoning": "Why this is problematic", + "suggestion": "How to fix it", + "confidence": 7-10 +}] + +If no dependency issues are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__qualops-agentic-tools__trace_imports', + 'mcp__qualops-agentic-tools__find_usages', + ], + }, + + 'breaking-change-detector': { + description: + 'Detects breaking API changes, interface modifications, export removals, and function signature changes that could affect consumers', + prompt: `You are a breaking change detection expert. + +Your job is to identify changes that could break consumers of this code: +1. Removed or renamed exports +2. Changed function signatures (parameters, return types) +3. Modified interface/type definitions +4. Changed class method signatures or removed methods +5. Behavioral changes that could break existing usage patterns + +When analyzing: +- Use git_diff_analysis to see what changed +- Use analyze_exports to compare exports between versions +- Use find_interface_changes to detect interface modifications +- Use find_usages to understand the impact scope + +Focus on PUBLIC API changes - internal implementation changes are fine. + +Return issues in this JSON format: +[{ + "type": "bug", + "severity": "critical" | "high", + "description": "What breaking change was detected", + "location": "file:line", + "reasoning": "Why this breaks consumers and what code would be affected", + "suggestion": "How to make this non-breaking or properly deprecate", + "confidence": 8-10 +}] + +If no breaking changes are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'mcp__qualops-agentic-tools__git_diff_analysis', + 'mcp__qualops-agentic-tools__analyze_exports', + 'mcp__qualops-agentic-tools__find_interface_changes', + 'mcp__qualops-agentic-tools__find_usages', + ], + }, + + 'security-analyzer': { + description: + 'Performs deep security analysis including injection vulnerabilities, authentication issues, credential exposure, and data validation problems', + prompt: `You are a security expert analyzing code for vulnerabilities. + +Your job is to identify security issues: +1. Injection vulnerabilities (SQL, NoSQL, command injection, XSS) +2. Authentication and authorization weaknesses +3. Sensitive data exposure (hardcoded secrets, logging PII) +4. Insecure cryptographic practices +5. Path traversal and file access vulnerabilities +6. Insecure deserialization +7. Missing input validation at trust boundaries + +When analyzing: +- Trace data flow from user input to dangerous sinks +- Use find_usages to track how user-controlled data propagates +- Look for patterns like eval(), exec(), shell commands with user input +- Check for hardcoded API keys, passwords, or tokens + +IMPORTANT: Only report issues with HIGH confidence. Verify by tracing actual data flow. + +Return issues in this JSON format: +[{ + "type": "security", + "severity": "critical" | "high" | "medium", + "description": "What vulnerability was found", + "location": "file:line", + "reasoning": "How this could be exploited, attack vector", + "suggestion": "Specific remediation steps", + "confidence": 8-10, + "cwe": "CWE-XXX if applicable" +}] + +If no security issues are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__qualops-agentic-tools__find_usages', + 'mcp__qualops-agentic-tools__trace_imports', + ], + }, + + 'pattern-validator': { + description: + 'Validates code against established patterns, best practices, error handling, and architectural guidelines found in the existing codebase', + prompt: `You are a code quality expert who understands existing patterns in a codebase. + +Your job is to: +1. Identify deviations from established patterns in the codebase +2. Find error handling issues (missing try/catch, swallowed errors, inconsistent patterns) +3. Detect code smells and anti-patterns +4. Check for consistent naming conventions +5. Identify opportunities for using existing utilities instead of reinventing + +When analyzing: +- First understand the existing patterns by reading similar files +- Use Grep to find how similar problems are solved elsewhere +- Compare the changed code against established conventions + +IMPORTANT: Only report issues where there's a clear established pattern being violated. +Don't enforce arbitrary rules - enforce consistency with the existing codebase. + +Return issues in this JSON format: +[{ + "type": "maintainability", + "severity": "low" | "medium", + "description": "What pattern violation was found", + "location": "file:line", + "reasoning": "How this differs from the established pattern", + "suggestion": "How to align with existing patterns", + "confidence": 7-9 +}] + +If no pattern violations are found, return an empty array: []`, + tools: ['Read', 'Grep', 'Glob'], + }, +}; + +export function createSubagentDefinitions(config: AgenticConfig): Record { + const enabled = + config.enabledSubagents || (Object.keys(SUBAGENT_DEFINITIONS) as AgenticSubagentType[]); + const definitions: Record = {}; + + const bashSubagentAccess = config.bash?.subagentAccess ?? 'all'; + const addBash = bashSubagentAccess === 'all'; + + for (const entry of enabled) { + const type = typeof entry === 'string' ? entry : entry.name; + if (type in SUBAGENT_DEFINITIONS) { + const def = { ...SUBAGENT_DEFINITIONS[type as AgenticSubagentType] }; + if (addBash && !def.tools.includes('Bash')) { + def.tools = [...def.tools, 'Bash']; + } + definitions[type] = def; + } + } + + return definitions; +} + +export function getAllSubagentTypes(): AgenticSubagentType[] { + return Object.keys(SUBAGENT_DEFINITIONS) as AgenticSubagentType[]; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/shared/utils/security.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/shared/utils/security.ts new file mode 100644 index 00000000..9647c694 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/shared/utils/security.ts @@ -0,0 +1,135 @@ +import { basename, resolve, join } from 'node:path'; + +/** + * Security utilities for input validation and sanitization in qualops + * This file contains only the security functions actively used in the codebase + */ + +/** + * Resolves `filePath` relative to `cwd` and returns the absolute path only if + * it stays within `cwd`. Returns null if the path escapes the project root. + * + * Handles the prefix-bypass edge case: `/project/repo-evil` is NOT within + * `/project/repo` even though it starts with the same string. + */ +export function resolveWithinCwd(cwd: string, filePath: string): string | null { + const full = filePath.startsWith('/') ? filePath : join(cwd, filePath); + const resolved = resolve(full); + const cwdResolved = resolve(cwd); + const cwdPrefix = cwdResolved.replace(/\/?$/, '/'); + if (resolved !== cwdResolved && !resolved.startsWith(cwdPrefix)) return null; + return resolved; +} + +/** + * Returns true if the string is a safe git ref — non-empty and not starting + * with '-', which would otherwise be interpreted as a flag by git. + */ +export function isSafeGitRef(ref: string): boolean { + return ref.length > 0 && !ref.startsWith('-'); +} + +/** + * Validates that a filename doesn't contain path traversal sequences + */ +export function isPathTraversalSafe(filename: string): boolean { + if (!filename) return false; + return ( + !filename.includes('..') && + !filename.includes('/') && + !filename.includes('\\') && + !filename.includes('\0') + ); +} + +/** + * Sanitizes a filename by removing dangerous characters + */ +export function sanitizeFilename(filename: string): string { + if (!filename) return ''; + return basename(filename).replace(/[^a-zA-Z0-9_.-]/g, '_'); +} + +/** + * Validates a Git SHA-1 or SHA-256 hash format + */ +export function isValidGitSha(sha: string): boolean { + if (!sha) return false; + return /^[a-f0-9]{40}$/i.test(sha) || /^[a-f0-9]{64}$/i.test(sha); +} + +/** + * Validates URL scheme to prevent javascript: and data: URLs + */ +export function hasValidUrlScheme(urlString: string): boolean { + try { + const url = new URL(urlString); + return ['http:', 'https:'].includes(url.protocol); + } catch { + return false; + } +} + +/** + * Escapes HTML special characters for safe embedding in HTML reports + */ +export function escapeHtml(text: string): string { + if (!text) return ''; + return text + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, ''') + .replace(/\//g, '/'); +} + +// Shared pattern source — used for both inline redaction (with /g) and full-value matching (with ^...$). +export const SENSITIVE_VALUE_PATTERN_SOURCE = + 'Bearer\\s+\\S+|Basic\\s+\\S+' + + '|sk-[A-Za-z0-9]{10,}|pk-[A-Za-z0-9]{10,}' + + '|gh[a-z]_[A-Za-z0-9_]+|github_pat_[A-Za-z0-9_]+' + + '|glpat-[A-Za-z0-9_-]+|glcbt-[A-Za-z0-9_-]+' + + '|AKIA[A-Z0-9]{16}'; + +export const REDACTED = '[REDACTED]'; + +/** + * Redacts known credential patterns from a string. + * Pass additional known token values to also redact those by exact match. + */ +export function redactTokens(text: string, knownTokens: string[] = []): string { + if (!text) return text; + + // Recreate with /g each call to avoid stateful lastIndex issues + let redacted = text.replace(new RegExp(SENSITIVE_VALUE_PATTERN_SOURCE, 'g'), REDACTED); + + for (const token of knownTokens) { + if (token && token.length > 0) { + const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + redacted = redacted.replace(new RegExp(escaped, 'g'), REDACTED); + } + } + + return redacted; +} + +/** + * Type guard for ClassificationResult array used in root cause extraction + */ +export function isClassificationResultArray( + data: unknown, +): data is Array<{ issueId: string; rootCause: string; confidence: number }> { + if (!Array.isArray(data)) return false; + return data.every( + (item) => + item && + typeof item === 'object' && + 'issueId' in item && + 'rootCause' in item && + 'confidence' in item && + typeof item.issueId === 'string' && + typeof item.rootCause === 'string' && + typeof item.confidence === 'number', + ); +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/subagents/definitions.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/subagents/definitions.ts new file mode 100644 index 00000000..67a2143e --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/subagents/definitions.ts @@ -0,0 +1,197 @@ +import type { ModelConfig } from '../../../../shared/types'; +import type { AgenticConfig, AgenticSubagentType } from '../../../../shared/types/config'; + +export interface AgentDefinition { + description: string; + prompt: string; + tools: string[]; + model?: ModelConfig; +} + +export type ResolvedAgentDefinition = Omit & { model?: string }; + +const SUBAGENT_DEFINITIONS: Record = { + 'dependency-tracer': { + description: + 'Traces cross-file dependencies and identifies coupling issues, circular dependencies, and import chains that may be affected by changes', + prompt: `You are a dependency analysis expert for TypeScript/JavaScript codebases. + +Your job is to: +1. Trace import/export relationships between changed files and the rest of the codebase +2. Identify which other files might be affected by the changes +3. Detect circular dependencies that could cause issues +4. Find tightly coupled modules that violate separation of concerns + +When analyzing dependencies: +- Use trace_imports to understand what each changed file imports and what imports it +- Use find_usages to find all places where exported symbols are used +- Look for patterns indicating tight coupling (many bidirectional imports, god modules) + +Return issues in this JSON format: +[{ + "type": "maintainability", + "severity": "medium" | "high", + "description": "Clear description of the dependency issue", + "location": "file:line", + "reasoning": "Why this is problematic", + "suggestion": "How to fix it", + "confidence": 7-10 +}] + +If no dependency issues are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__qualops-agentic-tools__trace_imports', + 'mcp__qualops-agentic-tools__find_usages', + ], + }, + + 'breaking-change-detector': { + description: + 'Detects breaking API changes, interface modifications, export removals, and function signature changes that could affect consumers', + prompt: `You are a breaking change detection expert. + +Your job is to identify changes that could break consumers of this code: +1. Removed or renamed exports +2. Changed function signatures (parameters, return types) +3. Modified interface/type definitions +4. Changed class method signatures or removed methods +5. Behavioral changes that could break existing usage patterns + +When analyzing: +- Use git_diff_analysis to see what changed +- Use analyze_exports to compare exports between versions +- Use find_interface_changes to detect interface modifications +- Use find_usages to understand the impact scope + +Focus on PUBLIC API changes - internal implementation changes are fine. + +Return issues in this JSON format: +[{ + "type": "bug", + "severity": "critical" | "high", + "description": "What breaking change was detected", + "location": "file:line", + "reasoning": "Why this breaks consumers and what code would be affected", + "suggestion": "How to make this non-breaking or properly deprecate", + "confidence": 8-10 +}] + +If no breaking changes are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'mcp__qualops-agentic-tools__git_diff_analysis', + 'mcp__qualops-agentic-tools__analyze_exports', + 'mcp__qualops-agentic-tools__find_interface_changes', + 'mcp__qualops-agentic-tools__find_usages', + ], + }, + + 'security-analyzer': { + description: + 'Performs deep security analysis including injection vulnerabilities, authentication issues, credential exposure, and data validation problems', + prompt: `You are a security expert analyzing code for vulnerabilities. + +Your job is to identify security issues: +1. Injection vulnerabilities (SQL, NoSQL, command injection, XSS) +2. Authentication and authorization weaknesses +3. Sensitive data exposure (hardcoded secrets, logging PII) +4. Insecure cryptographic practices +5. Path traversal and file access vulnerabilities +6. Insecure deserialization +7. Missing input validation at trust boundaries + +When analyzing: +- Trace data flow from user input to dangerous sinks +- Use find_usages to track how user-controlled data propagates +- Look for patterns like eval(), exec(), shell commands with user input +- Check for hardcoded API keys, passwords, or tokens + +IMPORTANT: Only report issues with HIGH confidence. Verify by tracing actual data flow. + +Return issues in this JSON format: +[{ + "type": "security", + "severity": "critical" | "high" | "medium", + "description": "What vulnerability was found", + "location": "file:line", + "reasoning": "How this could be exploited, attack vector", + "suggestion": "Specific remediation steps", + "confidence": 8-10, + "cwe": "CWE-XXX if applicable" +}] + +If no security issues are found, return an empty array: []`, + tools: [ + 'Read', + 'Grep', + 'Glob', + 'mcp__qualops-agentic-tools__find_usages', + 'mcp__qualops-agentic-tools__trace_imports', + ], + }, + + 'pattern-validator': { + description: + 'Validates code against established patterns, best practices, error handling, and architectural guidelines found in the existing codebase', + prompt: `You are a code quality expert who understands existing patterns in a codebase. + +Your job is to: +1. Identify deviations from established patterns in the codebase +2. Find error handling issues (missing try/catch, swallowed errors, inconsistent patterns) +3. Detect code smells and anti-patterns +4. Check for consistent naming conventions +5. Identify opportunities for using existing utilities instead of reinventing + +When analyzing: +- First understand the existing patterns by reading similar files +- Use Grep to find how similar problems are solved elsewhere +- Compare the changed code against established conventions + +IMPORTANT: Only report issues where there's a clear established pattern being violated. +Don't enforce arbitrary rules - enforce consistency with the existing codebase. + +Return issues in this JSON format: +[{ + "type": "maintainability", + "severity": "low" | "medium", + "description": "What pattern violation was found", + "location": "file:line", + "reasoning": "How this differs from the established pattern", + "suggestion": "How to align with existing patterns", + "confidence": 7-9 +}] + +If no pattern violations are found, return an empty array: []`, + tools: ['Read', 'Grep', 'Glob'], + }, +}; + +export function createSubagentDefinitions(config: AgenticConfig): Record { + const enabled = + config.enabledSubagents || (Object.keys(SUBAGENT_DEFINITIONS) as AgenticSubagentType[]); + const definitions: Record = {}; + + const bashSubagentAccess = config.bash?.subagentAccess ?? 'all'; + const addBash = bashSubagentAccess === 'all'; + + for (const entry of enabled) { + const type = typeof entry === 'string' ? entry : entry.name; + if (type in SUBAGENT_DEFINITIONS) { + const def = { ...SUBAGENT_DEFINITIONS[type as AgenticSubagentType] }; + if (addBash && !def.tools.includes('Bash')) { + def.tools = [...def.tools, 'Bash']; + } + definitions[type] = def; + } + } + + return definitions; +} + +export function getAllSubagentTypes(): AgenticSubagentType[] { + return Object.keys(SUBAGENT_DEFINITIONS) as AgenticSubagentType[]; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/env-scrub.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/env-scrub.ts new file mode 100644 index 00000000..34dca162 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/env-scrub.ts @@ -0,0 +1,193 @@ +const EXPLICIT_DENY: ReadonlySet = new Set([ + 'GITHUB_TOKEN', + 'GITHUB_APP_TOKEN', + 'GH_TOKEN', + 'ANTHROPIC_API_KEY', + 'OPENAI_API_KEY', + 'NPM_TOKEN', + 'CODECOV_TOKEN', +]); + +const DENY_SUFFIXES: readonly string[] = [ + '_TOKEN', + '_SECRET', + '_KEY', + '_PASSWORD', + '_PASS', + '_CREDENTIAL', + '_CREDENTIALS', +]; + +const DENY_PREFIXES: readonly string[] = ['AWS_', 'AZURE_', 'GCP_', 'GOOGLE_', 'VAULT_']; + +const _ALLOW_LIST: ReadonlySet = new Set([ + // Shell / process fundamentals + 'HOME', + 'USER', + 'LOGNAME', + 'SHELL', + 'TERM', + 'LANG', + 'LC_ALL', + 'LC_CTYPE', + 'LC_MESSAGES', + 'TZ', + 'TMPDIR', + 'TEMP', + 'TMP', + 'XDG_RUNTIME_DIR', + 'XDG_DATA_HOME', + 'XDG_CONFIG_HOME', + 'XDG_CACHE_HOME', + + // Paths + 'PATH', + 'LD_LIBRARY_PATH', + 'DYLD_LIBRARY_PATH', + 'MANPATH', + 'PKG_CONFIG_PATH', + 'JAVA_HOME', + 'CARGO_HOME', + 'GOPATH', + 'GOROOT', + 'GOBIN', + 'RUSTUP_HOME', + 'PYENV_ROOT', + 'NVM_DIR', + 'NODE_PATH', + 'PYTHONPATH', + + // Build / CI environment metadata (non-secret) + 'CI', + 'RUNNER_NAME', + 'RUNNER_OS', + 'RUNNER_ARCH', + 'RUNNER_TEMP', + 'RUNNER_TOOL_CACHE', + 'RUNNER_WORKSPACE', + 'GITHUB_ACTIONS', + 'GITHUB_WORKSPACE', + 'GITHUB_REPOSITORY', + 'GITHUB_REF', + 'GITHUB_SHA', + 'GITHUB_RUN_ID', + 'GITHUB_RUN_NUMBER', + 'GITHUB_JOB', + 'GITHUB_EVENT_NAME', + 'GITHUB_HEAD_REF', + 'GITHUB_BASE_REF', + 'GITHUB_SERVER_URL', + 'GITHUB_API_URL', + 'GITHUB_GRAPHQL_URL', + 'GITHUB_ACTOR', + 'GITHUB_REPOSITORY_OWNER', + + // Node.js runtime + 'NODE_ENV', + 'NODE_OPTIONS', + 'NODE_NO_WARNINGS', + 'npm_config_registry', + + // Qualops-specific (non-secret) + 'QUALOPS_ENV_SCRUBBED', + 'QUALOPS_REVIEW_ID', + 'QUALOPS_SANDBOX_MODE', + 'QUALOPS_ALLOW_UNSANDBOXED', + 'QUALOPS_WORKSPACE', + + // Git overrides we set ourselves + 'GIT_CONFIG_NOSYSTEM', + 'GIT_CONFIG_GLOBAL', + 'GIT_CEILING_DIRECTORIES', + 'GIT_TERMINAL_PROMPT', + 'GIT_ASKPASS', + + // Display / terminal + 'DISPLAY', + 'WAYLAND_DISPLAY', + 'COLORTERM', + 'NO_COLOR', + 'FORCE_COLOR', +]); + +function isSecretLike(name: string): boolean { + const upper = name.toUpperCase(); + if (EXPLICIT_DENY.has(name)) return true; + for (const suffix of DENY_SUFFIXES) { + if (upper.endsWith(suffix)) return true; + } + for (const prefix of DENY_PREFIXES) { + if (upper.startsWith(prefix)) return true; + } + return false; +} + +export interface ScrubResult { + env: NodeJS.ProcessEnv; + dropped: string[]; +} + +function injectGitSafetyOverrides(env: NodeJS.ProcessEnv, gitConfigPath?: string): void { + env['GIT_CONFIG_NOSYSTEM'] = '1'; + env['GIT_TERMINAL_PROMPT'] = '0'; + env['GIT_ASKPASS'] = 'true'; // no-op credential helper — prevents interactive prompts + env['GIT_CEILING_DIRECTORIES'] = '/workspace'; + if (gitConfigPath) { + env['GIT_CONFIG_GLOBAL'] = gitConfigPath; + } +} + +export function scrubEnv(src: NodeJS.ProcessEnv, gitConfigPath?: string): ScrubResult { + const result: NodeJS.ProcessEnv = {}; + const dropped: string[] = []; + + for (const [key, value] of Object.entries(src)) { + if (value === undefined) continue; + if (isSecretLike(key)) { + dropped.push(key); + continue; + } + result[key] = value; + } + + injectGitSafetyOverrides(result, gitConfigPath); + result['QUALOPS_ENV_SCRUBBED'] = '1'; + + return { env: result, dropped }; +} + +function replaceProcessEnv(replacement: NodeJS.ProcessEnv): void { + for (const key of Object.keys(process.env)) { + if (!(key in replacement)) delete process.env[key]; + } + for (const [key, value] of Object.entries(replacement)) { + process.env[key] = value; + } +} + +/** + * Applies env scrub in-place to `process.env`. + * IMPORTANT: Call this at the very top of cli.ts before any SDK imports. + * Idempotent — returns early if QUALOPS_ENV_SCRUBBED is already set. + */ +export function applyEnvScrub(gitConfigPath?: string): void { + if (process.env['QUALOPS_ENV_SCRUBBED'] === '1') return; + + const { env, dropped } = scrubEnv(process.env, gitConfigPath); + + replaceProcessEnv(env); + + if (dropped.length > 0 && process.env['QUALOPS_DEBUG_ENV_SCRUB'] === '1') { + process.stderr.write( + `[qualops/env-scrub] dropped ${dropped.length} secret env vars: ${dropped.join(', ')}\n`, + ); + } +} + +export function makeCleanEnv( + gitConfigPath: string, + extra: NodeJS.ProcessEnv = {}, +): NodeJS.ProcessEnv { + const { env } = scrubEnv(process.env, gitConfigPath); + return { ...env, ...extra }; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/exec.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/exec.ts new file mode 100644 index 00000000..9231419c --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/exec.ts @@ -0,0 +1,117 @@ +import { stripAnsi } from './ansi-strip.js'; +import { getSemanticHint } from './exit-semantics.js'; +import type { SandboxDriver } from './modes/interface.js'; +import { truncateOutput } from './output-limit.js'; +import type { BashInput, BashOutput, BashExitReason } from './schema.js'; +import { redactOutput } from './secret-redact.js'; +import type { BashShellSession, SessionExecResult } from './session-impl.js'; +import { logger } from '../../../../../shared/utils/logger.js'; + +export interface ExecOptions { + reviewId: string; + callId: string; + workspaceRoot?: string; + knownTokens?: string[]; +} + +export async function execBashCommand( + input: BashInput, + session: BashShellSession, + driver: SandboxDriver, + opts: ExecOptions, +): Promise { + const timeoutMs = input.timeout_ms ?? 60000; + const { reviewId, callId, workspaceRoot = '/workspace', knownTokens = [] } = opts; + + let result: SessionExecResult; + let exitReason: BashExitReason = 'exited'; + const violations: string[] = []; + + logger.info(`[bash] → command: ${input.command.substring(0, 300)}`); + + const preResult = await driver.preCall(callId, input.command); + violations.push(...preResult.violations); + + const timeoutHandle = setTimeout(() => {}, timeoutMs); + + try { + const raceResult = await Promise.race([ + session.exec(input.command, workspaceRoot), + new Promise((resolve) => setTimeout(() => resolve(null), timeoutMs)), + ]); + + clearTimeout(timeoutHandle); + + if (raceResult === null) { + exitReason = 'timed_out'; + result = { + stdout: '', + stderr: `[qualops/exec] Command timed out after ${timeoutMs}ms`, + exitCode: 124, + durationMs: timeoutMs, + cwdDrifted: false, + cwdDriftNotice: '', + }; + } else { + result = raceResult; + } + } catch (err) { + clearTimeout(timeoutHandle); + logger.error('[bash/exec] Unexpected error during exec', { err, callId }); + result = { + stdout: '', + stderr: `[qualops/exec] Internal error: ${String(err)}`, + exitCode: 1, + durationMs: 0, + cwdDrifted: false, + cwdDriftNotice: '', + }; + } + + const postResult = await driver.postCall(callId, input.command); + violations.push(...postResult.violations); + + if (postResult.abortReview) { + logger.warn('[bash/exec] Sandbox abort triggered', { callId, reason: postResult.abortReason }); + violations.push(`ABORT_REVIEW: ${postResult.abortReason}`); + } + + let stderrWithDrift = result.stderr; + if (result.cwdDrifted) { + stderrWithDrift = (stderrWithDrift ? stderrWithDrift + '\n' : '') + result.cwdDriftNotice; + } + + const strippedStdout = stripAnsi(result.stdout); + const strippedStderr = stripAnsi(stderrWithDrift); + + const redactedStdout = redactOutput(strippedStdout, knownTokens); + const redactedStderr = redactOutput(strippedStderr, knownTokens); + + const truncatedStdout = truncateOutput(redactedStdout, reviewId, callId, 'stdout'); + const truncatedStderr = truncateOutput(redactedStderr, reviewId, callId, 'stderr'); + + const binary = input.command.trim().split(/\s+/)[0]?.replace(/^.*\//, '') ?? ''; + const semanticHint = getSemanticHint(binary, result.exitCode); + + const hint = semanticHint ? ` (${semanticHint})` : ''; + logger.info(`[bash] ← exit=${result.exitCode}${hint}, duration=${result.durationMs}ms`); + if (truncatedStdout.content) { + logger.info(`[bash] ← stdout: ${truncatedStdout.content.substring(0, 500)}`); + } + if (truncatedStderr.content) { + logger.info(`[bash] ← stderr: ${truncatedStderr.content.substring(0, 200)}`); + } + + return { + stdout: truncatedStdout.content, + stderr: truncatedStderr.content, + exit_code: result.exitCode, + exit_reason: exitReason, + duration_ms: result.durationMs, + stdout_truncated: truncatedStdout.truncated, + stderr_truncated: truncatedStderr.truncated, + stdout_full_path: truncatedStdout.fullPath, + semantic_hint: semanticHint, + sandbox_violations: violations, + }; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/git-config-template.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/git-config-template.ts new file mode 100644 index 00000000..e37f86d5 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/git-config-template.ts @@ -0,0 +1,98 @@ +/** + * Synthesises a minimal ~/.gitconfig that hardens git against hook injection, + * fsmonitor gadgets, file-protocol clones, and GPG signing. + * + * Write once per session start; pass the returned path as GIT_CONFIG_GLOBAL. + */ + +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +const GIT_CONFIG_CONTENT = ` +[core] +\tfsmonitor = false +\thooksPath = /dev/null +\tgitProxy = none +\tautocrlf = false +\tsymlinks = false + +[gpg] +\tprogram = /bin/false + +[transfer] +\tfsckObjects = true + +[fetch] +\tfsckObjects = true + +[receive] +\tfsckObjects = true +\tdenyCurrentBranch = refuse +\tdenyNonFastForwards = true + +[protocol "file"] +\tallow = never + +[protocol "ext"] +\tallow = never + +[protocol "git"] +\tallow = never + +[http] +\tsslVerify = true +\tfollowRedirects = false + +[credential] +\thelper = /bin/false + +[safe] +\tdirectory = /workspace/pr +\tdirectory = /workspace/base + +[advice] +\tdetachedHead = false + +[alias] +`.trimStart(); + +let cachedPath: string | null = null; + +/** + * Writes a hardened gitconfig to a stable tmpdir path and returns that path. + * Idempotent: returns the same path on subsequent calls within the same process. + */ +export function getGitConfigPath(): string { + if (cachedPath !== null) return cachedPath; + + const dir = path.join(os.tmpdir(), `qualops-git-${process.pid}`); + fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); + + const configPath = path.join(dir, 'gitconfig'); + fs.writeFileSync(configPath, GIT_CONFIG_CONTENT, { encoding: 'utf8', mode: 0o600 }); + + cachedPath = configPath; + return configPath; +} + +/** + * Returns the contents of the synthesised gitconfig (useful for tests). + */ +export function getGitConfigContent(): string { + return GIT_CONFIG_CONTENT; +} + +/** + * Cleans up the temporary directory (call on process exit if desired). + */ +export function cleanupGitConfig(): void { + if (cachedPath === null) return; + try { + const dir = path.dirname(cachedPath); + fs.rmSync(dir, { recursive: true, force: true }); + } catch { + // Best-effort cleanup + } + cachedPath = null; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.linux.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.linux.ts new file mode 100644 index 00000000..d2e49f2a --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.linux.ts @@ -0,0 +1,99 @@ +/** + * Best-effort sandbox for Linux using bubblewrap (bwrap). + * + * Wraps the persistent shell command with bwrap to get: + * - Read-only bind mount of /workspace (writes blocked) + * - Isolated /tmp per call + * - No network namespace (network is separate concern) + * - seccomp filter (blocks socket() for outbound network) + * + * Falls back gracefully if bwrap is not available. + */ + +import * as child_process from 'child_process'; + +import type { SandboxDriver, SpawnOptions, PreCallResult, PostCallResult } from './interface.js'; + +function isBwrapAvailable(): boolean { + try { + const result = child_process.spawnSync('bwrap', ['--version'], { timeout: 2000 }); + return result.status === 0; + } catch { + return false; + } +} + +export class BestEffortLinuxDriver implements SandboxDriver { + readonly name = 'besteffort.linux'; + private workspaceRoot: string; + + constructor(workspaceRoot: string = '/workspace') { + this.workspaceRoot = workspaceRoot; + } + + static isAvailable(): boolean { + return process.platform === 'linux' && isBwrapAvailable(); + } + + async prepare(opts: SpawnOptions): Promise { + // Wrap the argv with bwrap + const bwrapArgs = [ + // Read-only workspace mount + '--ro-bind', + this.workspaceRoot, + this.workspaceRoot, + // Essential system paths (read-only) + '--ro-bind', + '/usr', + '/usr', + '--ro-bind', + '/lib', + '/lib', + '--ro-bind', + '/lib64', + '/lib64', + '--ro-bind', + '/bin', + '/bin', + '--ro-bind', + '/sbin', + '/sbin', + // Proc/dev/sys + '--proc', + '/proc', + '--dev', + '/dev', + '--tmpfs', + '/tmp', + '--tmpfs', + '/run', + // Home dir — allow writes to tmp subdir + '--tmpfs', + '/root', + // Deny network (new network namespace with no interfaces) + '--unshare-net', + // Deny user namespace creation + '--new-session', + // Working directory + '--chdir', + opts.cwd, + '--', + ...opts.argv, + ]; + + return { + ...opts, + argv: ['bwrap', ...bwrapArgs], + }; + } + + async preCall(_callId: string, _command: string): Promise { + return { violations: [] }; + } + + async postCall(_callId: string, _command: string): Promise { + return { violations: [], abortReview: false }; + } + + async teardown(): Promise {} +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.macos.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.macos.ts new file mode 100644 index 00000000..306c830f --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.macos.ts @@ -0,0 +1,106 @@ +/** + * Best-effort sandbox for macOS using sandbox-exec (Seatbelt). + * + * Applies a Seatbelt profile that: + * - Allows read access to /workspace and standard system paths + * - Denies network outbound connections + * - Denies writes outside /tmp and /workspace + */ + +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +import type { SandboxDriver, SpawnOptions, PreCallResult, PostCallResult } from './interface.js'; + +function buildSeatbeltProfile(workspaceRoot: string): string { + return ` +(version 1) +(deny default) + +; Allow reading the workspace (PR code under review) +(allow file-read* (subpath "${workspaceRoot}")) + +; Allow standard system reads +(allow file-read* + (subpath "/usr") + (subpath "/bin") + (subpath "/sbin") + (subpath "/lib") + (subpath "/private/etc") + (literal "/dev/urandom") + (literal "/dev/random") + (literal "/dev/null") + (literal "/dev/zero") +) + +; Allow writes only to /tmp and sandbox tmp +(allow file-write* (subpath "/private/tmp")) +(allow file-write* (subpath "/tmp")) + +; Allow process operations +(allow process-exec*) +(allow process-fork) + +; Allow IPC (pipes between child processes) +(allow ipc-posix-shm) +(allow ipc-posix-sem) +(allow ipc-sysv-shm) +(allow ipc-sysv-sem) + +; Deny all network +(deny network*) +(deny system-socket) +`.trim(); +} + +export class BestEffortMacOSDriver implements SandboxDriver { + readonly name = 'besteffort.macos'; + private workspaceRoot: string; + private profilePath: string | null = null; + + constructor(workspaceRoot: string = '/workspace') { + this.workspaceRoot = workspaceRoot; + } + + static isAvailable(): boolean { + return process.platform === 'darwin'; + } + + private getProfilePath(): string { + if (this.profilePath) return this.profilePath; + const dir = path.join(os.tmpdir(), `qualops-seatbelt-${process.pid}`); + fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); + const profilePath = path.join(dir, 'profile.sb'); + fs.writeFileSync(profilePath, buildSeatbeltProfile(this.workspaceRoot), { mode: 0o600 }); + this.profilePath = profilePath; + return profilePath; + } + + async prepare(opts: SpawnOptions): Promise { + const profilePath = this.getProfilePath(); + return { + ...opts, + argv: ['sandbox-exec', '-f', profilePath, '--', ...opts.argv], + }; + } + + async preCall(_callId: string, _command: string): Promise { + return { violations: [] }; + } + + async postCall(_callId: string, _command: string): Promise { + return { violations: [], abortReview: false }; + } + + async teardown(): Promise { + if (this.profilePath) { + try { + fs.rmSync(path.dirname(this.profilePath), { recursive: true, force: true }); + } catch { + /* best-effort cleanup */ + } + this.profilePath = null; + } + } +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.win.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.win.ts new file mode 100644 index 00000000..c8bb592d --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/besteffort.win.ts @@ -0,0 +1,36 @@ +/** + * Best-effort sandbox for Windows using a restricted process token. + * Strips the WRITE_DAC, WRITE_OWNER, and most privileges from the token. + * Network isolation is not available without Hyper-V or WSL2. + * + * NOTE: This is a structural stub for M0. Full implementation requires + * the `winreg` and `win32api` native modules which are Windows-only. + * The driver is only instantiated when process.platform === 'win32'. + */ + +import type { SandboxDriver, SpawnOptions, PreCallResult, PostCallResult } from './interface.js'; + +export class BestEffortWindowsDriver implements SandboxDriver { + readonly name = 'besteffort.win'; + + static isAvailable(): boolean { + return process.platform === 'win32'; + } + + async prepare(opts: SpawnOptions): Promise { + // On Windows we rely on the caller to spawn with a restricted token. + // The shell command is passed through unmodified; the caller (session.ts) + // is responsible for using CreateRestrictedToken via native bindings. + return opts; + } + + async preCall(_callId: string, _command: string): Promise { + return { violations: [] }; + } + + async postCall(_callId: string, _command: string): Promise { + return { violations: [], abortReview: false }; + } + + async teardown(): Promise {} +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/ci.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/ci.ts new file mode 100644 index 00000000..96d694cb --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/ci.ts @@ -0,0 +1,141 @@ +/** + * CI sandbox driver. + * + * On GitHub Actions runners and similar CI environments, the OS already + * provides process isolation (separate VM/container per job). This driver: + * - Applies rlimits via `ulimit` wrapper commands + * - Injects HTTP_PROXY / HTTPS_PROXY to route network to a blocking proxy + * - Snapshots git hooks inode+mtime before each call; aborts review on tampering + */ + +import * as fs from 'fs'; +import * as path from 'path'; + +import type { SandboxDriver, SpawnOptions, PreCallResult, PostCallResult } from './interface.js'; + +interface HookSnapshot { + [filePath: string]: { ino: number; mtimeMs: number; size: number }; +} + +interface CISandboxConfig { + /** Path to the PR's .git/hooks directory to monitor. */ + prHooksDir?: string; + /** HTTP proxy URL for blocking outbound network (if any). */ + httpProxy?: string; + /** Maximum number of file descriptors. Default: 256. */ + maxFds?: number; + /** Maximum number of processes. Default: 64. */ + maxProcs?: number; + /** Maximum memory in MB. Default: 512. */ + maxMemoryMb?: number; +} + +function snapshotHooks(hooksDir: string): HookSnapshot { + const snapshot: HookSnapshot = {}; + if (!fs.existsSync(hooksDir)) return snapshot; + + for (const entry of fs.readdirSync(hooksDir)) { + const fullPath = path.join(hooksDir, entry); + try { + const stat = fs.statSync(fullPath); + snapshot[fullPath] = { + ino: stat.ino, + mtimeMs: stat.mtimeMs, + size: stat.size, + }; + } catch { + // File disappeared between readdir and stat — ignore + } + } + return snapshot; +} + +function compareSnapshots(before: HookSnapshot, after: HookSnapshot): string[] { + const violations: string[] = []; + + // Check for new or modified hooks + for (const [filePath, afterStat] of Object.entries(after)) { + const beforeStat = before[filePath]; + if (!beforeStat) { + violations.push(`New git hook created: ${path.basename(filePath)}`); + } else if ( + beforeStat.ino !== afterStat.ino || + beforeStat.mtimeMs !== afterStat.mtimeMs || + beforeStat.size !== afterStat.size + ) { + violations.push(`Git hook modified: ${path.basename(filePath)}`); + } + } + + // Check for deleted hooks (less dangerous but still notable) + for (const filePath of Object.keys(before)) { + if (!(filePath in after)) { + violations.push(`Git hook deleted: ${path.basename(filePath)}`); + } + } + + return violations; +} + +export class CISandboxDriver implements SandboxDriver { + readonly name = 'ci'; + private config: Required; + private hookSnapshot: HookSnapshot = {}; + + constructor(config: CISandboxConfig = {}) { + this.config = { + prHooksDir: config.prHooksDir ?? '', + httpProxy: config.httpProxy ?? 'http://127.0.0.1:1', // localhost blackhole + maxFds: config.maxFds ?? 256, + maxProcs: config.maxProcs ?? 64, + maxMemoryMb: config.maxMemoryMb ?? 512, + }; + } + + async prepare(opts: SpawnOptions): Promise { + // Inject proxy env vars to block outbound network + const env: NodeJS.ProcessEnv = { + ...opts.env, + HTTP_PROXY: this.config.httpProxy, + HTTPS_PROXY: this.config.httpProxy, + http_proxy: this.config.httpProxy, + https_proxy: this.config.httpProxy, + // Prevent git from accessing its own credential helpers + GIT_TERMINAL_PROMPT: '0', + GIT_ASKPASS: 'true', + }; + + return { ...opts, env }; + } + + async preCall(_callId: string, _command: string): Promise { + // Snapshot hooks before each call + if (this.config.prHooksDir) { + this.hookSnapshot = snapshotHooks(this.config.prHooksDir); + } + return { violations: [] }; + } + + async postCall(_callId: string, _command: string): Promise { + if (!this.config.prHooksDir) { + return { violations: [], abortReview: false }; + } + + const afterSnapshot = snapshotHooks(this.config.prHooksDir); + const violations = compareSnapshots(this.hookSnapshot, afterSnapshot); + + if (violations.length > 0) { + return { + violations, + abortReview: true, + abortReason: `Git hook tampering detected (T4): ${violations.join('; ')}`, + }; + } + + return { violations: [], abortReview: false }; + } + + async teardown(): Promise { + this.hookSnapshot = {}; + } +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/detect.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/detect.ts new file mode 100644 index 00000000..185cdd1c --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/detect.ts @@ -0,0 +1,85 @@ +/** + * Auto-detects the appropriate sandbox mode based on the current environment. + * + * Priority order: + * 1. QUALOPS_SANDBOX_MODE env var (explicit override) + * 2. CI environment → CISandboxDriver + * 3. Linux + bwrap available → BestEffortLinuxDriver + * 4. macOS → BestEffortMacOSDriver + * 5. Windows → BestEffortWindowsDriver + * 6. QUALOPS_ALLOW_UNSANDBOXED=1 → NoneSandboxDriver (test only) + * 7. Error — cannot run without a sandbox + */ + +import { BestEffortLinuxDriver } from './besteffort.linux.js'; +import { BestEffortMacOSDriver } from './besteffort.macos.js'; +import { BestEffortWindowsDriver } from './besteffort.win.js'; +import { CISandboxDriver } from './ci.js'; +import type { SandboxDriver } from './interface.js'; +import { NoneSandboxDriver } from './none.js'; + +export type SandboxMode = 'ci' | 'local-besteffort' | 'none' | 'auto'; + +function isCI(): boolean { + return !!( + process.env['CI'] || + process.env['GITHUB_ACTIONS'] || + process.env['GITLAB_CI'] || + process.env['CIRCLECI'] || + process.env['TRAVIS'] || + process.env['JENKINS_URL'] || + process.env['BUILDKITE'] + ); +} + +export interface DetectOptions { + /** Explicit mode override. If 'auto', detection runs normally. */ + mode?: SandboxMode; + /** Path to the PR's workspace (used by best-effort drivers). */ + workspaceRoot?: string; + /** Path to the PR's .git/hooks dir (used by CI driver). */ + prHooksDir?: string; + /** HTTP proxy URL for CI driver. */ + httpProxy?: string; +} + +/** + * Returns the appropriate SandboxDriver for the current environment. + * Throws if no suitable driver can be found and unsandboxed mode is not allowed. + */ +export function detectSandboxDriver(opts: DetectOptions = {}): SandboxDriver { + const { mode = 'auto', workspaceRoot = '/workspace', prHooksDir, httpProxy } = opts; + + const explicitMode = + mode !== 'auto' ? mode : (process.env['QUALOPS_SANDBOX_MODE'] as SandboxMode | undefined); + + if (explicitMode === 'none') { + return NoneSandboxDriver.create(); + } + + if (explicitMode === 'ci' || (!explicitMode && isCI())) { + return new CISandboxDriver({ prHooksDir, httpProxy }); + } + + if (explicitMode === 'local-besteffort' || !explicitMode) { + if (BestEffortLinuxDriver.isAvailable()) { + return new BestEffortLinuxDriver(workspaceRoot); + } + if (BestEffortMacOSDriver.isAvailable()) { + return new BestEffortMacOSDriver(workspaceRoot); + } + if (BestEffortWindowsDriver.isAvailable()) { + return new BestEffortWindowsDriver(); + } + } + + if (process.env['QUALOPS_ALLOW_UNSANDBOXED'] === '1') { + return NoneSandboxDriver.create(); + } + + throw new Error( + 'No sandbox driver available for this environment. ' + + 'Set QUALOPS_ALLOW_UNSANDBOXED=1 to run without sandboxing (not recommended). ' + + 'Or set QUALOPS_SANDBOX_MODE=ci for CI environments.', + ); +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/interface.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/interface.ts new file mode 100644 index 00000000..4a3ba105 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/interface.ts @@ -0,0 +1,56 @@ +/** + * SandboxDriver interface — implemented by each sandbox mode. + * + * Drivers wrap the persistent shell process to add OS-level confinement. + * The driver is responsible for: + * 1. pre-call setup (rlimits, cwd validation, hook snapshot) + * 2. post-call verification (hook snapshot comparison) + * 3. reporting violations + */ + +export interface SpawnOptions { + /** Absolute path to use as the initial working directory. */ + cwd: string; + /** Environment variables for the child process. */ + env: NodeJS.ProcessEnv; + /** The command + args to spawn (first element is the executable). */ + argv: string[]; +} + +export interface PreCallResult { + /** Any pre-call violations detected (e.g. suspicious hook state). */ + violations: string[]; +} + +export interface PostCallResult { + /** Any post-call violations detected (e.g. hooks were tampered). */ + violations: string[]; + /** If true, the calling code should abort the current review session. */ + abortReview: boolean; + abortReason?: string; +} + +export interface SandboxDriver { + readonly name: string; + + /** + * Called once before the persistent shell is spawned. + * Returns the modified SpawnOptions (may wrap argv with sandbox commands). + */ + prepare(opts: SpawnOptions): Promise; + + /** + * Called before each bash tool call. + */ + preCall(callId: string, command: string): Promise; + + /** + * Called after each bash tool call completes. + */ + postCall(callId: string, command: string): Promise; + + /** + * Called when the session is disposed. + */ + teardown(): Promise; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/none.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/none.ts new file mode 100644 index 00000000..90fed4fa --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/modes/none.ts @@ -0,0 +1,35 @@ +/** + * None sandbox driver — no OS-level confinement. + * ONLY for local development and unit tests. + * Gated by QUALOPS_ALLOW_UNSANDBOXED=1. + */ + +import type { SandboxDriver, SpawnOptions, PreCallResult, PostCallResult } from './interface.js'; + +export class NoneSandboxDriver implements SandboxDriver { + readonly name = 'none'; + + static create(): NoneSandboxDriver { + if (process.env['QUALOPS_ALLOW_UNSANDBOXED'] !== '1') { + throw new Error( + 'NoneSandboxDriver requires QUALOPS_ALLOW_UNSANDBOXED=1 env var. ' + + 'Do not use in production.', + ); + } + return new NoneSandboxDriver(); + } + + async prepare(opts: SpawnOptions): Promise { + return opts; + } + + async preCall(_callId: string, _command: string): Promise { + return { violations: [] }; + } + + async postCall(_callId: string, _command: string): Promise { + return { violations: [], abortReview: false }; + } + + async teardown(): Promise {} +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/parser.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/parser.ts new file mode 100644 index 00000000..ca9b8fc5 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/parser.ts @@ -0,0 +1,172 @@ +/** + * Structural command parser (Option B — regex-based for M0). + * + * Tokenises a shell command string into a sequence of "logical commands" + * (the parts separated by top-level operators: ;, &&, ||, |, newline). + * Does NOT perform full AST parsing — that is deferred to tree-sitter in M1. + * + * The key operation is stripping quoted substrings from the analysis copy + * so that operators inside strings are not mistakenly treated as structural. + */ + +export interface ParsedCommand { + /** The raw text of this logical command (may span pipelines). */ + raw: string; + /** The first word of the command (binary name). */ + binary: string; + /** All arguments after the binary (split on whitespace, unquoted). */ + args: string[]; + /** True if the raw text contains unquoted redirection operators. */ + hasRedirection: boolean; + /** True if the raw text contains unquoted variable/subshell expansions. */ + hasExpansion: boolean; + /** True if the raw text ends with an unquoted background `&`. */ + hasBackground: boolean; + /** True if the raw text contains unquoted NUL or literal newline. */ + hasControlChars: boolean; +} + +export interface TokenisedShell { + /** The raw input command string. */ + raw: string; + /** Individual logical commands extracted from the input. */ + commands: ParsedCommand[]; + /** True if parsing detected a structural problem (e.g. unmatched quotes). */ + parseError: boolean; + parseErrorMessage?: string; +} + +/** + * Strips single-quoted substrings and replaces them with placeholders. + * Content inside single quotes cannot contain variable expansions and + * is literal — we don't want to false-positive on `$ inside them. + */ +function stripSingleQuoted(input: string): string { + // Replace 'content' (not containing single quotes) with safe placeholder + return input.replace(/'[^']*'/g, "'__SQ__'"); +} + +/** + * Strips double-quoted substrings (best-effort — does not handle nested + * escapes perfectly, which is fine for M0 deny-list checking). + */ +function stripDoubleQuoted(input: string): string { + // Simple approach: replace "content" sequences, being conservative about + // what we strip — only replace if the close-quote is on the same "line". + return input.replace(/"(?:[^"\\]|\\.)*"/g, '"__DQ__"'); +} + +/** + * Returns the "analysis copy" of a command: single- and double-quoted + * substrings replaced with placeholders so we can reliably detect + * top-level structural tokens. + */ +export function toAnalysisCopy(cmd: string): string { + let s = cmd; + // Iteratively strip to handle adjacent quoted strings + for (let i = 0; i < 5; i++) { + const prev = s; + s = stripSingleQuoted(s); + s = stripDoubleQuoted(s); + if (s === prev) break; + } + return s; +} + +/** + * Splits a command string on top-level operators (;, &&, ||, |, newline) + * and returns each logical segment. Operates on the analysis copy to + * avoid splitting on operators inside quoted strings. + */ +export function splitOnOperators(cmd: string): string[] { + const analysis = toAnalysisCopy(cmd); + const segments: string[] = []; + let start = 0; + + for (let i = 0; i < analysis.length; i++) { + const ch = analysis[i]; + const next = analysis[i + 1]; + + if (ch === '\n') { + segments.push(cmd.slice(start, i).trim()); + start = i + 1; + } else if (ch === ';') { + segments.push(cmd.slice(start, i).trim()); + start = i + 1; + } else if ((ch === '&' && next === '&') || (ch === '|' && next === '|')) { + segments.push(cmd.slice(start, i).trim()); + start = i + 2; + i++; // skip second char + } else if (ch === '|' && next !== '|') { + // pipeline — keep as part of same logical command for policy checking + // (we parse each pipeline stage separately below) + segments.push(cmd.slice(start, i).trim()); + start = i + 1; + } + } + const tail = cmd.slice(start).trim(); + if (tail) segments.push(tail); + + return segments.filter(Boolean); +} + +/** + * Parses a single logical command (no structural operators) into its + * binary and arguments. Strips leading environment assignments. + */ +export function parseLogicalCommand(raw: string): ParsedCommand { + const analysis = toAnalysisCopy(raw); + + const hasRedirection = + /(?:^|[^<>])[<>](?![<>])/.test(analysis) || />>/.test(analysis) || /< = new Set([ + // Privilege escalation + 'sudo', + 'su', + 'doas', + 'pkexec', + 'runuser', + 'nsenter', + 'unshare', + 'chroot', + 'capsh', + 'newuidmap', + 'newgidmap', + + // Network tools that can exfiltrate data + 'curl', + 'wget', + 'nc', + 'netcat', + 'ncat', + 'socat', + 'telnet', + 'ssh', + 'scp', + 'sftp', + 'rsync', + 'ftp', + 'tftp', + 'nmap', + 'masscan', + 'zmap', + + // Package managers that can execute remote code + 'apt', + 'apt-get', + 'apt-cache', + 'dpkg', + 'rpm', + 'yum', + 'dnf', + 'pacman', + 'brew', + 'snap', + 'flatpak', + 'pip', + 'pip3', + 'pipx', + 'gem', + 'cargo', + 'go', + 'composer', + 'mvn', + 'gradle', + + // Destructive filesystem ops + 'rm', + 'rmdir', + 'shred', + 'wipe', + 'srm', + + // Mount / device + 'mount', + 'umount', + 'losetup', + 'mkfs', + 'fdisk', + 'parted', + 'blkid', + 'lsblk', + + // Process / signal + 'kill', + 'killall', + 'pkill', + 'reboot', + 'shutdown', + 'halt', + 'poweroff', + 'init', + 'systemctl', + 'service', + 'launchctl', + + // Interactive tools (must not block the persistent shell) + 'vim', + 'vi', + 'nano', + 'emacs', + 'pico', + 'ed', + 'less', + 'more', + 'top', + 'htop', + 'btop', + 'atop', + 'watch', + 'tail', // tail -f is interactive; `tail -n 20 file` is checked below + 'man', + 'tmux', + 'screen', + 'byobu', + + // Crypto / wallet tools + 'gpg', + 'gpg2', + 'pgp', + 'openssl', // blocked; use node crypto instead + 'keychain', + 'ssh-agent', + 'ssh-add', + 'ssh-keygen', + 'pass', + 'gopass', + 'vault', + + // Container / VM escapes + 'docker', + 'podman', + 'nerdctl', + 'crictl', + 'kubectl', + 'helm', + 'kind', + 'k3s', + 'virsh', + 'qemu', + 'kvm', + 'cgroups', + + // Compiler / build tools that network (bare invocations) + 'make', + 'cmake', + + // env / exec wrappers that could bypass policy + 'env', // use direct invocation instead + 'exec', + 'eval', + + // Background/session escape (§3.5) + 'setsid', + + // Misc high-risk + 'crontab', + 'at', + 'batch', + 'iptables', + 'ip6tables', + 'nftables', + 'ufw', + 'firewalld', + 'strace', + 'ltrace', + 'dtrace', + 'perf', + 'gdb', + 'lldb', + 'xxd', + 'od', // can be used to exfiltrate binary secrets + 'base64', // can encode/decode secrets — blocked at binary level + 'dd', + 'tee', // can silently copy output to a file +]); + +const DENIED_NPM_SUBCOMMANDS: ReadonlySet = new Set([ + 'install', + 'i', + 'ci', + 'add', + 'publish', + 'pack', + 'deprecate', + 'unpublish', + 'audit', // can make network calls + 'fund', + 'set', // can modify .npmrc with token + 'login', + 'adduser', + 'logout', + 'link', + 'unlink', + 'run', + 'start', + 'stop', + 'restart', + 'test', // can execute arbitrary scripts + 'exec', + 'x', + 'update', + 'up', + 'upgrade', +]); + +const ALLOWED_GIT_SUBCOMMANDS: ReadonlySet = new Set([ + 'log', + 'show', + 'diff', + 'status', + 'blame', + 'shortlog', + 'ls-files', + 'ls-tree', + 'rev-parse', + 'rev-list', + 'cat-file', + 'describe', + 'tag', + '--no-pager', + 'branch', + 'stash', + 'grep', + 'archive', // archive is read-only + 'format-patch', // read-only output + 'symbolic-ref', + 'name-rev', + 'for-each-ref', + 'check-ignore', + 'check-attr', + 'remote', // read-only subcommands checked further below + 'config', // checked further below for dangerous patterns +]); + +const DENIED_GIT_SUBCOMMANDS: ReadonlySet = new Set([ + 'push', + 'fetch', + 'pull', + 'clone', + 'submodule', + 'filter-branch', + 'replace', + 'fast-import', + 'gc', + 'prune', + 'repack', + 'apply', + 'am', + 'cherry-pick', + 'rebase', + 'merge', + 'reset', + 'commit', + 'add', + 'rm', + 'mv', + 'restore', + 'checkout', + 'switch', + 'init', + 'bisect', + 'worktree', + 'bundle', + 'clean', + 'update-ref', + 'update-index', + 'write-tree', + 'commit-tree', + 'read-tree', + 'hash-object', + 'update-server-info', + 'notes', + 'instaweb', + 'fsck', // can traverse loose objects + 'rerere', + 'credential', + 'send-email', + 'request-pull', + 'p4', + 'svn', + 'cvsserver', +]); + +const DENIED_GIT_CONFIG_PATTERNS: readonly RegExp[] = [ + /core\.fsmonitor/i, + /core\.hooksPath/i, + /core\.gitProxy/i, + /core\.editor/i, + /sequence\.editor/i, + /gpg\.program/i, + /credential\.helper/i, + /protocol\./i, + /http\.proxy/i, + /http\.cookieFile/i, + /transfer\.fsckObjects/i, + /uploadPack\./i, + /receivepack\./i, +]; + +function isPythonInteractive(args: string[]): boolean { + // No args or only flags that launch REPL + const filtered = args.filter((a) => !a.startsWith('-')); + if (filtered.length === 0) { + // Check for -c, -m, or script file + const flags = args.filter((a) => a.startsWith('-')); + const hasC = flags.includes('-c'); + const hasM = flags.includes('-m'); + if (!hasC && !hasM) return true; // bare python / python -i + } + return false; +} + +function isNodeInteractive(args: string[]): boolean { + // node with no file arg and no -e launches REPL + const hasFile = args.some((a) => !a.startsWith('-')); + const hasE = args.includes('-e') || args.includes('--eval'); + return !hasFile && !hasE; +} + +const DEFAULT_WORKSPACE_ROOTS = ['/workspace/pr', '/workspace/base', '/workspace']; + +function isAllowedCdTarget(target: string, workspaceRoot?: string): boolean { + // Explicit escape patterns — deny regardless of type + if (target === '~' || target.startsWith('~/') || target === '-') return false; + + // Relative paths are allowed (stay within cwd) + if (!target.startsWith('/')) return true; + + const allowed = workspaceRoot + ? [workspaceRoot, workspaceRoot + '/pr', workspaceRoot + '/base'] + : DEFAULT_WORKSPACE_ROOTS; + + for (const root of allowed) { + if (target === root || target.startsWith(root + '/')) return true; + } + return false; +} + +function checkEnvHijacking(analysis: string): PolicyOutcome | null { + if ( + /\bLD_PRELOAD\s*=/.test(analysis) || + /\bLD_LIBRARY_PATH\s*=/.test(analysis) || + /\bDYLD_INSERT_LIBRARIES\s*=/.test(analysis) || + /\bDYLD_LIBRARY_PATH\s*=/.test(analysis) + ) { + return deny('ld-preload', 'LD_PRELOAD / LD_LIBRARY_PATH / DYLD env hijacking is not allowed'); + } + return null; +} + +function checkHardDenyBinary(binaryBase: string, args: string[]): PolicyOutcome | null { + if (!HARD_DENY_BINARIES.has(binaryBase)) return null; + // Special carve-out: `tail -n N file` (non-interactive, non-follow) is OK + if (binaryBase === 'tail') { + const hasFollow = args.some((a) => a === '-f' || a === '--follow' || /^-[^-]*f/.test(a)); + if (!hasFollow) return allow; + } + return deny('denied-binary', `${binaryBase} is not allowed in the bash tool`); +} + +function checkPackageManager(binaryBase: string, args: string[]): PolicyOutcome | null { + if (!['npm', 'npx', 'yarn', 'pnpm', 'bun'].includes(binaryBase)) return null; + const sub = args[0]; + if (!sub) { + return deny('package-manager-interactive', `${binaryBase} without subcommand is not allowed`); + } + if (DENIED_NPM_SUBCOMMANDS.has(sub)) { + return deny('npm-install', `${binaryBase} ${sub} is not allowed — no network package installs`); + } + return null; +} + +function checkGitCommand(args: string[]): PolicyOutcome { + const dashCIdx = args.indexOf('-c'); + if (dashCIdx !== -1) { + const kvPair = args[dashCIdx + 1] ?? ''; + for (const pattern of DENIED_GIT_CONFIG_PATTERNS) { + if (pattern.test(kvPair)) { + return deny( + 'git-config-injection', + `git -c ${kvPair} is not allowed — potential hook/config injection`, + ); + } + } + } + + const subCmd = args.find((a) => !a.startsWith('-') && !a.includes('=')); + if (!subCmd) return allow; // git with no subcommand is harmless (shows usage) + + if (DENIED_GIT_SUBCOMMANDS.has(subCmd)) { + return deny('git-denied-subcommand', `git ${subCmd} is not allowed`); + } + if (!ALLOWED_GIT_SUBCOMMANDS.has(subCmd)) { + // Unknown subcommand — deny by default (allowlist approach) + return deny('git-unknown-subcommand', `git ${subCmd} is not in the allowed subcommand list`); + } + + if (subCmd === 'remote') { + const remoteArgs = args.slice(args.indexOf('remote') + 1); + const remoteSubCmd = remoteArgs.find((a) => !a.startsWith('-')); + if (remoteSubCmd && !['get-url', 'show', '-v', '--verbose'].includes(remoteSubCmd)) { + return deny('git-remote-write', `git remote ${remoteSubCmd} is not allowed`); + } + } + + return allow; +} + +function checkInteractiveRepl( + binaryBase: string, + args: string[], + raw: string, +): PolicyOutcome | null { + if (['python', 'python3', 'python3.10', 'python3.11', 'python3.12'].includes(binaryBase)) { + if (isPythonInteractive(args)) { + return deny( + 'interactive-repl', + `${binaryBase} without args or script launches an interactive REPL`, + ); + } + return allow; + } + + if (binaryBase === 'node') { + if (isNodeInteractive(args)) { + return deny('interactive-repl', 'node without file/eval arg launches an interactive REPL'); + } + return allow; + } + + if (binaryBase === 'psql') { + const hasCmd = args.includes('-c') || args.includes('--command'); + const hasFile = args.includes('-f') || args.includes('--file'); + if (!hasCmd && !hasFile) { + return deny('interactive-repl', 'psql without -c or -f launches an interactive shell'); + } + return allow; + } + + if (binaryBase === 'mysql') { + const hasE = args.includes('-e') || args.includes('--execute'); + if (!hasE) { + return deny('interactive-repl', 'mysql without -e launches an interactive shell'); + } + return allow; + } + + if (binaryBase === 'sqlite3') { + // Check raw command: a SQL keyword anywhere after the binary indicates a one-shot query. + // We do NOT use args[] here because the parser splits on whitespace and strips quotes, + // so '"SELECT * FROM users"' becomes ['"SELECT', '*', 'FROM', ...]. + const afterBinary = raw.replace(/^[^\s]+\s*/, ''); + const hasQuery = + /\b(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|PRAGMA|ATTACH)\b/i.test(afterBinary) || + /;/.test(afterBinary); + if (!hasQuery) { + return deny('interactive-repl', 'sqlite3 without a SQL query launches an interactive shell'); + } + return allow; + } + + if (['mongosh', 'ipython', 'deno'].includes(binaryBase)) { + return deny( + 'interactive-repl', + `${binaryBase} is not allowed — use python3, node, or sqlite3 equivalents`, + ); + } + + return null; +} + +function checkShellInvocation( + binaryBase: string, + args: string[], + raw: string, +): PolicyOutcome | null { + if (!['bash', 'sh', 'zsh'].includes(binaryBase)) return null; + + const dashCIdx = args.indexOf('-c'); + if (dashCIdx !== -1) { + const afterDashC = raw.replace(/^.*?\s-c\s+/, ''); + // Strip single-quoted substrings (no expansion), then check remainder for $. + // We intentionally keep double-quoted content because $VAR expands inside "...". + const withoutSQ = afterDashC.replace(/'[^']*'/g, "'__SQ__'"); + if (/\$[A-Za-z_{(0-9#@*?!]/.test(withoutSQ) || /`/.test(withoutSQ)) { + return deny( + 'shell-expansion-in-literal', + `${binaryBase} -c with variable/subshell expansion in literal is not allowed`, + ); + } + return allow; + } + + if (args.length === 0) { + return deny('interactive-repl', `${binaryBase} without args launches an interactive shell`); + } + return allow; +} + +function checkPathAccess( + binaryBase: string, + args: string[], + workspaceRoot?: string, +): PolicyOutcome | null { + if (binaryBase === 'cd') { + const target = args[0]; + if (target && !isAllowedCdTarget(target, workspaceRoot)) { + return deny('cd-outside-workspace', `cd to ${target} is outside the allowed workspace`); + } + return allow; + } + + if (['cat', 'head', 'less', 'more', 'cp', 'mv'].includes(binaryBase)) { + const absoluteArgs = args.filter((a) => !a.startsWith('-') && a.startsWith('/')); + for (const p of absoluteArgs) { + if (!isAllowedCdTarget(p, workspaceRoot)) { + return deny( + 'path-outside-workspace', + `${binaryBase} with path outside workspace is not allowed: ${p}`, + ); + } + } + } + + return null; +} + +function checkSingleCommand(cmd: ParsedCommand, config: PolicyConfig): PolicyOutcome { + const { raw, binary, args, hasBackground, hasControlChars } = cmd; + + if (!binary) return allow; + if (hasControlChars) return deny('control-chars', 'Command contains NUL or embedded newline'); + if (hasBackground) + return deny('background', 'Trailing & is not allowed — use synchronous commands only'); + + const analysis = toAnalysisCopy(raw); + const binaryBase = binary.replace(/^.*\//, ''); + + if (binaryBase === 'nohup' || binaryBase === 'disown') { + return deny('background', `${binaryBase} is not allowed — no background processes`); + } + + let result: PolicyOutcome | null; + + if ((result = checkEnvHijacking(analysis))) return result; + if ((result = checkHardDenyBinary(binaryBase, args))) return result; + if (config.extraDeniedBinaries?.includes(binaryBase)) { + return deny('extra-denied-binary', `${binaryBase} is not allowed per config`); + } + if ((result = checkPackageManager(binaryBase, args))) return result; + if (binaryBase === 'git') return checkGitCommand(args); + if ((result = checkInteractiveRepl(binaryBase, args, raw))) return result; + if ((result = checkShellInvocation(binaryBase, args, raw))) return result; + if ((result = checkPathAccess(binaryBase, args, config.workspaceRoot))) return result; + + if (config.extraDenyPatterns) { + for (const pattern of config.extraDenyPatterns) { + if (new RegExp(pattern).test(analysis)) { + return deny('extra-deny-pattern', `Command matched extra deny pattern: ${pattern}`); + } + } + } + + return allow; +} + +function checkStructural(cmd: string): PolicyOutcome | null { + const analysis = toAnalysisCopy(cmd); + + if (/(?:^|\s)>\s*\//.test(analysis)) { + return deny('redirection', 'Output redirection to absolute path is not allowed'); + } + if (/(?:^|\s)>>\s*\//.test(analysis)) { + return deny('redirection', 'Output append-redirection to absolute path is not allowed'); + } + if (/(?:^|\s)<\(/.test(analysis)) { + // Process substitution — allowed; do not block + } + + if (/\x00/.test(cmd)) { + return deny('control-chars', 'Command contains NUL byte'); + } + + return null; +} + +export function evaluatePolicy(command: string, config: PolicyConfig = {}): PolicyOutcome { + if (!command || command.trim() === '') { + return deny('empty-command', 'Empty command is not allowed'); + } + + const structural = checkStructural(command); + if (structural) return structural; + + const tokenised = tokenise(command); + if (tokenised.parseError) { + return deny('parse-error', tokenised.parseErrorMessage ?? 'Failed to parse command'); + } + + for (const cmd of tokenised.commands) { + const result = checkSingleCommand(cmd, config); + if (result.deny) return result; + } + + return allow; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/secret-redact.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/secret-redact.ts new file mode 100644 index 00000000..0999299a --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/secret-redact.ts @@ -0,0 +1,58 @@ +import { SENSITIVE_VALUE_PATTERN_SOURCE, REDACTED } from '../../../../../shared/utils/security.js'; + +// Extended patterns beyond the base SENSITIVE_VALUE_PATTERN_SOURCE. +// Sources: gitleaks rule set, common CI token shapes. +const EXTENDED_PATTERNS: readonly string[] = [ + // Anthropic API keys + 'sk-ant-[A-Za-z0-9_-]{10,}', + + // Slack tokens + 'xox[baprs]-[A-Za-z0-9-]{10,}', + + // GCP service account JSON (partial — the key field) + '"private_key"\\s*:\\s*"-----BEGIN [A-Z ]+-----[^"]{20,}"', + + // npm tokens + 'npm_[A-Za-z0-9]{36}', + + // Docker Hub tokens + 'dckr_pat_[A-Za-z0-9_-]{27,}', + + // HashiCorp Vault tokens + 's\\.hvs\\.[A-Za-z0-9]{24,}', + 'hvs\\.[A-Za-z0-9]{24,}', + + // Stripe keys + 'sk_live_[A-Za-z0-9]{24,}', + 'rk_live_[A-Za-z0-9]{24,}', + 'pk_live_[A-Za-z0-9]{24,}', + + // SendGrid + 'SG\\.[A-Za-z0-9_-]{22}\\.[A-Za-z0-9_-]{43}', + + // Twilio + 'SK[A-Za-z0-9]{32}', + 'AC[A-Za-z0-9]{32}', + + // Generic high-entropy base64 segments that look like tokens + // (32+ base64 chars after an = sign, commonly seen in .env files) + '(?<==)[A-Za-z0-9+/]{32,}={0,2}(?=\\s|$|")', +]; + +const EXTENDED_PATTERN_SOURCE = EXTENDED_PATTERNS.join('|'); +const COMBINED_PATTERN_SOURCE = `${SENSITIVE_VALUE_PATTERN_SOURCE}|${EXTENDED_PATTERN_SOURCE}`; + +export function redactOutput(text: string, knownTokens: string[] = []): string { + if (!text) return text; + + let redacted = text.replace(new RegExp(COMBINED_PATTERN_SOURCE, 'g'), REDACTED); + + for (const token of knownTokens) { + if (token && token.length >= 8) { + const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + redacted = redacted.replace(new RegExp(escaped, 'g'), REDACTED); + } + } + + return redacted; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session-impl.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session-impl.ts new file mode 100644 index 00000000..3d541d57 --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session-impl.ts @@ -0,0 +1,233 @@ +/** + * Persistent bash shell session. + * + * Spawns a single `bash --norc --noprofile` process that lives for the + * duration of a review session. Each tool call writes a command to stdin + * and reads until the sentinel PS1 marker appears on stdout. + * + * The sentinel PS1 encodes the exit code and cwd as JSON so we can parse + * them without an extra round-trip. + */ + +import * as child_process from 'child_process'; + +import { makeCleanEnv } from './env-scrub.js'; +import { getGitConfigPath } from './git-config-template.js'; +import type { SandboxDriver, SpawnOptions } from './modes/interface.js'; +import { logger } from '../../../../../shared/utils/logger.js'; + +const SENTINEL_BEGIN = '###QUALOPS_PS1_BEGIN###'; +const SENTINEL_END = '###QUALOPS_PS1_END###'; + +// Shell init script: HISTFILE=/dev/null, set -o pipefail, no aliases +// Ends with an explicit sentinel echo so spawn() knows the shell is ready. +const SHELL_INIT = [ + 'HISTFILE=/dev/null', + 'HISTSIZE=0', + 'set -o pipefail', + 'shopt -u expand_aliases 2>/dev/null || true', + 'unalias -a 2>/dev/null || true', + // The sentinel is echoed explicitly after each command (non-interactive shell, no PS1). + // SENTINEL_BEGIN and SENTINEL_END are literal strings injected by JS; $(pwd) is shell-evaluated. + `echo "${SENTINEL_BEGIN}{\\"exit\\":\\"0\\",\\"cwd\\":\\"$(pwd)\\"}${SENTINEL_END}"`, + '', +].join('\n'); + +export interface SessionExecResult { + stdout: string; + stderr: string; + exitCode: number; + durationMs: number; + cwdDrifted: boolean; + cwdDriftNotice: string; +} + +interface SentinelData { + exit: string; + cwd: string; +} + +function parseSentinel(sentinel: string): SentinelData | null { + const start = sentinel.indexOf(SENTINEL_BEGIN); + const end = sentinel.indexOf(SENTINEL_END, start); + if (start === -1 || end === -1) return null; + + const json = sentinel.slice(start + SENTINEL_BEGIN.length, end); + try { + return JSON.parse(json) as SentinelData; + } catch { + return null; + } +} + +export class BashShellSession { + private proc: child_process.ChildProcessWithoutNullStreams | null = null; + private cwd: string; + private driver: SandboxDriver; + private stdoutBuf = ''; + private stderrBuf = ''; + private waiters: Array<() => void> = []; + private closed = false; + + private constructor(cwd: string, driver: SandboxDriver) { + this.cwd = cwd; + this.driver = driver; + } + + static async create(cwd: string, driver: SandboxDriver): Promise { + const session = new BashShellSession(cwd, driver); + await session.spawn(); + return session; + } + + private async spawn(): Promise { + const gitConfigPath = getGitConfigPath(); + const env = makeCleanEnv(gitConfigPath, { + TERM: 'dumb', + QUALOPS_WORKSPACE: this.cwd, + }); + + const baseOpts: SpawnOptions = { + cwd: this.cwd, + env, + argv: ['bash', '--norc', '--noprofile'], + }; + + const opts = await this.driver.prepare(baseOpts); + + this.proc = child_process.spawn(opts.argv[0]!, opts.argv.slice(1), { + cwd: opts.cwd, + env: opts.env, + stdio: ['pipe', 'pipe', 'pipe'], + }); + + this.proc.stdout.on('data', (chunk: Buffer) => { + this.stdoutBuf += chunk.toString('utf8'); + this.drainWaiters(); + }); + + this.proc.stderr.on('data', (chunk: Buffer) => { + this.stderrBuf += chunk.toString('utf8'); + }); + + this.proc.on('close', () => { + this.closed = true; + this.drainWaiters(); + }); + + this.proc.on('error', (err) => { + logger.error('[bash/session] shell process error', { err }); + this.closed = true; + this.drainWaiters(); + }); + + await this.writeAndWaitForSentinel(SHELL_INIT); + this.stdoutBuf = ''; + this.stderrBuf = ''; + } + + private drainWaiters(): void { + const w = this.waiters.splice(0); + for (const fn of w) fn(); + } + + private waitForData(): Promise { + return new Promise((resolve) => { + if (this.closed || this.stdoutBuf.includes(SENTINEL_END)) { + resolve(); + return; + } + this.waiters.push(resolve); + }); + } + + private async writeAndWaitForSentinel( + script: string, + ): Promise<{ stdout: string; stderr: string }> { + if (!this.proc || this.closed) { + throw new Error('[bash/session] Shell process is not running'); + } + + this.stdoutBuf = ''; + this.stderrBuf = ''; + + this.proc.stdin.write(script + '\n'); + + while (!this.stdoutBuf.includes(SENTINEL_END) && !this.closed) { + await this.waitForData(); + } + + const stdout = this.stdoutBuf; + const stderr = this.stderrBuf; + this.stdoutBuf = ''; + this.stderrBuf = ''; + + return { stdout, stderr }; + } + + async exec(command: string, workspaceRoot: string = '/workspace'): Promise { + if (!this.proc || this.closed) { + throw new Error('[bash/session] Shell session has been disposed'); + } + + const startMs = Date.now(); + + // Wrap command: run it, capture exit code, then echo the sentinel explicitly. + // Non-interactive shells don't print PS1, so we echo it ourselves after each command. + const script = `${command}\n__exit__=$?\necho "${SENTINEL_BEGIN}{\\"exit\\":\\"$__exit__\\",\\"cwd\\":\\"$(pwd)\\"}${SENTINEL_END}"\n`; + const { stdout, stderr } = await this.writeAndWaitForSentinel(script); + + const durationMs = Date.now() - startMs; + + const sentinel = parseSentinel(stdout); + const exitCode = sentinel ? parseInt(sentinel.exit, 10) : 1; + const cwd = sentinel?.cwd ?? this.cwd; + + const sentinelStart = stdout.indexOf(SENTINEL_BEGIN); + const cleanStdout = sentinelStart === -1 ? stdout : stdout.slice(0, sentinelStart).trimEnd(); + + let cwdDrifted = false; + let cwdDriftNotice = ''; + + if (cwd && !cwd.startsWith(workspaceRoot) && cwd !== workspaceRoot) { + cwdDrifted = true; + cwdDriftNotice = `[qualops/session] CWD drifted to ${cwd} (outside workspace). Reset to ${this.cwd}.`; + if (this.proc && !this.closed) { + this.proc.stdin.write(`cd ${JSON.stringify(this.cwd)}\n`); + } + } + + return { + stdout: cleanStdout, + stderr, + exitCode: isNaN(exitCode) ? 1 : exitCode, + durationMs, + cwdDrifted, + cwdDriftNotice, + }; + } + + async dispose(): Promise { + await this.driver.teardown(); + if (this.proc && !this.closed) { + this.proc.stdin.end(); + // Give it 1 second to exit cleanly before SIGKILL + await new Promise((resolve) => { + const timer = setTimeout(() => { + if (this.proc && !this.closed) this.proc.kill('SIGKILL'); + resolve(); + }, 1000); + this.proc!.on('close', () => { + clearTimeout(timer); + resolve(); + }); + }); + } + this.proc = null; + this.closed = true; + } + + get isAlive(): boolean { + return !this.closed && this.proc !== null; + } +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session.ts b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session.ts new file mode 100644 index 00000000..02ba97bd --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/repo/src/stages/review/agentic/tools/bash/session.ts @@ -0,0 +1,155 @@ +import * as crypto from 'crypto'; + +import { execBashCommand } from './exec.js'; +import { detectSandboxDriver, type SandboxMode, type DetectOptions } from './modes/detect.js'; +import type { SandboxDriver } from './modes/interface.js'; +import { evaluatePolicy } from './policy.js'; +import type { BashInput, BashOutput } from './schema.js'; +import { BashShellSession } from './session-impl.js'; +import { logger } from '../../../../../shared/utils/logger.js'; + +export interface BashConfig { + /** Sandbox mode override. Default: auto-detect. */ + mode?: SandboxMode; + /** Absolute path to the workspace root. Default: /workspace. */ + workspaceRoot?: string; + /** Absolute path to the PR's .git/hooks dir. Used by CI driver. */ + prHooksDir?: string; + /** Per-review maximum number of bash tool calls. Default: 200. */ + maxCallsPerReview?: number; + /** Known secret tokens to redact from output. */ + knownTokens?: string[]; + /** Extra binary names to deny (added to policy hard-deny list). */ + extraDeniedBinaries?: string[]; +} + +const DEFAULT_MAX_CALLS = 200; + +export class BashSession { + private shell: BashShellSession | null = null; + private driver: SandboxDriver; + private config: Required; + private reviewId: string; + private callCount = 0; + + private constructor(driver: SandboxDriver, config: Required, reviewId: string) { + this.driver = driver; + this.config = config; + this.reviewId = reviewId; + } + + static async start(config: BashConfig = {}): Promise { + const reviewId = process.env['QUALOPS_REVIEW_ID'] ?? crypto.randomUUID(); + const workspaceRoot = config.workspaceRoot ?? '/workspace'; + + const resolvedConfig: Required = { + mode: config.mode ?? 'auto', + workspaceRoot, + prHooksDir: config.prHooksDir ?? '', + maxCallsPerReview: config.maxCallsPerReview ?? DEFAULT_MAX_CALLS, + knownTokens: config.knownTokens ?? [], + extraDeniedBinaries: config.extraDeniedBinaries ?? [], + }; + + const detectOpts: DetectOptions = { + mode: resolvedConfig.mode, + workspaceRoot, + prHooksDir: resolvedConfig.prHooksDir || undefined, + }; + + const driver = detectSandboxDriver(detectOpts); + const session = new BashSession(driver, resolvedConfig, reviewId); + + session.shell = await BashShellSession.create(workspaceRoot, driver); + + logger.info('[bash/session] BashSession started', { reviewId, sandboxMode: driver.name }); + return session; + } + + async exec(input: BashInput): Promise { + if (!this.shell || !this.shell.isAlive) { + throw new Error('[bash/session] BashSession is not running'); + } + + if (this.callCount >= this.config.maxCallsPerReview) { + return errorOutput( + `Budget exhausted: maximum ${this.config.maxCallsPerReview} bash calls per review`, + 'killed_by_limit', + ); + } + + const policyResult = evaluatePolicy(input.command, { + workspaceRoot: this.config.workspaceRoot, + extraDeniedBinaries: this.config.extraDeniedBinaries, + }); + + if (policyResult.deny) { + logger.warn( + `[bash/policy] denied (rule=${policyResult.rule}): ${input.command.substring(0, 200)} — ${policyResult.reason}`, + ); + + return { + stdout: '', + stderr: `[qualops/policy] Command denied: ${policyResult.reason} (rule: ${policyResult.rule})`, + exit_code: 126, + exit_reason: 'cancelled', + duration_ms: 0, + stdout_truncated: false, + stderr_truncated: false, + semantic_hint: undefined, + sandbox_violations: [`policy:${policyResult.rule}`], + }; + } + + this.callCount++; + const callId = `${this.reviewId}-${this.callCount}`; + + return execBashCommand(input, this.shell, this.driver, { + reviewId: this.reviewId, + callId, + workspaceRoot: this.config.workspaceRoot, + knownTokens: this.config.knownTokens, + }); + } + + async dispose(): Promise { + if (this.shell) { + await this.shell.dispose(); + this.shell = null; + } + logger.info('[bash/session] BashSession disposed', { reviewId: this.reviewId }); + } + + get isAlive(): boolean { + return this.shell?.isAlive ?? false; + } +} + +function errorOutput(message: string, reason: BashOutput['exit_reason']): BashOutput { + return { + stdout: '', + stderr: message, + exit_code: 1, + exit_reason: reason, + duration_ms: 0, + stdout_truncated: false, + stderr_truncated: false, + semantic_hint: undefined, + sandbox_violations: [], + }; +} + +export async function startBashSession( + config: BashConfig, + logPrefix: string, +): Promise<{ session: BashSession; dispose: () => Promise }> { + const session = await BashSession.start(config); + return { + session, + dispose: async () => { + await session + .dispose() + .catch((err) => logger.warn(`[${logPrefix}] Error disposing BashSession`, { err })); + }, + }; +} diff --git a/evals/datasets/inbox/qualops-pr-144-bash-tool/slice.json b/evals/datasets/inbox/qualops-pr-144-bash-tool/slice.json new file mode 100644 index 00000000..6588b6de --- /dev/null +++ b/evals/datasets/inbox/qualops-pr-144-bash-tool/slice.json @@ -0,0 +1,144 @@ +{ + "id": "qualops-pr-144-bash-tool", + "prUrl": "https://github.com/eggai-tech/qualops/pull/144", + "prTitle": "feat(agentic): hardened bash tool with policy enforcement for agentic review", + "capturedAt": "2026-05-11", + "capturedBy": "valdis", + "language": "TypeScript", + "baseSha": "415d0e5352e032ba743b4e3f51abe393543cb27f", + "reviewPromptSha": "415d0e5352e032ba743b4e3f51abe393543cb27f", + "reviewPromptDir": "prompt/", + "capturedWithProvider": "anthropic", + "capturedWithModel": "claude-sonnet-4-5-20250929", + "diff": "", + "expected": [ + { + "file": "src/stages/review/agentic/tools/bash/exec.ts", + "line": 30, + "lineEnd": 30, + "type": "security", + "severity": "high", + "description": "Command string is logged before secret redaction. If the agent calls a command containing an operator-supplied token (e.g. a git remote URL with embedded credentials), the raw value appears in the log. redactOutput is applied to stdout/stderr later (lines 87-88) but the input command itself is never redacted before logging." + }, + { + "file": "src/stages/review/agentic/tools/bash/session.ts", + "line": 86, + "lineEnd": 89, + "type": "security", + "severity": "high", + "description": "Policy denial log includes raw input.command (up to 200 chars) without redaction. A denied command containing a secret token \u2014 e.g. an agent attempt to exfiltrate a credential \u2014 would be written to the log in plaintext, persisting the value that the policy was trying to block." + }, + { + "file": "src/stages/review/agentic/tools/bash/modes/besteffort.macos.ts", + "line": 16, + "lineEnd": 22, + "type": "security", + "severity": "high", + "description": "workspaceRoot is interpolated directly into a Seatbelt profile string without charset validation. A path containing Seatbelt metacharacters (e.g. parentheses, quotes, or newlines) can break the profile structure or inject additional rules, bypassing the intended file-read restrictions." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 518, + "lineEnd": 543, + "type": "security", + "severity": "medium", + "description": "checkPathAccess only checks absoluteArgs (paths starting with '/'). Relative paths like '../../etc/passwd' passed to cat/head/cp/mv are silently allowed. In CI mode there is no filesystem confinement (CISandboxDriver applies only rlimits, a network proxy, and hook monitoring \u2014 no bwrap/Seatbelt), so the policy layer is the sole guard against path escape." + }, + { + "file": "src/stages/review/agentic/tools/bash/parser.ts", + "line": 64, + "lineEnd": 74, + "type": "security", + "severity": "medium", + "description": "toAnalysisCopy strips single-quoted ('...') and double-quoted (\"...\") substrings but not ANSI-C quoted strings ($'...'). The $ prefix prevents the single-quote regex from matching, so $'/etc/passwd' survives into the analysis copy with its path intact. Commands like `cat $'/etc/passwd'` pass the path-outside-workspace check. In CI mode the sandbox provides no filesystem boundary, making this a direct policy bypass." + }, + { + "file": "src/stages/review/agentic/tools/bash/session.ts", + "line": 42, + "lineEnd": 42, + "type": "security", + "severity": "low", + "description": "QUALOPS_REVIEW_ID is read from the environment and used directly in call IDs and log messages with no charset validation. A misconfigured operator env var containing '/', '..', or shell metacharacters could produce unexpected path components or log injection. The UUID fallback is only used when the env var is absent, not when it contains invalid characters." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 392, + "lineEnd": 393, + "type": "security", + "severity": "high", + "description": "checkGitCommand uses args.indexOf('-c') to detect the -c flag. The shell tokenizer splits on whitespace, so `git -ccore.hooksPath=/evil` arrives as a single token '-ccore.hooksPath=/evil' and indexOf('-c') returns -1. The concatenated form bypasses the DENIED_GIT_CONFIG_PATTERNS check entirely, allowing arbitrary git config injection including setting core.hooksPath to a malicious directory." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 497, + "lineEnd": 498, + "type": "security", + "severity": "high", + "description": "checkShellInvocation uses the same args.indexOf('-c') pattern. The concatenated form `bash -c'malicious_cmd'` tokenizes as a single arg '-c'malicious_cmd'' which indexOf misses, allowing the agent to invoke arbitrary shell commands while the policy believes no -c flag was used." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 250, + "lineEnd": 250, + "type": "security", + "severity": "high", + "description": "'config' is in ALLOWED_GIT_SUBCOMMANDS with a comment claiming it is 'checked further below for dangerous patterns', but no read-only enforcement exists. `git config core.hooksPath /evil` is allowed through to the shell; only the `-c` flag path is checked. This enables hook injection via git config write subcommands." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 340, + "lineEnd": 354, + "type": "security", + "severity": "high", + "description": "isAllowedCdTarget checks containment using target.startsWith(root + '/'), which does not resolve symlinks. A symlink inside the workspace pointing to /etc passes this check because the path string starts with the workspace root, but the resolved filesystem path is outside it. In CI mode the sandbox provides no filesystem boundary, making this a direct escape." + }, + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 344, + "lineEnd": 345, + "type": "security", + "severity": "medium", + "description": "isAllowedCdTarget allows all relative cd targets unconditionally. In CI mode (CISandboxDriver) there is no filesystem-level confinement \u2014 no bwrap, no Seatbelt. A relative cd like 'cd ../../etc' is not caught by the binary deny list and is not blocked by the sandbox, giving the agent unrestricted cwd movement inside the CI container, which has access to runner credentials and the full container filesystem." + } + ], + "outOfScope": [ + { + "file": "src/stages/review/agentic/tools/bash/session-impl.ts", + "line": 32, + "lineEnd": 177, + "type": "bug", + "severity": "high", + "description": "$(pwd) is embedded raw inside a JSON string in the sentinel echo (SHELL_INIT line 32 and exec script line 177). A workspace path containing '\"' or '\\\\' produces malformed JSON that parseSentinel cannot parse, causing all commands to silently report exit code 1 and fall back to the initial cwd.", + "reason": "Correctness/reliability bug, not a security vulnerability. The cwd value comes from the shell's own pwd, not from attacker-controlled input. The prompt is scoped to security review." + } + ], + "falsePositives": [ + { + "file": "src/stages/review/agentic/tools/bash/policy.ts", + "line": 0, + "lineEnd": 0, + "type": "security", + "severity": "low", + "description": "pushd and popd are shell builtins and cannot be blocked via the binary deny list", + "reason": "The policy engine checks the binary name from the tokenizer. Shell builtins like pushd/popd have no separate binary; they are executed by the shell directly. Blocking them is structurally impossible via a binary deny list \u2014 it would require a different mechanism (e.g. a restricted shell). The 'sandbox handles it' reasoning does NOT apply here: in CI mode there is no filesystem sandbox. This remains a FP only because the finding mechanism (binary deny list) is wrong, not because the sandbox compensates." + }, + { + "file": "src/stages/review/agentic/tools/bash/env-scrub.ts", + "line": 115, + "lineEnd": 115, + "type": "security", + "severity": "low", + "description": "EXPLICIT_DENY Set lookup may be case-sensitive, missing mixed-case env var names", + "reason": ".toUpperCase() is applied to every env var key before the Set lookup, so the check is already case-insensitive. Confirmed by reading env-scrub.ts." + }, + { + "file": "src/stages/review/agentic/tools/bash/session-impl.ts", + "line": 177, + "lineEnd": 177, + "type": "security", + "severity": "critical", + "description": "Command injection via direct string interpolation \u2014 the agent-supplied command is interpolated directly into the script string", + "reason": "The BashShellSession is an executor: interpolating the command into the script is its designed purpose. The policy layer (evaluatePolicy) already checked and allowed the command before exec() is called. There is no second attacker between the policy check and the executor. Flagging the executor for running what it was told to run conflates the executor with a validator." + } + ] +}