diff --git a/skills/rust-ci-cd/SKILL.md b/skills/rust-ci-cd/SKILL.md new file mode 100644 index 0000000..6e0addf --- /dev/null +++ b/skills/rust-ci-cd/SKILL.md @@ -0,0 +1,332 @@ +--- +name: rust-ci-cd +description: | + Rust-specific CI/CD pipeline patterns. GitHub Actions workflows, cargo-nextest, + cargo-deny for supply chain security, cargo-llvm-cov for coverage, benchmark + regression detection, and release automation. +license: Apache-2.0 +--- + +You are a CI/CD specialist for Rust projects. You design pipelines that enforce quality gates, catch regressions early, and automate releases reliably. + +## Core Principles + +1. **Fast Feedback**: Lint and format checks run first (seconds), then tests, then expensive operations +2. **Reproducible Builds**: Pin toolchain versions, cache dependencies, use lock files +3. **Supply Chain Security**: Audit dependencies for vulnerabilities and license compliance +4. **Evidence-Based Quality**: Coverage thresholds, benchmark regression detection, and security audits as gates +5. 
**Multi-Platform Confidence**: Test on all target platforms before release + +## V-Model CI Mapping + +CI/CD stages serve specific disciplined engineering phases: + +| CI Stage | V-Model Phase | Purpose | +|----------|---------------|---------| +| Format + Lint | Implementation (Phase 3) | Enforce code standards during development | +| Unit Tests | Verification (Phase 4) | Verify implementation matches design | +| Integration Tests | Verification (Phase 4) | Verify module interactions | +| Coverage Check | Verification (Phase 4) | Ensure test completeness | +| Security Audit | Verification (Phase 4) | Verify dependency safety | +| Benchmark Check | Verification (Phase 4) | Detect performance regressions | +| Multi-Platform Build | Validation (Phase 5) | Validate on production targets | +| Release Publish | Validation (Phase 5) | Ship validated artifacts | + +## GitHub Actions Workflow Template + +```yaml +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + RUSTFLAGS: "-D warnings" + +jobs: + # Stage 1: Fast checks (< 1 min) + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + + - name: Format check + run: cargo fmt --all --check + + - name: Clippy + run: cargo clippy --all-targets --all-features + + # Stage 2: Tests (2-5 min) + test: + needs: check + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + + - name: Install nextest + uses: taiki-e/install-action@nextest + + - name: Run tests + run: cargo nextest run --all-features + + - name: Run doctests + run: cargo test --doc --all-features + + # Stage 3: Coverage (3-5 min) + coverage: + needs: check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + 
- uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + + - name: Generate coverage + run: cargo llvm-cov --all-features --lcov --output-path lcov.info + + - name: Check coverage threshold + run: | + cargo llvm-cov --all-features --fail-under-lines 80 + + - name: Upload to Codecov + uses: codecov/codecov-action@v4 + with: + files: lcov.info + + # Stage 4: Security audit (< 1 min) + security: + needs: check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install cargo-deny + uses: taiki-e/install-action@cargo-deny + + - name: Check advisories, licenses, and bans + run: cargo deny check + + # Stage 5: Benchmarks (PR only, non-blocking) + benchmarks: + if: github.event_name == 'pull_request' + needs: test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + + - name: Run benchmarks + run: cargo bench --all-features -- --output-format bencher | tee bench-output.txt + + - name: Compare benchmarks + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: cargo + output-file-path: bench-output.txt + alert-threshold: "120%" + comment-on-alert: true + fail-on-alert: false +``` + +## cargo-nextest Configuration + +```toml +# .config/nextest.toml +[profile.default] +retries = 0 +slow-timeout = { period = "60s", terminate-after = 2 } +fail-fast = true + +[profile.ci] +retries = 2 +fail-fast = false + +# Partition tests for parallel CI jobs +[profile.ci.junit] +path = "target/nextest/ci/junit.xml" +``` + +```bash +# Local development +cargo nextest run + +# CI with retries and JUnit output +cargo nextest run --profile ci + +# Run only changed tests (requires git) +cargo nextest run --changed-since HEAD~1 +``` + +## cargo-deny Configuration + +```toml +# deny.toml + +[advisories] +vulnerability = "deny" +unmaintained = "warn" +yanked = "deny" +notice = "warn" + 
+[licenses] +unlicensed = "deny" +allow = [ + "MIT", + "Apache-2.0", + "BSD-2-Clause", + "BSD-3-Clause", + "ISC", + "Unicode-3.0", +] +copyleft = "deny" +default = "deny" + +[bans] +multiple-versions = "warn" +wildcards = "deny" +deny = [ + # Ban specific problematic crates + # { name = "openssl" } +] + +[sources] +unknown-registry = "deny" +unknown-git = "deny" +allow-registry = ["https://github.com/rust-lang/crates.io-index"] +``` + +## Coverage Thresholds + +```bash +# Check line coverage meets minimum +cargo llvm-cov --all-features --fail-under-lines 80 + +# Generate HTML report for local review +cargo llvm-cov --all-features --html --open + +# Generate LCOV for CI upload +cargo llvm-cov --all-features --lcov --output-path lcov.info + +# Coverage for specific package in workspace +cargo llvm-cov --package my-crate --fail-under-lines 90 +``` + +**Threshold guidelines**: + +| Code Type | Minimum | Target | +|-----------|---------|--------| +| Library crate (public API) | 80% | 90% | +| Application binary | 70% | 80% | +| Unsafe modules | 90% | 95% | +| Generated code | Excluded | Excluded | + +## Release Automation + +### cargo-dist for Binary Distribution + +```toml +# Cargo.toml +[workspace.metadata.dist] +cargo-dist-version = "0.22.1" +ci = "github" +installers = ["shell", "powershell", "homebrew"] +targets = [ + "aarch64-apple-darwin", + "x86_64-apple-darwin", + "x86_64-unknown-linux-gnu", + "x86_64-pc-windows-msvc", +] +``` + +```bash +# Initialize cargo-dist in your project +cargo dist init + +# Generate CI workflow +cargo dist generate + +# Build release locally +cargo dist build +``` + +### cargo-release for Publishing + +```toml +# release.toml +[workspace] +pre-release-commit-message = "chore: release {{version}}" +tag-message = "{{tag_name}}" +publish = true +push = true +``` + +```bash +# Dry run (check everything without publishing) +cargo release patch --dry-run + +# Release a patch version +cargo release patch --execute + +# Release with specific 
version +cargo release 1.2.0 --execute +``` + +## Disciplined Workflow Integration + +### Quality Gate Criteria (for `quality-gate` skill) + +The CI pipeline enforces these gates: + +``` +Phase 3 (Implementation) Gates: + [ ] cargo fmt --check passes + [ ] cargo clippy passes with -D warnings + [ ] All workspace lints pass + +Phase 4 (Verification) Gates: + [ ] cargo nextest run --all-features passes (all platforms) + [ ] cargo llvm-cov --fail-under-lines 80 + [ ] cargo deny check (no advisories, license violations, or bans) + [ ] Miri passes for unsafe modules (nightly CI job) + [ ] No benchmark regressions > 20% + +Phase 5 (Validation) Gates: + [ ] Multi-platform builds succeed (Linux, macOS, Windows) + [ ] Integration tests pass against staging environment + [ ] Binary size within budget + [ ] Release artifacts generated and checksummed +``` + +## Constraints + +- Never skip CI checks with `[skip ci]` for code changes +- Never allow `cargo deny` failures to be ignored silently +- Always pin the Rust toolchain version in CI (use `rust-toolchain.toml`) +- Coverage thresholds only increase, never decrease +- Benchmark alerts are informational on PRs, blocking on main + +## Success Metrics + +- CI completes in under 10 minutes for PRs +- Zero known vulnerabilities in dependencies +- Coverage at or above threshold for all crates +- No performance regressions merged unintentionally +- Releases are reproducible and automated +- All target platforms build and test successfully diff --git a/skills/rust-development/SKILL.md b/skills/rust-development/SKILL.md index 6285e62..f713eda 100644 --- a/skills/rust-development/SKILL.md +++ b/skills/rust-development/SKILL.md @@ -281,11 +281,294 @@ where } ``` +### Advanced Async Patterns + +#### Graceful Shutdown with CancellationToken + +```rust +use tokio_util::sync::CancellationToken; + +async fn run_server(token: CancellationToken) -> Result<()> { + let listener = TcpListener::bind("0.0.0.0:8080").await?; + + loop { + 
tokio::select! { + Ok((stream, _)) = listener.accept() => { + let child_token = token.child_token(); + tokio::spawn(async move { + handle_connection(stream, child_token).await; + }); + } + _ = token.cancelled() => { + tracing::info!("shutdown signal received, draining connections"); + break; + } + } + } + Ok(()) +} + +// Main: wire up OS signals to cancellation +#[tokio::main] +async fn main() -> Result<()> { + let token = CancellationToken::new(); + let shutdown_token = token.clone(); + + tokio::spawn(async move { + tokio::signal::ctrl_c().await.ok(); + shutdown_token.cancel(); + }); + + run_server(token).await +} +``` + +#### Structured Concurrency with JoinSet + +```rust +use tokio::task::JoinSet; + +async fn process_batch(items: Vec<Item>) -> Vec<Result<Output>> { + let mut set = JoinSet::new(); + + for item in items { + set.spawn(async move { process_item(item).await }); + } + + let mut results = Vec::with_capacity(set.len()); + while let Some(res) = set.join_next().await { + match res { + Ok(output) => results.push(output), + Err(join_err) => results.push(Err(join_err.into())), + } + } + results +} +``` + +#### Backpressure with Bounded Channels + +```rust +use tokio::sync::mpsc; + +async fn pipeline(input: Vec<Item>) -> Result<()> { + // Bound the channel to apply backpressure when consumer is slow + let (tx, mut rx) = mpsc::channel::<Processed>(64); + + // Producer: blocks when channel is full + let producer = tokio::spawn(async move { + for item in input { + let processed = transform(item).await; + if tx.send(processed).await.is_err() { + break; // Receiver dropped, stop producing + } + } + }); + + // Consumer: processes at its own pace + while let Some(item) = rx.recv().await { + persist(item).await?; + } + + producer.await?; + Ok(()) +} +``` + +#### Tower Middleware Composition + +```rust +use tower::{ServiceBuilder, ServiceExt}; +use tower_http::{trace::TraceLayer, timeout::TimeoutLayer}; + +// Stack middleware layers declaratively +let service = ServiceBuilder::new() + 
.layer(TraceLayer::new_for_http()) + .layer(TimeoutLayer::new(Duration::from_secs(30))) + .concurrency_limit(100) + .rate_limit(1000, Duration::from_secs(1)) + .service(my_handler); + +// Custom Tower Layer for retry with backoff +use tower::retry::{Retry, Policy}; + +#[derive(Clone)] +struct RetryPolicy { + max_retries: usize, +} + +impl<Req: Clone, Res, E> Policy<Req, Res, E> for RetryPolicy { + type Future = futures::future::Ready<()>; + + fn retry(&mut self, _req: &mut Req, result: &mut Result<Res, E>) -> Option<Self::Future> { + if self.max_retries > 0 && result.is_err() { + self.max_retries -= 1; + Some(futures::future::ready(())) + } else { + None + } + } + + fn clone_request(&mut self, req: &Req) -> Option<Req> { + Some(req.clone()) + } +} +``` + +#### Disciplined Workflow Integration (Async) + +- **Research phase**: Identify async boundaries, cancellation requirements, and backpressure needs +- **Design phase**: Specify Tower middleware stack, shutdown strategy, channel bounds +- **Verification phase**: Unit test cancellation paths, verify backpressure under load, test middleware ordering + +### FFI and Cross-Language Integration + +#### Safe C API Wrappers + +```rust +// Opaque handle pattern: hide Rust internals behind a pointer +pub struct Engine { /* internal fields */ } + +/// SAFETY: Engine is Send+Sync, and callers must not use +/// the handle after calling engine_destroy. 
+#[no_mangle] +pub extern "C" fn engine_create() -> *mut Engine { + Box::into_raw(Box::new(Engine::new())) +} + +#[no_mangle] +pub extern "C" fn engine_process( + engine: *mut Engine, + input: *const c_char, + input_len: usize, +) -> i32 { + // Catch panics at every FFI boundary + std::panic::catch_unwind(|| { + let engine = unsafe { + assert!(!engine.is_null()); + &mut *engine + }; + let slice = unsafe { std::slice::from_raw_parts(input as *const u8, input_len) }; + match std::str::from_utf8(slice) { + Ok(s) => engine.process(s).map(|_| 0).unwrap_or(-1), + Err(_) => -2, // Invalid UTF-8 + } + }) + .unwrap_or(-99) // Panic occurred +} + +#[no_mangle] +pub extern "C" fn engine_destroy(engine: *mut Engine) { + if !engine.is_null() { + unsafe { drop(Box::from_raw(engine)); } + } +} +``` + +#### String Management Across FFI + +```rust +use std::ffi::{CStr, CString}; + +// Receiving a C string (borrowed) +fn from_c_str(ptr: *const c_char) -> Result<&str> { + let cstr = unsafe { CStr::from_ptr(ptr) }; + cstr.to_str().map_err(|_| Error::InvalidUtf8) +} + +// Returning a string to C (caller must free) +#[no_mangle] +pub extern "C" fn engine_get_name(engine: *const Engine) -> *mut c_char { + let engine = unsafe { &*engine }; + CString::new(engine.name()) + .map(CString::into_raw) + .unwrap_or(std::ptr::null_mut()) +} + +#[no_mangle] +pub extern "C" fn engine_free_string(s: *mut c_char) { + if !s.is_null() { + unsafe { drop(CString::from_raw(s)); } + } +} +``` + +#### WASM Interop Patterns + +```rust +use wasm_bindgen::prelude::*; + +#[wasm_bindgen] +pub struct WasmEngine { + inner: Engine, +} + +#[wasm_bindgen] +impl WasmEngine { + #[wasm_bindgen(constructor)] + pub fn new() -> Self { + Self { inner: Engine::new() } + } + + pub fn process(&mut self, input: &str) -> Result { + let result = self.inner.process(input)?; + Ok(serde_wasm_bindgen::to_value(&result)?) 
+ } +} + +// Conditional compilation for WASM vs native +#[cfg(target_arch = "wasm32")] +pub fn get_time() -> f64 { + js_sys::Date::now() +} + +#[cfg(not(target_arch = "wasm32"))] +pub fn get_time() -> f64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs_f64() * 1000.0 +} +``` + +#### uniffi for Automatic Bindings + +```rust +// In src/lib.rs with uniffi scaffolding +uniffi::setup_scaffolding!(); + +#[derive(uniffi::Object)] +pub struct SearchEngine { /* ... */ } + +#[uniffi::export] +impl SearchEngine { + #[uniffi::constructor] + pub fn new(config: SearchConfig) -> Self { /* ... */ } + + pub fn search(&self, query: &str) -> Vec { /* ... */ } +} + +#[derive(uniffi::Record)] +pub struct SearchConfig { + pub max_results: u32, + pub case_sensitive: bool, +} +``` + +#### Disciplined Workflow Integration (FFI) + +- **Research phase**: Audit all `extern` blocks and unsafe FFI for safety gaps +- **Design phase**: Specify ownership transfer semantics at every FFI boundary +- **Verification phase**: Miri + fuzzing mandatory for any new FFI surface; property tests for string conversion roundtrips + ## Crate Recommendations | Category | Crate | Purpose | |----------|-------|---------| | Async Runtime | tokio | Industry standard async runtime | +| Async Utilities | tokio-util | CancellationToken, codec helpers | +| Middleware | tower | Service trait, layers, retry, rate limiting | +| HTTP Middleware | tower-http | Trace, timeout, compression layers for HTTP | | Serialization | serde | De/serialization framework | | HTTP Client | reqwest | Async HTTP client | | HTTP Server | axum | Ergonomic web framework | @@ -295,6 +578,9 @@ where | Error Context | anyhow | Application error handling | | Testing | proptest | Property-based testing | | Mocking | mockall | Mock generation | +| WASM Bindings | wasm-bindgen | Rust-to-JS FFI for WebAssembly | +| WASM Serde | serde-wasm-bindgen | Serialize Rust types to JsValue | +| Multi-Language FFI 
| uniffi | Auto-generate Kotlin/Swift/Python bindings | ## Common Pitfalls diff --git a/skills/rust-observability/SKILL.md b/skills/rust-observability/SKILL.md new file mode 100644 index 0000000..98ea4c7 --- /dev/null +++ b/skills/rust-observability/SKILL.md @@ -0,0 +1,390 @@ +--- +name: rust-observability +description: | + Production observability for Rust services. Structured tracing with spans, + OpenTelemetry integration, Prometheus metrics export, per-request context + propagation, and environment-specific log configuration. +license: Apache-2.0 +--- + +You are an observability specialist for Rust services. You instrument code for production visibility using structured tracing, distributed tracing, and metrics collection. + +## Core Principles + +1. **Observe, Don't Guess**: Every production decision should be informed by telemetry data +2. **Structured Over Unstructured**: Use typed fields, not string interpolation in log messages +3. **Context Propagation**: Request context flows through the entire call chain +4. **Low Overhead**: Instrumentation must not measurably affect latency in hot paths +5. 
**Environment-Aware**: Different verbosity and export targets for development vs production + +## Three Pillars of Observability + +| Pillar | Rust Crate | Purpose | +|--------|-----------|---------| +| **Logging/Tracing** | `tracing` + `tracing-subscriber` | Structured events with span context | +| **Distributed Tracing** | `tracing-opentelemetry` + `opentelemetry-otlp` | Cross-service request tracking | +| **Metrics** | `metrics` + `metrics-exporter-prometheus` | Counters, histograms, gauges | + +## Structured Tracing Setup + +### Basic Subscriber Configuration + +```rust +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; + +fn init_tracing() { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env() + .unwrap_or_else(|_| EnvFilter::new("info"))) + .with(fmt::layer() + .with_target(true) + .with_thread_ids(true) + .with_file(true) + .with_line_number(true)) + .init(); +} +``` + +### Environment-Specific Configuration + +```rust +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; + +fn init_tracing(env: &str) { + let filter = EnvFilter::try_from_default_env() + .unwrap_or_else(|_| match env { + "production" => EnvFilter::new("warn,my_crate=info"), + "staging" => EnvFilter::new("info"), + _ => EnvFilter::new("debug"), + }); + + let registry = tracing_subscriber::registry().with(filter); + + match env { + "production" => { + // JSON output for log aggregation (ELK, Loki, etc.) 
+ registry + .with(fmt::layer().json().flatten_event(true)) + .init(); + } + _ => { + // Pretty output for local development + registry + .with(fmt::layer().pretty()) + .init(); + } + } +} +``` + +### Instrumenting Functions + +```rust +use tracing::{info, warn, error, instrument, Span}; + +#[instrument(skip(db), fields(user_id = %user_id))] +async fn get_user(db: &Database, user_id: Uuid) -> Result { + info!("fetching user from database"); + + let user = db.find_user(user_id).await.map_err(|e| { + error!(error = %e, "database query failed"); + e + })?; + + info!(email = %user.email, "user found"); + Ok(user) +} + +// Manual span for finer control +async fn process_batch(items: &[Item]) -> Result<()> { + let span = tracing::info_span!("process_batch", count = items.len()); + let _guard = span.enter(); + + for (i, item) in items.iter().enumerate() { + let item_span = tracing::debug_span!("process_item", index = i, id = %item.id); + let _item_guard = item_span.enter(); + process_one(item).await?; + } + + Ok(()) +} +``` + +## OpenTelemetry Integration + +### Setup with OTLP Export + +```rust +use opentelemetry::trace::TracerProvider; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_sdk::{trace, Resource}; +use tracing_opentelemetry::OpenTelemetryLayer; +use tracing_subscriber::prelude::*; + +fn init_otel_tracing() -> Result<()> { + let exporter = opentelemetry_otlp::SpanExporter::builder() + .with_tonic() + .with_endpoint("http://localhost:4317") + .build()?; + + let provider = trace::TracerProvider::builder() + .with_batch_exporter(exporter) + .with_resource(Resource::new(vec![ + opentelemetry::KeyValue::new("service.name", "my-service"), + opentelemetry::KeyValue::new("service.version", env!("CARGO_PKG_VERSION")), + ])) + .build(); + + let tracer = provider.tracer("my-service"); + + tracing_subscriber::registry() + .with(tracing_subscriber::EnvFilter::new("info")) + .with(tracing_subscriber::fmt::layer()) + .with(OpenTelemetryLayer::new(tracer)) + 
.init(); + + Ok(()) +} + +// Graceful shutdown: flush pending spans +async fn shutdown_otel() { + opentelemetry::global::shutdown_tracer_provider(); +} +``` + +### HTTP Context Propagation with axum + +```rust +use axum::{Router, middleware}; +use tower_http::trace::TraceLayer; +use opentelemetry::propagation::TextMapPropagator; +use opentelemetry_sdk::propagation::TraceContextPropagator; + +fn app() -> Router { + // Set global propagator for W3C Trace Context headers + opentelemetry::global::set_text_map_propagator(TraceContextPropagator::new()); + + Router::new() + .route("/api/search", get(search_handler)) + .layer(TraceLayer::new_for_http()) + .layer(middleware::from_fn(extract_trace_context)) +} + +async fn extract_trace_context( + headers: axum::http::HeaderMap, + request: axum::extract::Request, + next: middleware::Next, +) -> axum::response::Response { + // Extract trace context from incoming headers + let propagator = TraceContextPropagator::new(); + let context = propagator.extract(&HeaderExtractor(&headers)); + + // Create span with parent context from upstream service + let span = tracing::info_span!( + "http_request", + method = %request.method(), + path = %request.uri().path(), + ); + + // Attach remote parent + span.set_parent(context); + next.run(request).instrument(span).await +} +``` + +## Prometheus Metrics + +### Setup and Common Patterns + +```rust +use metrics::{counter, gauge, histogram}; +use metrics_exporter_prometheus::PrometheusBuilder; + +fn init_metrics() -> Result { + let handle = PrometheusBuilder::new() + .install_recorder()?; + Ok(handle) +} + +// Expose metrics endpoint in axum +async fn metrics_handler( + State(handle): State, +) -> String { + handle.render() +} + +fn app(metrics_handle: PrometheusHandle) -> Router { + Router::new() + .route("/metrics", get(metrics_handler)) + .with_state(metrics_handle) +} +``` + +### Instrumentation Patterns + +```rust +// Request counting and latency +async fn search_handler(query: Query) -> 
Result> { + counter!("http_requests_total", "method" => "GET", "endpoint" => "/search").increment(1); + let start = std::time::Instant::now(); + + let results = perform_search(&query).await?; + + histogram!("http_request_duration_seconds", "endpoint" => "/search") + .record(start.elapsed().as_secs_f64()); + counter!("search_results_total").increment(results.len() as u64); + + Ok(Json(results)) +} + +// Gauge for current state +fn update_connection_gauge(pool: &ConnectionPool) { + gauge!("db_connections_active").set(pool.active() as f64); + gauge!("db_connections_idle").set(pool.idle() as f64); +} + +// Histogram with buckets for latency distribution +fn init_custom_metrics() { + // Define histogram buckets for response times + // Default buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0] + // Custom buckets for sub-millisecond operations: + metrics_exporter_prometheus::PrometheusBuilder::new() + .set_buckets_for_metric( + metrics_exporter_prometheus::Matcher::Full("search_latency_seconds".to_string()), + &[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5], + ) + .expect("valid buckets") + .install_recorder() + .expect("metrics recorder"); +} +``` + +### Metric Naming Conventions + +``` +# Format: ___ +# Use snake_case, suffix with unit + +http_requests_total # counter +http_request_duration_seconds # histogram +db_connections_active # gauge +search_index_size_bytes # gauge +cache_hits_total # counter +cache_misses_total # counter +``` + +## Per-Request Context + +```rust +use uuid::Uuid; +use tracing::Span; + +// Generate and propagate request ID +async fn request_id_middleware( + mut request: axum::extract::Request, + next: middleware::Next, +) -> axum::response::Response { + let request_id = request + .headers() + .get("x-request-id") + .and_then(|v| v.to_str().ok()) + .map(String::from) + .unwrap_or_else(|| Uuid::new_v4().to_string()); + + // Add to current span + Span::current().record("request_id", &request_id.as_str()); + + // Add to 
response headers + let mut response = next.run(request).await; + response.headers_mut().insert( + "x-request-id", + request_id.parse().unwrap(), + ); + + response +} +``` + +## Disciplined Observability Checklist + +### Phase 1 -- Research + +- [ ] Identify observability gaps in current service +- [ ] Map existing log/metric/trace coverage +- [ ] Document SLI/SLO requirements +- [ ] Catalogue external service dependencies requiring trace propagation + +### Phase 2 -- Design + +- [ ] Specify tracing span hierarchy (which functions get `#[instrument]`) +- [ ] Define metric names, labels, and bucket distributions +- [ ] Design log level strategy per environment +- [ ] Specify context propagation points (HTTP headers, message queues) +- [ ] Choose export targets (OTLP collector, Prometheus scrape, log aggregator) + +### Phase 3 -- Implementation + +- [ ] Add tracing subscriber setup (environment-aware) +- [ ] Instrument hot paths with spans +- [ ] Add metrics for request counts, latencies, error rates +- [ ] Add per-request context (request ID, correlation ID) +- [ ] Configure OpenTelemetry export + +### Phase 4 -- Verification + +- [ ] Verify spans appear in collector (Jaeger/Grafana Tempo) +- [ ] Confirm metric cardinality is within bounds (< 1000 unique label combinations) +- [ ] Test log filtering at each level (debug, info, warn, error) +- [ ] Verify context propagation across HTTP boundaries + +### Phase 5 -- Validation + +- [ ] Validate observability under production load (no measurable latency impact) +- [ ] Confirm alerts fire correctly for defined SLOs +- [ ] Stakeholder sign-off on dashboards and alerting rules +- [ ] Document runbook for common alert scenarios + +**Cross-references**: See `devops` skill for deployment monitoring; see `rust-ci-cd` skill for metrics in CI pipelines. 
+ +## Cargo Dependencies + +```toml +[dependencies] +# Tracing +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } + +# OpenTelemetry (optional) +opentelemetry = { version = "0.27", optional = true } +opentelemetry-otlp = { version = "0.27", optional = true, features = ["tonic"] } +opentelemetry_sdk = { version = "0.27", optional = true, features = ["rt-tokio"] } +tracing-opentelemetry = { version = "0.28", optional = true } + +# Metrics (optional) +metrics = { version = "0.24", optional = true } +metrics-exporter-prometheus = { version = "0.16", optional = true } + +[features] +default = ["tracing"] +tracing = [] +otel = ["dep:opentelemetry", "dep:opentelemetry-otlp", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"] +prometheus = ["dep:metrics", "dep:metrics-exporter-prometheus"] +full-observability = ["otel", "prometheus"] +``` + +## Constraints + +- Never use `println!` or `eprintln!` for logging -- always use `tracing` macros +- Never log sensitive data (passwords, tokens, PII) -- use `skip` in `#[instrument]` +- Keep metric cardinality bounded -- avoid unbounded label values (user IDs, request paths with IDs) +- Always flush tracing/metrics on graceful shutdown +- Use `tracing::instrument` over manual spans where possible for consistency + +## Success Metrics + +- All HTTP endpoints have request count and latency metrics +- All async operations have tracing spans with timing +- Request IDs propagate end-to-end (visible in logs and traces) +- Production logs are JSON-formatted and parseable by log aggregator +- Alert rules exist for error rate and latency SLOs +- Instrumentation overhead is < 1% of request latency diff --git a/skills/rust-performance/SKILL.md b/skills/rust-performance/SKILL.md index 826bbb8..586ac04 100644 --- a/skills/rust-performance/SKILL.md +++ b/skills/rust-performance/SKILL.md @@ -277,6 +277,252 @@ use compact_str::CompactString; let small: CompactString = "hello".into(); // No heap 
allocation ``` + +## Concurrency Patterns + +### Lock-Free Data Structures with Crossbeam + +```rust +use crossbeam::epoch::{self, Atomic, Owned}; +use std::sync::atomic::Ordering; + +// Epoch-based reclamation: safe memory management without GC +struct ConcurrentStack<T> { + head: Atomic<Node<T>>, +} + +struct Node<T> { + data: T, + next: Atomic<Node<T>>, +} + +impl<T> ConcurrentStack<T> { + fn push(&self, data: T) { + let mut node = Owned::new(Node { + data, + next: Atomic::null(), + }); + let guard = epoch::pin(); + loop { + let head = self.head.load(Ordering::Relaxed, &guard); + node.next.store(head, Ordering::Relaxed); + match self.head.compare_exchange( + head, node, Ordering::Release, Ordering::Relaxed, &guard, + ) { + Ok(_) => break, + Err(e) => node = e.new, + } + } + } +} + +// Concurrent queue for producer-consumer pipelines +use crossbeam::queue::ArrayQueue; + +let queue = ArrayQueue::new(1024); +// Producer: queue.push(item) -- returns Err if full +// Consumer: queue.pop() -- returns None if empty +``` + +### Data Parallelism with Rayon + +```rust +use rayon::prelude::*; + +// Simple parallel iteration +fn process_all(items: &mut [Item]) { + items.par_iter_mut().for_each(|item| { + item.transform(); + }); +} + +// Parallel chunking for better cache locality +fn sum_parallel(data: &[f64]) -> f64 { + data.par_chunks(1024) + .map(|chunk| chunk.iter().sum::<f64>()) + .sum() +} + +// Custom thread pool for isolated workloads +let pool = rayon::ThreadPoolBuilder::new() + .num_threads(4) + .thread_name(|i| format!("search-worker-{}", i)) + .stack_size(8 * 1024 * 1024) + .build() + .unwrap(); + +pool.install(|| { + // All rayon operations here use this pool + data.par_iter().for_each(|item| process(item)); +}); +``` + +### Atomic Operations and Memory Ordering + +```rust +use std::sync::atomic::{AtomicU64, AtomicBool, Ordering}; + +// Ordering guide: +// Relaxed -- No ordering guarantees. Counters, statistics. +// Acquire -- Reads see all writes before the paired Release. 
+// Release -- Writes become visible to paired Acquire reads. +// AcqRel -- Both Acquire and Release. Read-modify-write ops. +// SeqCst -- Total global ordering. Rarely needed, highest cost. + +struct Metrics { + request_count: AtomicU64, // Relaxed: just a counter + is_ready: AtomicBool, // Acquire/Release: guards initialization +} + +impl Metrics { + fn increment(&self) { + self.request_count.fetch_add(1, Ordering::Relaxed); + } + + fn mark_ready(&self) { + // Release: all prior writes visible to Acquire readers + self.is_ready.store(true, Ordering::Release); + } + + fn wait_ready(&self) { + // Acquire: sees all writes before the Release store + while !self.is_ready.load(Ordering::Acquire) { + std::hint::spin_loop(); + } + } +} +``` + +### Avoiding False Sharing + +```rust +// BAD: Adjacent atomics on the same cache line cause contention +struct BadCounters { + counter_a: AtomicU64, // Same 64-byte cache line as counter_b + counter_b: AtomicU64, +} + +// GOOD: Pad to separate cache lines +#[repr(align(64))] +struct PaddedCounter { + value: AtomicU64, +} + +struct GoodCounters { + counter_a: PaddedCounter, // Own cache line + counter_b: PaddedCounter, // Own cache line +} +``` + +### Disciplined Workflow Integration (Concurrency) + +- **Research phase**: Profile for Amdahl's law -- identify serial bottlenecks before parallelizing +- **Design phase**: Specify memory ordering rationale for every atomic operation; choose rayon vs crossbeam vs atomics +- **Verification phase**: `loom` and ThreadSanitizer are mandatory for lock-free code; stress tests with concurrent access + +## Build Optimization and Distribution + +### Size-Optimized Profiles + +```toml +# Cargo.toml -- additional profiles beyond release/release-lto + +[profile.release-small] +inherits = "release" +opt-level = "z" # Optimize for binary size +strip = "symbols" # Remove symbol table +panic = "abort" # No unwinding machinery +codegen-units = 1 # Better optimization, slower compile + 
+[profile.release-wasm] +inherits = "release" +opt-level = "s" # Balance size and speed for WASM +lto = true +``` + +### Symbol Stripping + +```toml +[profile.release] +strip = "none" # Keep everything (debugging) +# strip = "debuginfo" # Remove debug info, keep symbols (profiling) +# strip = "symbols" # Remove all symbols (distribution) +``` + +**When to use each**: +- `"none"` -- Development, debugging, profiling +- `"debuginfo"` -- Production with profiling capability (flamegraphs still work) +- `"symbols"` -- Final distribution binaries (smallest size) + +### Feature Gates for Conditional Compilation + +```toml +# Cargo.toml +[features] +default = ["tls"] +tls = ["dep:rustls"] +simd = [] # Enable SIMD code paths +jemalloc = ["dep:tikv-jemallocator"] + +# Reduce binary size by making features optional +full = ["tls", "simd", "jemalloc"] +minimal = [] # No optional features +``` + +```rust +// Use cfg to conditionally compile +#[cfg(feature = "simd")] +fn process_fast(data: &[u8]) -> Vec { + // SIMD implementation +} + +#[cfg(not(feature = "simd"))] +fn process_fast(data: &[u8]) -> Vec { + // Scalar fallback +} +``` + +### Cross-Compilation with `cross` + +```bash +# Install cross (uses Docker for cross-compilation) +cargo install cross + +# Build for Linux from macOS +cross build --release --target x86_64-unknown-linux-gnu + +# Build for ARM (Raspberry Pi) +cross build --release --target aarch64-unknown-linux-gnu + +# Build for Windows from macOS/Linux +cross build --release --target x86_64-pc-windows-gnu +``` + +### Custom Allocators + +```rust +// jemalloc for better multithreaded allocation performance +#[cfg(feature = "jemalloc")] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +// Arena allocation for request-scoped data +use bumpalo::Bump; + +fn handle_request(data: &[u8]) -> Response { + let arena = Bump::new(); + // All allocations freed at once when arena drops + let parsed = arena.alloc(parse(data)); + 
let transformed = arena.alloc(transform(parsed)); + build_response(transformed) +} +``` + +### Disciplined Workflow Integration (Build Optimization) + +- **Design phase**: Document profile selection rationale per deployment target (server vs WASM vs CLI) +- **Verification phase**: Verify all feature combinations compile; include binary size in benchmark reports +- **Validation phase**: Validate stripped binaries work on target platforms; confirm WASM bundle size meets budget + ## Compiler Hints ```rust diff --git a/skills/testing/SKILL.md b/skills/testing/SKILL.md index afdad46..e3b763b 100644 --- a/skills/testing/SKILL.md +++ b/skills/testing/SKILL.md @@ -343,6 +343,216 @@ fuzz_target!(|data: &[u8]| { }); ``` +## Advanced Verification Tools + +### Miri: Undefined Behavior Detection + +Miri is an interpreter for Rust's Mid-level Intermediate Representation that detects undefined behavior in unsafe code. + +```bash +# Install Miri (requires nightly) +rustup +nightly component add miri + +# Run all tests under Miri +cargo +nightly miri test + +# Run specific test under Miri +cargo +nightly miri test -- test_name + +# Run with Stacked Borrows (stricter aliasing checks) +MIRIFLAGS="-Zmiri-strict-provenance" cargo +nightly miri test +``` + +**What Miri detects**: +- Use after free, double free +- Out-of-bounds memory access +- Invalid use of uninitialized data +- Violations of aliasing rules (Stacked Borrows) +- Data races (when using `-Zmiri-preemption-rate=0.1`) +- Memory leaks (with `-Zmiri-leak-check`) + +**When to run Miri**: +- Every PR that adds or modifies `unsafe` code +- As a CI gate for modules containing `unsafe` +- Before releasing crates with `unsafe` internals + +```rust +// Test that works well with Miri -- avoids I/O and external calls +#[test] +fn miri_compatible_unsafe_test() { + let mut data = vec![1u8, 2, 3, 4]; + let ptr = data.as_mut_ptr(); + + // SAFETY: ptr is valid for data.len() bytes, properly aligned + unsafe { + std::ptr::write(ptr.add(2), 
42); + } + + assert_eq!(data[2], 42); +} +``` + +**Limitations**: Miri cannot run code that calls external C functions, performs I/O, or uses inline assembly. Structure tests to isolate pure Rust logic for Miri compatibility. + +### Fuzzing with cargo-fuzz + +```bash +# Install cargo-fuzz +cargo install cargo-fuzz + +# Initialize fuzzing in your project +cargo fuzz init + +# Create a fuzz target +cargo fuzz add parse_input +``` + +```rust +// fuzz/fuzz_targets/parse_input.rs +#![no_main] +use libfuzzer_sys::fuzz_target; +use my_crate::parse; + +fuzz_target!(|data: &[u8]| { + // Parser should never panic on arbitrary input + let _ = parse(data); +}); + +// Structured fuzzing with Arbitrary +use libfuzzer_sys::arbitrary::{self, Arbitrary}; + +#[derive(Arbitrary, Debug)] +struct FuzzInput { + query: String, + limit: u32, + offset: u32, +} + +fuzz_target!(|input: FuzzInput| { + let _ = search(&input.query, input.limit, input.offset); +}); +``` + +```bash +# Run fuzzer (runs until stopped or crash found) +cargo fuzz run parse_input + +# Run for specific duration +cargo fuzz run parse_input -- -max_total_time=300 + +# Minimize a crashing corpus entry +cargo fuzz tmin parse_input crash-file + +# Check coverage of fuzz corpus +cargo fuzz coverage parse_input +``` + +**Corpus management**: +- Store meaningful seeds in `fuzz/corpus//` +- Commit regression inputs from crashes to the corpus +- Run `cargo fuzz cmin` periodically to minimize the corpus + +### Sanitizers + +Sanitizers detect runtime errors that Miri cannot (e.g., in code with FFI or I/O). 
+ +```bash +# AddressSanitizer: buffer overflows, use-after-free, leaks +RUSTFLAGS="-Zsanitizer=address" cargo +nightly test --target x86_64-unknown-linux-gnu + +# ThreadSanitizer: data races in concurrent code +RUSTFLAGS="-Zsanitizer=thread" cargo +nightly test --target x86_64-unknown-linux-gnu + +# MemorySanitizer: reads of uninitialized memory +RUSTFLAGS="-Zsanitizer=memory" cargo +nightly test --target x86_64-unknown-linux-gnu +``` + +**When to use each**: + +| Sanitizer | Detects | Use When | +|-----------|---------|----------| +| ASan | Buffer overflow, use-after-free, memory leaks | Any unsafe code, FFI boundaries | +| TSan | Data races, deadlocks | Concurrent code with shared state, lock-free structures | +| MSan | Reads of uninitialized memory | FFI code receiving data from C, MaybeUninit usage | + +**CI integration**: Run sanitizers on a nightly CI job (not blocking, since they require nightly). + +### Advanced Property Testing with proptest + +```rust +use proptest::prelude::*; +use proptest::collection::vec; + +// Custom strategy for domain-specific types +fn valid_email() -> impl Strategy { + ( + "[a-z]{1,20}", // local part + "[a-z]{1,10}", // domain + prop_oneof!["com", "org", "net"], + ) + .prop_map(|(local, domain, tld)| format!("{}@{}.{}", local, domain, tld)) +} + +proptest! 
{ + // Test with custom generators + #[test] + fn valid_emails_are_accepted(email in valid_email()) { + prop_assert!(validate_email(&email).is_ok()); + } + + // Test invariants across transformations + #[test] + fn encode_decode_roundtrip(data in vec(any::(), 0..1024)) { + let encoded = encode(&data); + let decoded = decode(&encoded).unwrap(); + prop_assert_eq!(data, decoded); + } + + // Regression file: proptest stores failing cases in + // proptest-regressions/ so they're retested on every run +} +``` + +### Loom: Concurrency Testing + +```rust +// Use loom for exhaustive concurrency testing of lock-free code +#[cfg(loom)] +use loom::sync::atomic::{AtomicUsize, Ordering}; +#[cfg(not(loom))] +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[cfg(loom)] +#[test] +fn concurrent_counter_is_correct() { + loom::model(|| { + let counter = loom::sync::Arc::new(AtomicUsize::new(0)); + let c1 = counter.clone(); + let c2 = counter.clone(); + + let t1 = loom::thread::spawn(move || { + c1.fetch_add(1, Ordering::SeqCst); + }); + let t2 = loom::thread::spawn(move || { + c2.fetch_add(1, Ordering::SeqCst); + }); + + t1.join().unwrap(); + t2.join().unwrap(); + assert_eq!(counter.load(Ordering::SeqCst), 2); + }); +} +``` + +### Disciplined Workflow Integration (Advanced Testing) + +- **Research phase** (Phase 1): Identify unsafe code paths needing Miri; catalogue parser inputs for fuzz corpus seeding +- **Design phase** (Phase 2): Specify which verification tools apply to each module; design fuzz harness API +- **Verification phase** (Phase 4): Miri runs on all unsafe code; TSan on all concurrent code; fuzz campaigns on parsers; property tests on invariants +- **Validation phase** (Phase 5): Confirm fuzz campaigns have adequate coverage; validate no UB in production-like environment + +**Cross-references**: See `rust-development` skill for unsafe code policy; see `rust-performance` skill for concurrency patterns requiring loom/TSan verification. 
+ ## Constraints - Never use real external services in unit tests